From f96b85ab44b82736363764ea39ee62884007f4a3 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Fri, 15 May 2015 10:03:29 -0700
Subject: [PATCH 001/525] [SPARK-7668] [MLLIB] Preserve isTransposed property
 for Matrix after calling map function

JIRA: https://issues.apache.org/jira/browse/SPARK-7668

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6188 from viirya/fix_matrix_map and squashes the following commits:

2a7cc97 [Liang-Chi Hsieh] Preserve isTransposed property for Matrix after calling map function.
---
 .../main/scala/org/apache/spark/mllib/linalg/Matrices.scala  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 3fa5e068d16d4..a609674df6b8b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -273,7 +273,8 @@ class DenseMatrix(
 
   override def copy: DenseMatrix = new DenseMatrix(numRows, numCols, values.clone())
 
-  private[mllib] def map(f: Double => Double) = new DenseMatrix(numRows, numCols, values.map(f))
+  private[mllib] def map(f: Double => Double) = new DenseMatrix(numRows, numCols, values.map(f),
+    isTransposed)
 
   private[mllib] def update(f: Double => Double): DenseMatrix = {
     val len = values.length
@@ -535,7 +536,7 @@ class SparseMatrix(
   }
 
   private[mllib] def map(f: Double => Double) =
-    new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.map(f))
+    new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.map(f), isTransposed)
 
   private[mllib] def update(f: Double => Double): SparseMatrix = {
     val len = values.length

From 8f4aaba0e4e3350ab152a476d08ff60e9495c6d2 Mon Sep 17 00:00:00 2001
From: FlytxtRnD <meethu.mathew@flytxt.com>
Date: Fri, 15 May 2015 10:43:18 -0700
Subject: [PATCH 002/525] [SPARK-7651] [MLLIB] [PYSPARK] GMM predict,
 predictSoft should raise error on bad input

In the Python API for Gaussian Mixture Model, predict() and predictSoft() methods should raise an error when the input argument is not an RDD.

Author: FlytxtRnD <meethu.mathew@flytxt.com>

Closes #6180 from FlytxtRnD/GmmPredictException and squashes the following commits:

4b6aa11 [FlytxtRnD] Raise error if the input to predict()/predictSoft() is not an RDD
---
 python/pyspark/mllib/clustering.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index a53333dae6a82..b55583f82223f 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -212,6 +212,9 @@ def predict(self, x):
         if isinstance(x, RDD):
             cluster_labels = self.predictSoft(x).map(lambda z: z.index(max(z)))
             return cluster_labels
+        else:
+            raise TypeError("x should be represented by an RDD, "
+                            "but got %s." % type(x))
 
     def predictSoft(self, x):
         """
@@ -225,6 +228,9 @@ def predictSoft(self, x):
             membership_matrix = callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector),
                                               _convert_to_vector(self._weights), means, sigmas)
             return membership_matrix.map(lambda x: pyarray.array('d', x))
+        else:
+            raise TypeError("x should be represented by an RDD, "
+                            "but got %s." % type(x))
 
 
 class GaussianMixture(object):

From b1b9d5802e3d185f42711ab043a21c9d1eb4763f Mon Sep 17 00:00:00 2001
From: Oleksii Kostyliev <etander@gmail.com>
Date: Fri, 15 May 2015 11:19:56 -0700
Subject: [PATCH 003/525] [SPARK-7233] [CORE] Detect REPL mode once

<h3>Description</h3>
Detect REPL mode once per JVM lifespan.
Previous behavior was to check presence of interpreter mode every time a job was submitted. In the case of execution of multiple short-living jobs this was causing massive mutual blocks between submission threads.

For more details please refer to https://issues.apache.org/jira/browse/SPARK-7233.

<h3>Notes</h3>
* I inverted the return value in case of catching an exception from `true` to `false`. It seems more logical to assume that if the REPL class is not found, we aren't in the interpreter mode.
* I'd personally would call `classForName` with just a Spark classloader (`org.apache.spark.util.Utils#getSparkClassLoader`) but `org.apache.spark.util.Utils#getContextOrSparkClassLoader` is said to be preferable.
* I struggled to come up with a concise, readable and clear unit test. Suggestions are welcome if you feel necessary.

Author: Oleksii Kostyliev <etander@gmail.com>
Author: Oleksii Kostyliev <okostyliev@thunderhead.com>

Closes #5835 from preeze/SPARK-7233 and squashes the following commits:

69bb9e4 [Oleksii Kostyliev] SPARK-7527: fixed explanatory comment to meet style-checker requirements
26dcc24 [Oleksii Kostyliev] SPARK-7527: fixed explanatory comment to meet style-checker requirements
c6f9685 [Oleksii Kostyliev] Merge remote-tracking branch 'remotes/upstream/master' into SPARK-7233
b78a983 [Oleksii Kostyliev] SPARK-7527: revert the fix and let it be addressed separately at a later stage
b64d441 [Oleksii Kostyliev] SPARK-7233: inline inInterpreter parameter into instantiateClass
86e2606 [Oleksii Kostyliev] SPARK-7233, SPARK-7527: Handle interpreter mode properly.
c7ee69c [Oleksii Kostyliev] Merge remote-tracking branch 'upstream/master' into SPARK-7233
d6c07fc [Oleksii Kostyliev] SPARK-7233: properly handle the inverted meaning of isInInterpreter
c319039 [Oleksii Kostyliev] SPARK-7233: move inInterpreter to Utils and make it lazy
---
 .../org/apache/spark/util/ClosureCleaner.scala   | 16 +++-------------
 .../main/scala/org/apache/spark/util/Utils.scala | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
index 6fe32e469c732..6f2966bd4fd31 100644
--- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
+++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
@@ -239,15 +239,6 @@ private[spark] object ClosureCleaner extends Logging {
     logDebug(s" + fields accessed by starting closure: " + accessedFields.size)
     accessedFields.foreach { f => logDebug("     " + f) }
 
-    val inInterpreter = {
-      try {
-        val interpClass = Class.forName("spark.repl.Main")
-        interpClass.getMethod("interp").invoke(null) != null
-      } catch {
-        case _: ClassNotFoundException => true
-      }
-    }
-
     // List of outer (class, object) pairs, ordered from outermost to innermost
     // Note that all outer objects but the outermost one (first one in this list) must be closures
     var outerPairs: List[(Class[_], AnyRef)] = (outerClasses zip outerObjects).reverse
@@ -274,7 +265,7 @@ private[spark] object ClosureCleaner extends Logging {
       // required fields from the original object. We need the parent here because the Java
       // language specification requires the first constructor parameter of any closure to be
       // its enclosing object.
-      val clone = instantiateClass(cls, parent, inInterpreter)
+      val clone = instantiateClass(cls, parent)
       for (fieldName <- accessedFields(cls)) {
         val field = cls.getDeclaredField(fieldName)
         field.setAccessible(true)
@@ -327,9 +318,8 @@ private[spark] object ClosureCleaner extends Logging {
 
   private def instantiateClass(
       cls: Class[_],
-      enclosingObject: AnyRef,
-      inInterpreter: Boolean): AnyRef = {
-    if (!inInterpreter) {
+      enclosingObject: AnyRef): AnyRef = {
+    if (!Utils.isInInterpreter) {
       // This is a bona fide closure class, whose constructor has no effects
       // other than to set its fields, so use its constructor
       val cons = cls.getConstructors()(0)
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 48843b4ae57c6..6a7d1fae3320e 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -1795,6 +1795,20 @@ private[spark] object Utils extends Logging {
     }
   }
 
+  lazy val isInInterpreter: Boolean = {
+    try {
+      val interpClass = classForName("spark.repl.Main")
+      interpClass.getMethod("interp").invoke(null) != null
+    } catch {
+      // Returning true seems to be a mistake.
+      // Currently changing it to false causes tests failures in Streaming.
+      // For a more detailed discussion, please, refer to
+      // https://github.com/apache/spark/pull/5835#issuecomment-101042271 and subsequent comments.
+      // Addressing this changed is tracked as https://issues.apache.org/jira/browse/SPARK-7527
+      case _: ClassNotFoundException => true
+    }
+  }
+
   /**
    * Return a well-formed URI for the file described by a user input string.
    *

From 270d4b5181b95e3f1f131b1d65dde00a7e5b9d6e Mon Sep 17 00:00:00 2001
From: Tim Ellison <t.p.ellison@gmail.com>
Date: Fri, 15 May 2015 11:27:24 -0700
Subject: [PATCH 004/525] [CORE] Protect additional test vars from early GC

Fix more places in which some test variables could be collected early by aggressive JVM optimization.
Added a couple of comments to note where existing references are sufficient in the same test pattern.

Author: Tim Ellison <t.p.ellison@gmail.com>

Closes #6187 from tellison/DefeatEarlyGC and squashes the following commits:

27329d9 [Tim Ellison] [CORE] Protect additional test vars from early GC
---
 .../scala/org/apache/spark/ContextCleanerSuite.scala   | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index cb30e1f4e63a1..0922a2c3599cc 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -165,6 +165,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
     }
 
     // Test that GC causes RDD cleanup after dereferencing the RDD
+    // Note rdd is used after previous GC to avoid early collection by the JVM
     val postGCTester = new CleanerTester(sc, rddIds = Seq(rdd.id))
     rdd = null // Make RDD out of scope
     runGC()
@@ -181,9 +182,9 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
     intercept[Exception] {
       preGCTester.assertCleanup()(timeout(1000 millis))
     }
+    rdd.count()  // Defeat early collection by the JVM
 
     // Test that GC causes shuffle cleanup after dereferencing the RDD
-    rdd.count()  // Defeat any early collection of rdd variable by the JVM
     val postGCTester = new CleanerTester(sc, shuffleIds = Seq(0))
     rdd = null  // Make RDD out of scope, so that corresponding shuffle goes out of scope
     runGC()
@@ -201,6 +202,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
     }
 
     // Test that GC causes broadcast cleanup after dereferencing the broadcast variable
+    // Note broadcast is used after previous GC to avoid early collection by the JVM
     val postGCTester = new CleanerTester(sc, broadcastIds = Seq(broadcast.id))
     broadcast = null  // Make broadcast variable out of scope
     runGC()
@@ -226,7 +228,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
 
     // the checkpoint is not cleaned by default (without the configuration set)
     var postGCTester = new CleanerTester(sc, Seq(rddId), Nil, Nil, Nil)
-    rdd = null // Make RDD out of scope
+    rdd = null // Make RDD out of scope, ok if collected earlier
     runGC()
     postGCTester.assertCleanup()
     assert(fs.exists(RDDCheckpointData.rddCheckpointDataPath(sc, rddId).get))
@@ -245,6 +247,9 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
     // Confirm the checkpoint directory exists
     assert(fs.exists(RDDCheckpointData.rddCheckpointDataPath(sc, rddId).get))
 
+    // Reference rdd to defeat any early collection by the JVM
+    rdd.count()
+
     // Test that GC causes checkpoint data cleanup after dereferencing the RDD
     postGCTester = new CleanerTester(sc, Seq(rddId), Nil, Nil, Seq(rddId))
     rdd = null // Make RDD out of scope
@@ -352,6 +357,7 @@ class SortShuffleContextCleanerSuite extends ContextCleanerSuiteBase(classOf[Sor
     intercept[Exception] {
       preGCTester.assertCleanup()(timeout(1000 millis))
     }
+    rdd.count()  // Defeat early collection by the JVM
 
     // Test that GC causes shuffle cleanup after dereferencing the RDD
     val postGCTester = new CleanerTester(sc, shuffleIds = Seq(0))

From 8ab1450d3995b0c3ef64c5991b88c258e17bcb12 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Fri, 15 May 2015 11:30:19 -0700
Subject: [PATCH 005/525] [SPARK-5412] [DEPLOY] Cannot bind Master to a
 specific hostname as per the documentation

Pass args to start-master.sh through to start-daemon.sh, as other scripts do, so that things like --host have effect on start-master.sh as per docs

Author: Sean Owen <sowen@cloudera.com>

Closes #6185 from srowen/SPARK-5412 and squashes the following commits:

b3ce9da [Sean Owen] Pass args to start-master.sh through to start-daemon.sh, as other scripts do, so that things like --host have effect on start-master.sh as per docs
---
 sbin/start-master.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sbin/start-master.sh b/sbin/start-master.sh
index 17fff58f4f768..a7f5d5702fd80 100755
--- a/sbin/start-master.sh
+++ b/sbin/start-master.sh
@@ -22,6 +22,8 @@
 sbin="`dirname "$0"`"
 sbin="`cd "$sbin"; pwd`"
 
+ORIGINAL_ARGS="$@"
+
 START_TACHYON=false
 
 while (( "$#" )); do
@@ -53,7 +55,9 @@ if [ "$SPARK_MASTER_WEBUI_PORT" = "" ]; then
   SPARK_MASTER_WEBUI_PORT=8080
 fi
 
-"$sbin"/spark-daemon.sh start org.apache.spark.deploy.master.Master 1 --ip $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT
+"$sbin"/spark-daemon.sh start org.apache.spark.deploy.master.Master 1 \
+  --ip $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT \
+  $ORIGINAL_ARGS
 
 if [ "$START_TACHYON" == "true" ]; then
   "$sbin"/../tachyon/bin/tachyon bootstrap-conf $SPARK_MASTER_IP

From ad92af9dbbd0c4e1224cca26da166382ed4f15b9 Mon Sep 17 00:00:00 2001
From: Kousuke Saruta <sarutak@oss.nttdata.co.jp>
Date: Fri, 15 May 2015 11:54:13 -0700
Subject: [PATCH 006/525] [SPARK-7664] [WEBUI] DAG visualization: Fix incorrect
 link paths of DAG.

In JobPage, we can jump a StagePage when we click corresponding box of DAG viz but the link path is incorrect.

When we click a box like as follows ...
![screenshot_from_2015-05-15 19 24 25](https://cloud.githubusercontent.com/assets/4736016/7651528/5f7ef824-fb3c-11e4-9518-8c9ade2dff7a.png)

We jump to index page.
![screenshot_from_2015-05-15 19 24 45](https://cloud.githubusercontent.com/assets/4736016/7651534/6d666274-fb3c-11e4-971c-c3f2dc2b1da2.png)

Author: Kousuke Saruta <sarutak@oss.nttdata.co.jp>

Closes #6184 from sarutak/fix-link-path-of-dag-viz and squashes the following commits:

faba3ba [Kousuke Saruta] Fix a incorrect link
---
 .../resources/org/apache/spark/ui/static/spark-dag-viz.js    | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
index 8138eb0d4f390..ee48fd29a6432 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
@@ -186,8 +186,9 @@ function renderDagVizForJob(svgContainer) {
     var stageId = metadata.attr("stage-id");
     var containerId = VizConstants.graphPrefix + stageId;
     // Link each graph to the corresponding stage page (TODO: handle stage attempts)
-    var stageLink = "/stages/stage/?id=" +
-      stageId.replace(VizConstants.stagePrefix, "") + "&attempt=0&expandDagViz=true";
+    var stageLink = $("#stage-" + stageId.replace(VizConstants.stagePrefix, "") + "-0")
+      .find("a")
+      .attr("href") + "&expandDagViz=true";
     var container = svgContainer
       .append("a")
       .attr("xlink:href", stageLink)

From 8e3822a0794b8b18436bd63d6859d40139a77090 Mon Sep 17 00:00:00 2001
From: ehnalis <zoltan.zvara@gmail.com>
Date: Fri, 15 May 2015 12:14:02 -0700
Subject: [PATCH 007/525] [SPARK-7504] [YARN] NullPointerException when
 initializing SparkContext in YARN-cluster mode

Added a simple checking for SparkContext.
Also added two rational checking against null at AM object.

Author: ehnalis <zoltan.zvara@gmail.com>

Closes #6083 from ehnalis/cluster and squashes the following commits:

926bd96 [ehnalis] Moved check to SparkContext.
7c89b6e [ehnalis] Remove false line.
ea2a5fe [ehnalis] [SPARK-7504] [YARN] NullPointerException when initializing SparkContext in YARN-cluster mode
4924e01 [ehnalis] [SPARK-7504] [YARN] NullPointerException when initializing SparkContext in YARN-cluster mode
39e4fa3 [ehnalis] SPARK-7504 [YARN] NullPointerException when initializing SparkContext in YARN-cluster mode
9f287c5 [ehnalis] [SPARK-7504] [YARN] NullPointerException when initializing SparkContext in YARN-cluster mode
---
 core/src/main/scala/org/apache/spark/SparkContext.scala | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index b59f562d05ead..af276e7b8d40c 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -371,6 +371,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       throw new SparkException("An application name must be set in your configuration")
     }
 
+    // System property spark.yarn.app.id must be set if user code ran by AM on a YARN cluster
+    // yarn-standalone is deprecated, but still supported
+    if ((master == "yarn-cluster" || master == "yarn-standalone") &&
+        !_conf.contains("spark.yarn.app.id")) {
+      throw new SparkException("Detected yarn-cluster mode, but isn't running on a cluster. " +
+        "Deployment to YARN is not supported directly by SparkContext. Please use spark-submit.")
+    }
+
     if (_conf.getBoolean("spark.logConf", false)) {
       logInfo("Spark configuration:\n" + _conf.toDebugString)
     }

From 9b6cf285d0b60848b01b6c7e3421e8ac850a88ab Mon Sep 17 00:00:00 2001
From: Kousuke Saruta <sarutak@oss.nttdata.co.jp>
Date: Fri, 15 May 2015 13:54:09 -0700
Subject: [PATCH 008/525] [SPARK-7296] Add timeline visualization for stages in
 the UI.

This PR builds on #2342 by adding a timeline view for the Stage page,
showing how tasks spend their time.

With this timeline, we can understand following things of a Stage.

* When/where each task ran
* Total duration of each task
* Proportion of the time each task spends

Also, this timeline view can scrollable and zoomable.

Author: Kousuke Saruta <sarutak@oss.nttdata.co.jp>

Closes #5843 from sarutak/stage-page-timeline and squashes the following commits:

4ba9604 [Kousuke Saruta] Fixed the order of legends
16bb552 [Kousuke Saruta] Removed border of legend area
2e5d605 [Kousuke Saruta] Modified warning message
16cb2e6 [Kousuke Saruta] Merge branch 'master' of https://github.com/apache/spark into stage-page-timeline
7ae328f [Kousuke Saruta] Modified code style
d5f794a [Kousuke Saruta] Fixed performance issues more
64e6642 [Kousuke Saruta] Merge branch 'master' of https://github.com/apache/spark into stage-page-timeline
e4a3354 [Kousuke Saruta] minor code style change
878e3b8 [Kousuke Saruta] Fixed a bug that tooltip remains
b9d8f1b [Kousuke Saruta] Fixed performance issue
ac8842b [Kousuke Saruta] Fixed layout
2319739 [Kousuke Saruta] Modified appearances more
81903ab [Kousuke Saruta] Modified appearances
a79dcc3 [Kousuke Saruta] Modified appearance
55a390c [Kousuke Saruta] Ignored scalastyle for a line-comment
29eae3e [Kousuke Saruta] limited to longest 1000 tasks
2a9e376 [Kousuke Saruta] Minor cleanup
385b6d2 [Kousuke Saruta] Added link feature
ba1ac3e [Kousuke Saruta] Fixed style
2ae8520 [Kousuke Saruta] Updated bootstrap-tooltip.js from 2.2.2 to 2.3.2
af430f1 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into stage-page-timeline
e694b8e [Kousuke Saruta] Added timeline view to StagePage
8f6610c [Kousuke Saruta] Fixed conflict
b587cf2 [Kousuke Saruta] initial commit
11fe67d [Kousuke Saruta] Fixed conflict
79ac03d [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into timeline-viewer-feature
a91abd3 [Kousuke Saruta] Merge branch 'master' of https://github.com/apache/spark into timeline-viewer-feature
ef34a5b [Kousuke Saruta] Implement tooltip using bootstrap
b09d0c5 [Kousuke Saruta] Move `stroke` and `fill` attribute of rect elements to css
d3c63c8 [Kousuke Saruta] Fixed a little bit bugs
a36291b [Kousuke Saruta] Merge branch 'master' of https://github.com/apache/spark into timeline-viewer-feature
28714b6 [Kousuke Saruta] Fixed highlight issue
0dc4278 [Kousuke Saruta] Addressed most of Patrics's feedbacks
8110acf [Kousuke Saruta] Added scroll limit to Job timeline
974a64a [Kousuke Saruta] Removed unused function
ee7a7f0 [Kousuke Saruta] Refactored
6a91872 [Kousuke Saruta] Temporary commit
6693f34 [Kousuke Saruta] Added link to job/stage box in the timeline in order to move to corresponding row when we click
8f88222 [Kousuke Saruta] Added job/stage description
aeed4b1 [Kousuke Saruta] Removed stage timeline
fc1696c [Kousuke Saruta] Merge branch 'timeline-viewer-feature' of github.com:sarutak/spark into timeline-viewer-feature
999ccd4 [Kousuke Saruta] Improved scalability
0fc6a31 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into timeline-viewer-feature
19815ae [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into timeline-viewer-feature
68b7540 [Kousuke Saruta] Merge branch 'timeline-viewer-feature' of github.com:sarutak/spark into timeline-viewer-feature
52b5f0b [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into timeline-viewer-feature
dec85db [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into timeline-viewer-feature
fcdab7d [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into timeline-viewer-feature
dab7cc1 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into timeline-viewer-feature
09cce97 [Kousuke Saruta] Cleanuped
16f82cf [Kousuke Saruta] Cleanuped
9fb522e [Kousuke Saruta] Cleanuped
d05f2c2 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into timeline-viewer-feature
e85e9aa [Kousuke Saruta] Cleanup: Added TimelineViewUtils.scala
a76e569 [Kousuke Saruta] Removed unused setting in timeline-view.css
5ce1b21 [Kousuke Saruta] Added vis.min.js, vis.min.css and vis.map to .rat-exclude
082f709 [Kousuke Saruta] Added Timeline-View feature for Applications, Jobs and Stages
---
 .../apache/spark/ui/static/timeline-view.css  |  66 +++++-
 .../apache/spark/ui/static/timeline-view.js   |  71 +++++-
 .../org/apache/spark/ui/jobs/StagePage.scala  | 220 +++++++++++++++++-
 .../org/apache/spark/ui/jobs/StagesTab.scala  |   1 +
 4 files changed, 348 insertions(+), 10 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css
index d1e6d462b836f..0f400461c5293 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css
+++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css
@@ -24,6 +24,65 @@ div#application-timeline, div#job-timeline {
   margin-top: 5px;
 }
 
+#task-assignment-timeline div.legend-area {
+  width: 574px;
+}
+
+#task-assignment-timeline .legend-area > svg {
+  width: 100%;
+  height: 55px;
+}
+
+#task-assignment-timeline div.item.range {
+  padding: 0px;
+  height: 26px;
+  border-width: 0;
+}
+
+.task-assignment-timeline-content {
+  width: 100%;
+}
+
+.task-assignment-timeline-duration-bar {
+  width: 100%;
+  height: 26px;
+}
+
+rect.scheduler-delay-proportion {
+  fill: #80B1D3;
+  stroke: #6B94B0;
+}
+
+rect.deserialization-time-proportion {
+  fill: #FB8072;
+  stroke: #D26B5F;
+}
+
+rect.shuffle-read-time-proportion {
+  fill: #FDB462;
+  stroke: #D39651;
+}
+
+rect.executor-runtime-proportion {
+  fill: #B3DE69;
+  stroke: #95B957;
+}
+
+rect.shuffle-write-time-proportion {
+  fill: #FFED6F;
+  stroke: #D5C65C;
+}
+
+rect.serialization-time-proportion {
+  fill: #BC80BD;
+  stroke: #9D6B9E;
+}
+
+rect.getting-result-time-proportion {
+  fill: #8DD3C7;
+  stroke: #75B0A6;
+}
+
 .vis.timeline {
   line-height: 14px;
 }
@@ -178,6 +237,10 @@ tr.corresponding-item-hover > td, tr.corresponding-item-hover > th {
   display: none;
 }
 
+#task-assignment-timeline.collapsed {
+  display: none;
+}
+
 .control-panel {
   margin-bottom: 5px;
 }
@@ -186,7 +249,8 @@ tr.corresponding-item-hover > td, tr.corresponding-item-hover > th {
   margin: 0;
 }
 
-span.expand-application-timeline, span.expand-job-timeline {
+span.expand-application-timeline, span.expand-job-timeline,
+span.expand-task-assignment-timeline {
   cursor: pointer;
 }
 
diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
index 558beb8a5867f..e1150359bc901 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
@@ -133,6 +133,73 @@ function drawJobTimeline(groupArray, eventObjArray, startTime) {
   });
 }
 
+function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, zoomMax) {
+  var groups = new vis.DataSet(groupArray);
+  var items = new vis.DataSet(eventObjArray);
+  var container = $("#task-assignment-timeline")[0]
+  var options = {
+    groupOrder: function(a, b) {
+      return a.value - b.value
+    },
+    editable: false,
+    align: 'left',
+    selectable: false,
+    showCurrentTime: false,
+    min: minLaunchTime,
+    zoomable: false,
+    zoomMax: zoomMax
+  };
+
+  var taskTimeline = new vis.Timeline(container)
+  taskTimeline.setOptions(options);
+  taskTimeline.setGroups(groups);
+  taskTimeline.setItems(items);
+
+  taskTimeline.on("rangechange", function(prop) {
+    if (currentDisplayedTooltip !== null) {
+      $(currentDisplayedTooltip).tooltip("hide");
+    }
+  });
+
+  function getTaskIdxAndAttempt(selector) {
+    var taskIdxText = $(selector).attr("data-title");
+    var taskIdxAndAttempt = taskIdxText.match("Task (\\d+) \\(attempt (\\d+)");
+    var taskIdx = taskIdxAndAttempt[1];
+    var taskAttempt = taskIdxAndAttempt[2];
+    return taskIdx + "-" + taskAttempt;
+  }
+
+  // If we zoom up and a box moves away when the corresponding tooltip is shown,
+  // the tooltip can be remain.
+  // So, we need to hide tooltips using another mechanism.
+  var currentDisplayedTooltip = null;
+
+  $("#task-assignment-timeline").on({
+    "mouseenter": function() {
+      var taskIdxAndAttempt = getTaskIdxAndAttempt(this);
+      $("#task-" + taskIdxAndAttempt).addClass("corresponding-item-hover");
+      $(this).tooltip("show");
+      currentDisplayedTooltip = this;
+    },
+    "mouseleave" : function() {
+      var taskIdxAndAttempt = getTaskIdxAndAttempt(this);
+      $("#task-" + taskIdxAndAttempt).removeClass("corresponding-item-hover");
+      $(this).tooltip("hide");
+      currentDisplayedTooltip = null;
+    }
+  }, ".task-assignment-timeline-content");
+
+  setupZoomable('#task-assignment-timeline-zoom-lock', taskTimeline);
+
+  $("span.expand-task-assignment-timeline").click(function() {
+    $("#task-assignment-timeline").toggleClass('collapsed');
+
+     // Switch the class of the arrow from open to closed.
+    $(this).find('.expand-task-assignment-timeline-arrow').toggleClass('arrow-open');
+    $(this).find('.expand-task-assignment-timeline-arrow').toggleClass('arrow-closed');
+  });
+}
+
 function setupExecutorEventAction() {
   $(".item.box.executor").each(function () {
     $(this).hover(
@@ -147,7 +214,7 @@ function setupExecutorEventAction() {
 }
 
 function setupZoomable(id, timeline) {
-  $(id + '>input[type="checkbox"]').click(function() {
+  $(id + ' > input[type="checkbox"]').click(function() {
     if (this.checked) {
       timeline.setOptions({zoomable: true});
     } else {
@@ -155,7 +222,7 @@ function setupZoomable(id, timeline) {
     }
   });
 
-  $(id + ">span").click(function() {
+  $(id + " > span").click(function() {
     $(this).parent().find('input:checkbox').trigger('click');
   });
 }
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 8f7b1c2f09665..1a75ea62504a0 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -20,6 +20,7 @@ package org.apache.spark.ui.jobs
 import java.util.Date
 import javax.servlet.http.HttpServletRequest
 
+import scala.collection.mutable.HashSet
 import scala.xml.{Elem, Node, Unparsed}
 
 import org.apache.commons.lang3.StringEscapeUtils
@@ -36,6 +37,35 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
   private val progressListener = parent.progressListener
   private val operationGraphListener = parent.operationGraphListener
 
+  private val TIMELINE_LEGEND = {
+    <div class="legend-area">
+      <svg>
+        {
+          val legendPairs = List(("scheduler-delay-proportion", "Scheduler Delay"),
+            ("deserialization-time-proportion", "Task Deserialization Time"),
+            ("shuffle-read-time-proportion", "Shuffle Read Time"),
+            ("executor-runtime-proportion", "Executor Computing Time"),
+            ("shuffle-write-time-proportion", "Shuffle Write Time"),
+            ("serialization-time-proportion", "Result Serialization TIme"),
+            ("getting-result-time-proportion", "Getting Result Time"))
+
+          legendPairs.zipWithIndex.map {
+            case ((classAttr, name), index) =>
+              <rect x={5 + (index / 3) * 210 + "px"} y={10 + (index % 3) * 15 + "px"}
+                width="10px" height="10px" class={classAttr}></rect>
+                <text x={25 + (index / 3) * 210 + "px"}
+                  y={20 + (index % 3) * 15 + "px"}>{name}</text>
+          }
+        }
+      </svg>
+    </div>
+  }
+
+  // TODO: We should consider increasing the number of this parameter over time
+  // if we find that it's okay.
+  private val MAX_TIMELINE_TASKS = parent.conf.getInt("spark.ui.timeline.tasks.maximum", 1000)
+
+
   def render(request: HttpServletRequest): Seq[Node] = {
     progressListener.synchronized {
       val parameterId = request.getParameter("id")
@@ -196,7 +226,9 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
       val accumulableHeaders: Seq[String] = Seq("Accumulable", "Value")
       def accumulableRow(acc: AccumulableInfo): Elem =
         <tr><td>{acc.name}</td><td>{acc.value}</td></tr>
-      val accumulableTable = UIUtils.listingTable(accumulableHeaders, accumulableRow,
+      val accumulableTable = UIUtils.listingTable(
+        accumulableHeaders,
+        accumulableRow,
         accumulables.values.toSeq)
 
       val taskHeadersAndCssClasses: Seq[(String, String)] =
@@ -232,10 +264,17 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
 
       val unzipped = taskHeadersAndCssClasses.unzip
 
+      val currentTime = System.currentTimeMillis()
       val taskTable = UIUtils.listingTable(
         unzipped._1,
-        taskRow(hasAccumulators, stageData.hasInput, stageData.hasOutput,
-          stageData.hasShuffleRead, stageData.hasShuffleWrite, stageData.hasBytesSpilled),
+        taskRow(
+          hasAccumulators,
+          stageData.hasInput,
+          stageData.hasOutput,
+          stageData.hasShuffleRead,
+          stageData.hasShuffleWrite,
+          stageData.hasBytesSpilled,
+          currentTime),
         tasks,
         headerClasses = unzipped._2)
       // Excludes tasks which failed and have incomplete metrics
@@ -460,25 +499,192 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         dagViz ++
         maybeExpandDagViz ++
         showAdditionalMetrics ++
+        makeTimeline(stageData.taskData.values.toSeq, currentTime) ++
         <h4>Summary Metrics for {numCompleted} Completed Tasks</h4> ++
         <div>{summaryTable.getOrElse("No tasks have reported metrics yet.")}</div> ++
         <h4>Aggregated Metrics by Executor</h4> ++ executorTable.toNodeSeq ++
         maybeAccumulableTable ++
         <h4>Tasks</h4> ++ taskTable
-
       UIUtils.headerSparkPage(stageHeader, content, parent, showVisualization = true)
     }
   }
 
+  def makeTimeline(tasks: Seq[TaskUIData], currentTime: Long): Seq[Node] = {
+    val executorsSet = new HashSet[(String, String)]
+    var minLaunchTime = Long.MaxValue
+    var maxFinishTime = Long.MinValue
+
+    val executorsArrayStr =
+      tasks.sortBy(-_.taskInfo.launchTime).take(MAX_TIMELINE_TASKS).map { taskUIData =>
+        val taskInfo = taskUIData.taskInfo
+        val executorId = taskInfo.executorId
+        val host = taskInfo.host
+        executorsSet += ((executorId, host))
+
+        val classNameByStatus = {
+          if (taskInfo.successful) {
+            "succeeded"
+          } else if (taskInfo.failed) {
+            "failed"
+          } else if (taskInfo.running) {
+            "running"
+          }
+        }
+
+        val launchTime = taskInfo.launchTime
+        val finishTime = if (!taskInfo.running) taskInfo.finishTime else currentTime
+        val totalExecutionTime = finishTime - launchTime
+        minLaunchTime = launchTime.min(minLaunchTime)
+        maxFinishTime = launchTime.max(maxFinishTime)
+
+        def toProportion(time: Long) = (time.toDouble / totalExecutionTime * 100).toLong
+
+        val metricsOpt = taskUIData.taskMetrics
+        val shuffleReadTime =
+          metricsOpt.flatMap(_.shuffleReadMetrics.map(_.fetchWaitTime)).getOrElse(0L)
+        val shuffleReadTimeProportion = toProportion(shuffleReadTime)
+        val shuffleWriteTime =
+          (metricsOpt.flatMap(_.shuffleWriteMetrics
+            .map(_.shuffleWriteTime)).getOrElse(0L) / 1e6).toLong
+        val shuffleWriteTimeProportion = toProportion(shuffleWriteTime)
+        val executorComputingTime = metricsOpt.map(_.executorRunTime).getOrElse(0L) -
+          shuffleReadTime - shuffleWriteTime
+        val executorComputingTimeProportion = toProportion(executorComputingTime)
+        val serializationTime = metricsOpt.map(_.resultSerializationTime).getOrElse(0L)
+        val serializationTimeProportion = toProportion(serializationTime)
+        val deserializationTime = metricsOpt.map(_.executorDeserializeTime).getOrElse(0L)
+        val deserializationTimeProportion = toProportion(deserializationTime)
+        val gettingResultTime = getGettingResultTime(taskUIData.taskInfo)
+        val gettingResultTimeProportion = toProportion(gettingResultTime)
+        val schedulerDelay = totalExecutionTime -
+          (executorComputingTime + shuffleReadTime + shuffleWriteTime +
+            serializationTime + deserializationTime + gettingResultTime)
+        val schedulerDelayProportion =
+          (100 - executorComputingTimeProportion - shuffleReadTimeProportion -
+            shuffleWriteTimeProportion - serializationTimeProportion -
+            deserializationTimeProportion - gettingResultTimeProportion)
+
+        val schedulerDelayProportionPos = 0
+        val deserializationTimeProportionPos =
+          schedulerDelayProportionPos + schedulerDelayProportion
+        val shuffleReadTimeProportionPos =
+          deserializationTimeProportionPos + deserializationTimeProportion
+        val executorRuntimeProportionPos =
+          shuffleReadTimeProportionPos + shuffleReadTimeProportion
+        val shuffleWriteTimeProportionPos =
+          executorRuntimeProportionPos + executorComputingTimeProportion
+        val serializationTimeProportionPos =
+          shuffleWriteTimeProportionPos + shuffleWriteTimeProportion
+        val gettingResultTimeProportionPos =
+          serializationTimeProportionPos + serializationTimeProportion
+
+        val index = taskInfo.index
+        val attempt = taskInfo.attempt
+        val timelineObject =
+          s"""
+             {
+               'className': 'task task-assignment-timeline-object $classNameByStatus',
+               'group': '$executorId',
+               'content': '<div class="task-assignment-timeline-content"' +
+                 'data-toggle="tooltip" data-placement="top"' +
+                 'data-html="true" data-container="body"' +
+                 'data-title="${s"Task " + index + " (attempt " + attempt + ")"}<br>' +
+                 'Status: ${taskInfo.status}<br>' +
+                 'Launch Time: ${UIUtils.formatDate(new Date(launchTime))}' +
+                 '${
+                     if (!taskInfo.running) {
+                       s"""<br>Finish Time: ${UIUtils.formatDate(new Date(finishTime))}"""
+                     } else {
+                        ""
+                      }
+                   }' +
+                 '<br>Scheduler Delay: $schedulerDelay ms' +
+                 '<br>Task Deserialization Time: ${UIUtils.formatDuration(deserializationTime)}' +
+                 '<br>Shuffle Read Time: ${UIUtils.formatDuration(shuffleReadTime)}' +
+                 '<br>Executor Computing Time: ${UIUtils.formatDuration(executorComputingTime)}' +
+                 '<br>Shuffle Write Time: ${UIUtils.formatDuration(shuffleWriteTime)}' +
+                 '<br>Result Serialization Time: ${UIUtils.formatDuration(serializationTime)}' +
+                 '<br>Getting Result Time: ${UIUtils.formatDuration(gettingResultTime)}">' +
+                 '<svg class="task-assignment-timeline-duration-bar">' +
+                 '<rect class="scheduler-delay-proportion" ' +
+                   'x="$schedulerDelayProportionPos%" y="0px" height="26px"' +
+                   'width="$schedulerDelayProportion%""></rect>' +
+                 '<rect class="deserialization-time-proportion" '+
+                   'x="$deserializationTimeProportionPos%" y="0px" height="26px"' +
+                   'width="$deserializationTimeProportion%"></rect>' +
+                 '<rect class="shuffle-read-time-proportion" ' +
+                   'x="$shuffleReadTimeProportionPos%" y="0px" height="26px"' +
+                   'width="$shuffleReadTimeProportion%"></rect>' +
+                 '<rect class="executor-runtime-proportion" ' +
+                   'x="$executorRuntimeProportionPos%" y="0px" height="26px"' +
+                   'width="$executorComputingTimeProportion%"></rect>' +
+                 '<rect class="shuffle-write-time-proportion" ' +
+                   'x="$shuffleWriteTimeProportionPos%" y="0px" height="26px"' +
+                   'width="$shuffleWriteTimeProportion%"></rect>' +
+                 '<rect class="serialization-time-proportion" ' +
+                   'x="$serializationTimeProportionPos%" y="0px" height="26px"' +
+                   'width="$serializationTimeProportion%"></rect>' +
+                 '<rect class="getting-result-time-proportion" ' +
+                   'x="$gettingResultTimeProportionPos%" y="0px" height="26px"' +
+                   'width="$gettingResultTimeProportion%"></rect></svg>',
+               'start': new Date($launchTime),
+               'end': new Date($finishTime)
+             }
+           """
+        timelineObject
+      }.mkString("[", ",", "]")
+
+    val groupArrayStr = executorsSet.map {
+      case (executorId, host) =>
+        s"""
+            {
+              'id': '$executorId',
+              'content': '$executorId / $host',
+            }
+          """
+    }.mkString("[", ",", "]")
+
+    val maxZoom = maxFinishTime - minLaunchTime
+    <span class="expand-task-assignment-timeline">
+      <span class="expand-task-assignment-timeline-arrow arrow-closed"></span>
+      <a>Event Timeline</a>
+    </span> ++
+    <div id="task-assignment-timeline" class="collapsed">
+      {
+        if (MAX_TIMELINE_TASKS < tasks.size) {
+          <strong>
+            This stage has more than the maximum number of tasks that can be shown in the
+            visualization! Only the most recent {MAX_TIMELINE_TASKS} tasks
+            (of {tasks.size} total) are shown.
+          </strong>
+        } else {
+          Seq.empty
+        }
+      }
+      <div class="control-panel">
+        <div id="task-assignment-timeline-zoom-lock">
+          <input type="checkbox"></input>
+          <span>Enable zooming</span>
+        </div>
+      </div>
+      {TIMELINE_LEGEND}
+    </div> ++
+    <script type="text/javascript">
+      {Unparsed(s"drawTaskAssignmentTimeline(" +
+      s"$groupArrayStr, $executorsArrayStr, $minLaunchTime, $maxZoom)")}
+    </script>
+  }
+
   def taskRow(
       hasAccumulators: Boolean,
       hasInput: Boolean,
       hasOutput: Boolean,
       hasShuffleRead: Boolean,
       hasShuffleWrite: Boolean,
-      hasBytesSpilled: Boolean)(taskData: TaskUIData): Seq[Node] = {
+      hasBytesSpilled: Boolean,
+      currentTime: Long)(taskData: TaskUIData): Seq[Node] = {
     taskData match { case TaskUIData(info, metrics, errorMessage) =>
-      val duration = if (info.status == "RUNNING") info.timeRunning(System.currentTimeMillis())
+      val duration = if (info.status == "RUNNING") info.timeRunning(currentTime)
         else metrics.map(_.executorRunTime).getOrElse(1L)
       val formatDuration = if (info.status == "RUNNING") UIUtils.formatDuration(duration)
         else metrics.map(m => UIUtils.formatDuration(m.executorRunTime)).getOrElse("")
@@ -542,7 +748,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
       val diskBytesSpilledSortable = maybeDiskBytesSpilled.map(_.toString).getOrElse("")
       val diskBytesSpilledReadable = maybeDiskBytesSpilled.map(Utils.bytesToString).getOrElse("")
 
-      <tr>
+      <tr id={"task-" + info.index + "-" + info.attempt}>
         <td>{info.index}</td>
         <td>{info.taskId}</td>
         <td sorttable_customkey={info.attempt.toString}>{
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala
index 55169956d8304..5989f0035b270 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala
@@ -25,6 +25,7 @@ import org.apache.spark.ui.{SparkUI, SparkUITab}
 /** Web UI showing progress status of all stages in the given SparkContext. */
 private[ui] class StagesTab(parent: SparkUI) extends SparkUITab(parent, "stages") {
   val sc = parent.sc
+  val conf = parent.conf
   val killEnabled = parent.killEnabled
   val progressListener = parent.jobProgressListener
   val operationGraphListener = parent.operationGraphListener

From 50da9e89161faa0ecdc1feb3ffee6c822a742034 Mon Sep 17 00:00:00 2001
From: qhuang <qian.huang@intel.com>
Date: Fri, 15 May 2015 14:06:16 -0700
Subject: [PATCH 009/525] [SPARK-7226] [SPARKR] Support math functions in R
 DataFrame

Author: qhuang <qian.huang@intel.com>

Closes #6170 from hqzizania/master and squashes the following commits:

f20c39f [qhuang] add tests units and fixes
2a7d121 [qhuang] use a function name more familiar to R users
07aa72e [qhuang] Support math functions in R DataFrame
---
 R/pkg/NAMESPACE                  | 23 ++++++++++++++++++++
 R/pkg/R/column.R                 | 36 +++++++++++++++++++++++++++++---
 R/pkg/R/generics.R               | 20 ++++++++++++++++++
 R/pkg/inst/tests/test_sparkSQL.R | 24 +++++++++++++++++++++
 4 files changed, 100 insertions(+), 3 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index ba29614e7b179..64ffdcffc9caf 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -59,33 +59,56 @@ exportMethods("arrange",
 exportClasses("Column")
 
 exportMethods("abs",
+              "acos",
               "alias",
               "approxCountDistinct",
               "asc",
+              "asin",
+              "atan",
+              "atan2",
               "avg",
               "cast",
+              "cbrt",
+              "ceiling",
               "contains",
+              "cos",
+              "cosh",
               "countDistinct",
               "desc",
               "endsWith",
+              "exp",
+              "expm1",
+              "floor",
               "getField",
               "getItem",
+              "hypot",
               "isNotNull",
               "isNull",
               "last",
               "like",
+              "log",
+              "log10",
+              "log1p",
               "lower",
               "max",
               "mean",
               "min",
               "n",
               "n_distinct",
+              "rint",
               "rlike",
+              "sign",
+              "sin",
+              "sinh",
               "sqrt",
               "startsWith",
               "substr",
               "sum",
               "sumDistinct",
+              "tan",
+              "tanh",
+              "toDegrees",
+              "toRadians",
               "upper")
 
 exportClasses("GroupedData")
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 9a68445ab451a..80e92d3105a36 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -55,12 +55,17 @@ operators <- list(
   "+" = "plus", "-" = "minus", "*" = "multiply", "/" = "divide", "%%" = "mod",
   "==" = "equalTo", ">" = "gt", "<" = "lt", "!=" = "notEqual", "<=" = "leq", ">=" = "geq",
   # we can not override `&&` and `||`, so use `&` and `|` instead
-  "&" = "and", "|" = "or" #, "!" = "unary_$bang"
+  "&" = "and", "|" = "or", #, "!" = "unary_$bang"
+  "^" = "pow"
 )
 column_functions1 <- c("asc", "desc", "isNull", "isNotNull")
 column_functions2 <- c("like", "rlike", "startsWith", "endsWith", "getField", "getItem", "contains")
 functions <- c("min", "max", "sum", "avg", "mean", "count", "abs", "sqrt",
-               "first", "last", "lower", "upper", "sumDistinct")
+               "first", "last", "lower", "upper", "sumDistinct",
+               "acos", "asin", "atan", "cbrt", "ceiling", "cos", "cosh", "exp",
+               "expm1", "floor", "log", "log10", "log1p", "rint", "sign",
+               "sin", "sinh", "tan", "tanh", "toDegrees", "toRadians")
+binary_mathfunctions<- c("atan2", "hypot")
 
 createOperator <- function(op) {
   setMethod(op,
@@ -76,7 +81,11 @@ createOperator <- function(op) {
                 if (class(e2) == "Column") {
                   e2 <- e2@jc
                 }
-                callJMethod(e1@jc, operators[[op]], e2)
+                if (op == "^") {
+                  jc <- callJStatic("org.apache.spark.sql.functions", operators[[op]], e1@jc, e2)
+                } else {
+                  callJMethod(e1@jc, operators[[op]], e2)
+                }
               }
               column(jc)
             })
@@ -106,11 +115,29 @@ createStaticFunction <- function(name) {
   setMethod(name,
             signature(x = "Column"),
             function(x) {
+              if (name == "ceiling") {
+                  name <- "ceil"
+              }
+              if (name == "sign") {
+                  name <- "signum"
+              }
               jc <- callJStatic("org.apache.spark.sql.functions", name, x@jc)
               column(jc)
             })
 }
 
+createBinaryMathfunctions <- function(name) {
+  setMethod(name,
+            signature(y = "Column"),
+            function(y, x) {
+              if (class(x) == "Column") {
+                x <- x@jc
+              }
+              jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, x)
+              column(jc)
+            })
+}
+
 createMethods <- function() {
   for (op in names(operators)) {
     createOperator(op)
@@ -124,6 +151,9 @@ createMethods <- function() {
   for (x in functions) {
     createStaticFunction(x)
   }
+  for (name in binary_mathfunctions) {
+    createBinaryMathfunctions(name)
+  }
 }
 
 createMethods()
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 6d2bfb1181e5a..a23d3b217b2fd 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -552,6 +552,10 @@ setGeneric("avg", function(x, ...) { standardGeneric("avg") })
 #' @export
 setGeneric("cast", function(x, dataType) { standardGeneric("cast") })
 
+#' @rdname column
+#' @export
+setGeneric("cbrt", function(x) { standardGeneric("cbrt") })
+
 #' @rdname column
 #' @export
 setGeneric("contains", function(x, ...) { standardGeneric("contains") })
@@ -575,6 +579,10 @@ setGeneric("getField", function(x, ...) { standardGeneric("getField") })
 #' @export
 setGeneric("getItem", function(x, ...) { standardGeneric("getItem") })
 
+#' @rdname column
+#' @export
+setGeneric("hypot", function(y, x) { standardGeneric("hypot") })
+
 #' @rdname column
 #' @export
 setGeneric("isNull", function(x) { standardGeneric("isNull") })
@@ -603,6 +611,10 @@ setGeneric("n", function(x) { standardGeneric("n") })
 #' @export
 setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })
 
+#' @rdname column
+#' @export
+setGeneric("rint", function(x, ...) { standardGeneric("rint") })
+
 #' @rdname column
 #' @export
 setGeneric("rlike", function(x, ...) { standardGeneric("rlike") })
@@ -615,6 +627,14 @@ setGeneric("startsWith", function(x, ...) { standardGeneric("startsWith") })
 #' @export
 setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") })
 
+#' @rdname column
+#' @export
+setGeneric("toDegrees", function(x) { standardGeneric("toDegrees") })
+
+#' @rdname column
+#' @export
+setGeneric("toRadians", function(x) { standardGeneric("toRadians") })
+
 #' @rdname column
 #' @export
 setGeneric("upper", function(x) { standardGeneric("upper") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 1109e8fdba3fd..3e5658eb5b24b 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -530,6 +530,7 @@ test_that("column operators", {
   c2 <- (- c + 1 - 2) * 3 / 4.0
   c3 <- (c + c2 - c2) * c2 %% c2
   c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3)
+  c5 <- c2 ^ c3 ^ c4
 })
 
 test_that("column functions", {
@@ -538,6 +539,29 @@ test_that("column functions", {
   c3 <- lower(c) + upper(c) + first(c) + last(c)
   c4 <- approxCountDistinct(c) + countDistinct(c) + cast(c, "string")
   c5 <- n(c) + n_distinct(c)
+  c5 <- acos(c) + asin(c) + atan(c) + cbrt(c) 
+  c6 <- ceiling(c) + cos(c) + cosh(c) + exp(c) + expm1(c)
+  c7 <- floor(c) + log(c) + log10(c) + log1p(c) + rint(c)
+  c8 <- sign(c) + sin(c) + sinh(c) + tan(c) + tanh(c)
+  c9 <- toDegrees(c) + toRadians(c)
+})
+
+test_that("column binary mathfunctions", {
+  lines <- c("{\"a\":1, \"b\":5}",
+             "{\"a\":2, \"b\":6}",
+             "{\"a\":3, \"b\":7}",
+             "{\"a\":4, \"b\":8}")
+  jsonPathWithDup <- tempfile(pattern="sparkr-test", fileext=".tmp")
+  writeLines(lines, jsonPathWithDup)
+  df <- jsonFile(sqlCtx, jsonPathWithDup)
+  expect_equal(collect(select(df, atan2(df$a, df$b)))[1, "ATAN2(a, b)"], atan2(1, 5))
+  expect_equal(collect(select(df, atan2(df$a, df$b)))[2, "ATAN2(a, b)"], atan2(2, 6))
+  expect_equal(collect(select(df, atan2(df$a, df$b)))[3, "ATAN2(a, b)"], atan2(3, 7))
+  expect_equal(collect(select(df, atan2(df$a, df$b)))[4, "ATAN2(a, b)"], atan2(4, 8))
+  expect_equal(collect(select(df, hypot(df$a, df$b)))[1, "HYPOT(a, b)"], sqrt(1^2 + 5^2))
+  expect_equal(collect(select(df, hypot(df$a, df$b)))[2, "HYPOT(a, b)"], sqrt(2^2 + 6^2))
+  expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2))
+  expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2))
 })
 
 test_that("string operators", {

From 6e77105e11ff81bfd84561f4e1121111f686df21 Mon Sep 17 00:00:00 2001
From: Iulian Dragos <jaguarul@gmail.com>
Date: Fri, 15 May 2015 14:57:29 -0700
Subject: [PATCH 010/525] [SPARK-7677] [STREAMING] Add Kafka modules to the
 2.11 build.

This is somewhat related to [SPARK-6154](https://issues.apache.org/jira/browse/SPARK-6154), though it only touches Kafka, not the jline dependency for thriftserver.

I tested this locally on 2.11 (./run-tests) and everything looked good (I had to disable mima, because `MimaBuild` harcodes 2.10 for the previous version -- that's another PR).

Author: Iulian Dragos <jaguarul@gmail.com>

Closes #6149 from dragos/issue/spark-2.11-kafka and squashes the following commits:

aa15d99 [Iulian Dragos] Add Kafka modules to the 2.11 build.
---
 pom.xml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pom.xml b/pom.xml
index 91d1d843c762a..86aa0a9fa134c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -107,6 +107,8 @@
     <module>examples</module>
     <module>repl</module>
     <module>launcher</module>
+    <module>external/kafka</module>
+    <module>external/kafka-assembly</module>
   </modules>
 
   <properties>
@@ -1757,10 +1759,6 @@
         <jline.version>${scala.version}</jline.version>
         <jline.groupid>org.scala-lang</jline.groupid>
       </properties>
-      <modules>
-        <module>external/kafka</module>
-        <module>external/kafka-assembly</module>
-      </modules>
     </profile>
 
     <profile>

From c8696337e2a5878f3171eb574c0a1365d45814c9 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Fri, 15 May 2015 15:05:04 -0700
Subject: [PATCH 011/525] [SPARK-7556] [ML] [DOC] Add user guide for spark.ml
 Binarizer, including Scala, Java and Python examples

JIRA: https://issues.apache.org/jira/browse/SPARK-7556

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6116 from viirya/binarizer_doc and squashes the following commits:

40cb677 [Liang-Chi Hsieh] Better print out.
5b7ef1d [Liang-Chi Hsieh] Make examples more clear.
1bf9c09 [Liang-Chi Hsieh] For comments.
6cf8cba [Liang-Chi Hsieh] Add user guide for Binarizer.
---
 docs/ml-features.md | 84 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 0cbebcb739b14..5df61dd36a070 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -183,6 +183,90 @@ for words_label in wordsDataFrame.select("words", "label").take(3):
 </div>
 </div>
 
+## Binarizer
+
+Binarization is the process of thresholding numerical features to binary features. As some probabilistic estimators make assumption that the input data is distributed according to [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution), a binarizer is useful for pre-processing the input data with continuous numerical features.
+
+A simple [Binarizer](api/scala/index.html#org.apache.spark.ml.feature.Binarizer) class provides this functionality. Besides the common parameters of `inputCol` and `outputCol`, `Binarizer` has the parameter `threshold` used for binarizing continuous numerical features. The features greater than the threshold, will be binarized to 1.0. The features equal to or less than the threshold, will be binarized to 0.0. The example below shows how to binarize numerical features.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.Binarizer
+import org.apache.spark.sql.DataFrame
+
+val data = Array(
+  (0, 0.1),
+  (1, 0.8),
+  (2, 0.2)
+)
+val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")
+
+val binarizer: Binarizer = new Binarizer()
+  .setInputCol("feature")
+  .setOutputCol("binarized_feature")
+  .setThreshold(0.5)
+
+val binarizedDataFrame = binarizer.transform(dataFrame)
+val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+binarizedFeatures.collect().foreach(println)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.Binarizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create(0, 0.1),
+  RowFactory.create(1, 0.8),
+  RowFactory.create(2, 0.2)
+));
+StructType schema = new StructType(new StructField[]{
+  new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+  new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
+});
+DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);
+Binarizer binarizer = new Binarizer()
+  .setInputCol("feature")
+  .setOutputCol("binarized_feature")
+  .setThreshold(0.5);
+DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
+DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
+for (Row r : binarizedFeatures.collect()) {
+  Double binarized_value = r.getDouble(0);
+  System.out.println(binarized_value);
+}
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.ml.feature import Binarizer
+
+continuousDataFrame = sqlContext.createDataFrame([
+  (0, 0.1),
+  (1, 0.8),
+  (2, 0.2)
+], ["label", "feature"])
+binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+binarizedDataFrame = binarizer.transform(continuousDataFrame)
+binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+for binarized_feature, in binarizedFeatures.collect():
+  print binarized_feature
+{% endhighlight %}
+</div>
+</div>
 
 # Feature Selectors
 

From e74545647684b3047248ca3cfee894ac5378dead Mon Sep 17 00:00:00 2001
From: Kay Ousterhout <kayousterhout@gmail.com>
Date: Fri, 15 May 2015 17:45:14 -0700
Subject: [PATCH 012/525] [SPARK-7676] Bug fix and cleanup of stage timeline
 view

cc pwendell sarutak

This commit cleans up some unnecessary code, eliminates the feature where when you mouse-over a box in the timeline, the corresponding task is highlighted in the table (because that feature is only useful in the rare case when you have a very small number of tasks, in which case it's easy to figure out the mapping anyway), and fixes a bug where nothing shows up if you try to visualize a stage with only 1 task.

Author: Kay Ousterhout <kayousterhout@gmail.com>

Closes #6202 from kayousterhout/SPARK-7676 and squashes the following commits:

dfd29d4 [Kay Ousterhout] [SPARK-7676] Bug fix and cleanup of stage timeline view
---
 .../apache/spark/ui/static/timeline-view.js   | 48 +++++++------------
 .../org/apache/spark/ui/jobs/StagePage.scala  | 19 ++------
 2 files changed, 20 insertions(+), 47 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
index e1150359bc901..604c29994145a 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
@@ -133,7 +133,7 @@ function drawJobTimeline(groupArray, eventObjArray, startTime) {
   });
 }
 
-function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, zoomMax) {
+function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, maxFinishTime) {
   var groups = new vis.DataSet(groupArray);
   var items = new vis.DataSet(eventObjArray);
   var container = $("#task-assignment-timeline")[0]
@@ -146,8 +146,8 @@ function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, zo
     selectable: false,
     showCurrentTime: false,
     min: minLaunchTime,
-    zoomable: false,
-    zoomMax: zoomMax
+    max: maxFinishTime,
+    zoomable: false
   };
 
   var taskTimeline = new vis.Timeline(container)
@@ -155,48 +155,32 @@ function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, zo
   taskTimeline.setGroups(groups);
   taskTimeline.setItems(items);
 
-  taskTimeline.on("rangechange", function(prop) {
-    if (currentDisplayedTooltip !== null) {
-      $(currentDisplayedTooltip).tooltip("hide");
-    }
-  });
-
-  function getTaskIdxAndAttempt(selector) {
-    var taskIdxText = $(selector).attr("data-title");
-    var taskIdxAndAttempt = taskIdxText.match("Task (\\d+) \\(attempt (\\d+)");
-    var taskIdx = taskIdxAndAttempt[1];
-    var taskAttempt = taskIdxAndAttempt[2];
-    return taskIdx + "-" + taskAttempt;
-  }
-
-  // If we zoom up and a box moves away when the corresponding tooltip is shown,
-  // the tooltip can be remain.
-  // So, we need to hide tooltips using another mechanism.
+  // If a user zooms while a tooltip is displayed, the user may zoom such that the cursor is no
+  // longer over the task that the tooltip corresponds to. So, when a user zooms, we should hide
+  // any currently displayed tooltips.
   var currentDisplayedTooltip = null;
-
   $("#task-assignment-timeline").on({
     "mouseenter": function() {
-      var taskIdxAndAttempt = getTaskIdxAndAttempt(this);
-      $("#task-" + taskIdxAndAttempt).addClass("corresponding-item-hover");
-      $(this).tooltip("show");
       currentDisplayedTooltip = this;
     },
-    "mouseleave" : function() {
-      var taskIdxAndAttempt = getTaskIdxAndAttempt(this);
-      $("#task-" + taskIdxAndAttempt).removeClass("corresponding-item-hover");
-      $(this).tooltip("hide");
+    "mouseleave": function() {
       currentDisplayedTooltip = null;
     }
   }, ".task-assignment-timeline-content");
+  taskTimeline.on("rangechange", function(prop) {
+    if (currentDisplayedTooltip !== null) {
+      $(currentDisplayedTooltip).tooltip("hide");
+    }
+  });
 
-  setupZoomable('#task-assignment-timeline-zoom-lock', taskTimeline);
+  setupZoomable("#task-assignment-timeline-zoom-lock", taskTimeline);
 
   $("span.expand-task-assignment-timeline").click(function() {
-    $("#task-assignment-timeline").toggleClass('collapsed');
+    $("#task-assignment-timeline").toggleClass("collapsed");
 
      // Switch the class of the arrow from open to closed.
-    $(this).find('.expand-task-assignment-timeline-arrow').toggleClass('arrow-open');
-    $(this).find('.expand-task-assignment-timeline-arrow').toggleClass('arrow-closed');
+    $(this).find(".expand-task-assignment-timeline-arrow").toggleClass("arrow-open");
+    $(this).find(".expand-task-assignment-timeline-arrow").toggleClass("arrow-closed");
   });
 }
 
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 1a75ea62504a0..31e2e7fba9783 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -521,21 +521,11 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         val host = taskInfo.host
         executorsSet += ((executorId, host))
 
-        val classNameByStatus = {
-          if (taskInfo.successful) {
-            "succeeded"
-          } else if (taskInfo.failed) {
-            "failed"
-          } else if (taskInfo.running) {
-            "running"
-          }
-        }
-
         val launchTime = taskInfo.launchTime
         val finishTime = if (!taskInfo.running) taskInfo.finishTime else currentTime
         val totalExecutionTime = finishTime - launchTime
         minLaunchTime = launchTime.min(minLaunchTime)
-        maxFinishTime = launchTime.max(maxFinishTime)
+        maxFinishTime = finishTime.max(maxFinishTime)
 
         def toProportion(time: Long) = (time.toDouble / totalExecutionTime * 100).toLong
 
@@ -583,7 +573,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         val timelineObject =
           s"""
              {
-               'className': 'task task-assignment-timeline-object $classNameByStatus',
+               'className': 'task task-assignment-timeline-object',
                'group': '$executorId',
                'content': '<div class="task-assignment-timeline-content"' +
                  'data-toggle="tooltip" data-placement="top"' +
@@ -644,7 +634,6 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
           """
     }.mkString("[", ",", "]")
 
-    val maxZoom = maxFinishTime - minLaunchTime
     <span class="expand-task-assignment-timeline">
       <span class="expand-task-assignment-timeline-arrow arrow-closed"></span>
       <a>Event Timeline</a>
@@ -671,7 +660,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
     </div> ++
     <script type="text/javascript">
       {Unparsed(s"drawTaskAssignmentTimeline(" +
-      s"$groupArrayStr, $executorsArrayStr, $minLaunchTime, $maxZoom)")}
+      s"$groupArrayStr, $executorsArrayStr, $minLaunchTime, $maxFinishTime)")}
     </script>
   }
 
@@ -748,7 +737,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
       val diskBytesSpilledSortable = maybeDiskBytesSpilled.map(_.toString).getOrElse("")
       val diskBytesSpilledReadable = maybeDiskBytesSpilled.map(Utils.bytesToString).getOrElse("")
 
-      <tr id={"task-" + info.index + "-" + info.attempt}>
+      <tr>
         <td>{info.index}</td>
         <td>{info.taskId}</td>
         <td sorttable_customkey={info.attempt.toString}>{

From 2c04c8a1aed34cce420b3d30d9e885daa6e03d74 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 15 May 2015 18:06:01 -0700
Subject: [PATCH 013/525] [SPARK-7563] OutputCommitCoordinator.stop() should
 only run on the driver

This fixes a bug where an executor that exits can cause the driver's OutputCommitCoordinator to stop. To fix this, we use an `isDriver` flag and check it in `stop()`.

See https://issues.apache.org/jira/browse/SPARK-7563 for more details.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6197 from JoshRosen/SPARK-7563 and squashes the following commits:

04b2cc5 [Josh Rosen] [SPARK-7563] OutputCommitCoordinator.stop() should only be executed on the driver
---
 core/src/main/scala/org/apache/spark/SparkEnv.scala    |  2 +-
 .../spark/scheduler/OutputCommitCoordinator.scala      | 10 ++++++----
 .../spark/scheduler/OutputCommitCoordinatorSuite.scala |  2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala
index a5d831c7e68ad..327114542880d 100644
--- a/core/src/main/scala/org/apache/spark/SparkEnv.scala
+++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -379,7 +379,7 @@ object SparkEnv extends Logging {
     }
 
     val outputCommitCoordinator = mockOutputCommitCoordinator.getOrElse {
-      new OutputCommitCoordinator(conf)
+      new OutputCommitCoordinator(conf, isDriver)
     }
     val outputCommitCoordinatorRef = registerOrLookupEndpoint("OutputCommitCoordinator",
       new OutputCommitCoordinatorEndpoint(rpcEnv, outputCommitCoordinator))
diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala
index 0b1d47cff3746..8321037cdc026 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala
@@ -38,7 +38,7 @@ private case class AskPermissionToCommitOutput(stage: Int, task: Long, taskAttem
  * This class was introduced in SPARK-4879; see that JIRA issue (and the associated pull requests)
  * for an extensive design discussion.
  */
-private[spark] class OutputCommitCoordinator(conf: SparkConf) extends Logging {
+private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) extends Logging {
 
   // Initialized by SparkEnv
   var coordinatorRef: Option[RpcEndpointRef] = None
@@ -129,9 +129,11 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf) extends Logging {
   }
 
   def stop(): Unit = synchronized {
-    coordinatorRef.foreach(_ send StopCoordinator)
-    coordinatorRef = None
-    authorizedCommittersByStage.clear()
+    if (isDriver) {
+      coordinatorRef.foreach(_ send StopCoordinator)
+      coordinatorRef = None
+      authorizedCommittersByStage.clear()
+    }
   }
 
   // Marked private[scheduler] instead of private so this can be mocked in tests
diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
index cf97707946706..7078a7a12232a 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
@@ -81,7 +81,7 @@ class OutputCommitCoordinatorSuite extends FunSuite with BeforeAndAfter {
           conf: SparkConf,
           isLocal: Boolean,
           listenerBus: LiveListenerBus): SparkEnv = {
-        outputCommitCoordinator = spy(new OutputCommitCoordinator(conf))
+        outputCommitCoordinator = spy(new OutputCommitCoordinator(conf, isDriver = true))
         // Use Mockito.spy() to maintain the default infrastructure everywhere else.
         // This mocking allows us to control the coordinator responses in test cases.
         SparkEnv.createDriverEnv(conf, isLocal, listenerBus, Some(outputCommitCoordinator))

From cc12a86fb049f2be1f45baf461d202ec356ccf8f Mon Sep 17 00:00:00 2001
From: Ram Sriharsha <rsriharsha@hw11853.local>
Date: Fri, 15 May 2015 19:33:20 -0700
Subject: [PATCH 014/525] [SPARK-7575] [ML] [DOC] Example code for OneVsRest

Java and Scala examples for OneVsRest. Fixes the base classifier to be Logistic Regression and accepts the configuration parameters of the base classifier.

Author: Ram Sriharsha <rsriharsha@hw11853.local>

Closes #6115 from harsha2010/SPARK-7575 and squashes the following commits:

87ad3c7 [Ram Sriharsha] extra line
f5d9891 [Ram Sriharsha] Merge branch 'master' into SPARK-7575
7076084 [Ram Sriharsha] cleanup
dfd660c [Ram Sriharsha] cleanup
8703e4f [Ram Sriharsha] update doc
cb23995 [Ram Sriharsha] fix commandline options for JavaOneVsRestExample
69e91f8 [Ram Sriharsha] cleanup
7f4e127 [Ram Sriharsha] cleanup
d4c40d0 [Ram Sriharsha] Code Review fixes
461eb38 [Ram Sriharsha] cleanup
e0106d9 [Ram Sriharsha] Fix typo
935cf56 [Ram Sriharsha] Try to match Java and Scala Example Commandline options
5323ff9 [Ram Sriharsha] cleanup
196a59a [Ram Sriharsha] cleanup
6adfa0c [Ram Sriharsha] Style Fix
8cfc5d5 [Ram Sriharsha] [SPARK-7575] Example code for OneVsRest
---
 .../examples/ml/JavaOneVsRestExample.java     | 236 ++++++++++++++++++
 .../spark/examples/ml/OneVsRestExample.scala  | 185 ++++++++++++++
 2 files changed, 421 insertions(+)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
new file mode 100644
index 0000000000000..75063dbf800d8
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.commons.cli.*;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.classification.LogisticRegression;
+import org.apache.spark.ml.classification.OneVsRest;
+import org.apache.spark.ml.classification.OneVsRestModel;
+import org.apache.spark.ml.util.MetadataUtils;
+import org.apache.spark.mllib.evaluation.MulticlassMetrics;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.StructField;
+
+/**
+ * An example runner for Multiclass to Binary Reduction with One Vs Rest.
+ * The example uses Logistic Regression as the base classifier. All parameters that
+ * can be specified on the base classifier can be passed in to the runner options.
+ * Run with
+ * <pre>
+ * bin/run-example ml.JavaOneVsRestExample [options]
+ * </pre>
+ */
+public class JavaOneVsRestExample {
+
+  private static class Params {
+    String input;
+    String testInput = null;
+    Integer maxIter = 100;
+    double tol = 1E-6;
+    boolean fitIntercept = true;
+    Double regParam = null;
+    Double elasticNetParam = null;
+    double fracTest = 0.2;
+  }
+
+  public static void main(String[] args) {
+    // parse the arguments
+    Params params = parse(args);
+    SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext jsql = new SQLContext(jsc);
+
+    // configure the base classifier
+    LogisticRegression classifier = new LogisticRegression()
+      .setMaxIter(params.maxIter)
+      .setTol(params.tol)
+      .setFitIntercept(params.fitIntercept);
+
+    if (params.regParam != null) {
+      classifier.setRegParam(params.regParam);
+    }
+    if (params.elasticNetParam != null) {
+      classifier.setElasticNetParam(params.elasticNetParam);
+    }
+
+    // instantiate the One Vs Rest Classifier
+    OneVsRest ovr = new OneVsRest().setClassifier(classifier);
+
+    String input = params.input;
+    RDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), input);
+    RDD<LabeledPoint> train;
+    RDD<LabeledPoint> test;
+
+    // compute the train/ test split: if testInput is not provided use part of input
+    String testInput = params.testInput;
+    if (testInput != null) {
+      train = inputData;
+      // compute the number of features in the training set.
+      int numFeatures = inputData.first().features().size();
+      test = MLUtils.loadLibSVMFile(jsc.sc(), testInput, numFeatures);
+    } else {
+      double f = params.fracTest;
+      RDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
+      train = tmp[0];
+      test = tmp[1];
+    }
+
+    // train the multiclass model
+    DataFrame trainingDataFrame = jsql.createDataFrame(train, LabeledPoint.class);
+    OneVsRestModel ovrModel = ovr.fit(trainingDataFrame.cache());
+
+    // score the model on test data
+    DataFrame testDataFrame = jsql.createDataFrame(test, LabeledPoint.class);
+    DataFrame predictions = ovrModel.transform(testDataFrame.cache())
+      .select("prediction", "label");
+
+    // obtain metrics
+    MulticlassMetrics metrics = new MulticlassMetrics(predictions);
+    StructField predictionColSchema = predictions.schema().apply("prediction");
+    Integer numClasses = (Integer) MetadataUtils.getNumClasses(predictionColSchema).get();
+
+    // compute the false positive rate per label
+    StringBuilder results = new StringBuilder();
+    results.append("label\tfpr\n");
+    for (int label = 0; label < numClasses; label++) {
+      results.append(label);
+      results.append("\t");
+      results.append(metrics.falsePositiveRate((double) label));
+      results.append("\n");
+    }
+
+    Matrix confusionMatrix = metrics.confusionMatrix();
+    // output the Confusion Matrix
+    System.out.println("Confusion Matrix");
+    System.out.println(confusionMatrix);
+    System.out.println();
+    System.out.println(results);
+
+    jsc.stop();
+  }
+
+  private static Params parse(String[] args) {
+    Options options = generateCommandlineOptions();
+    CommandLineParser parser = new PosixParser();
+    Params params = new Params();
+
+    try {
+      CommandLine cmd = parser.parse(options, args);
+      String value;
+      if (cmd.hasOption("input")) {
+        params.input = cmd.getOptionValue("input");
+      }
+      if (cmd.hasOption("maxIter")) {
+        value = cmd.getOptionValue("maxIter");
+        params.maxIter = Integer.parseInt(value);
+      }
+      if (cmd.hasOption("tol")) {
+        value = cmd.getOptionValue("tol");
+        params.tol = Double.parseDouble(value);
+      }
+      if (cmd.hasOption("fitIntercept")) {
+        value = cmd.getOptionValue("fitIntercept");
+        params.fitIntercept = Boolean.parseBoolean(value);
+      }
+      if (cmd.hasOption("regParam")) {
+        value = cmd.getOptionValue("regParam");
+        params.regParam = Double.parseDouble(value);
+      }
+      if (cmd.hasOption("elasticNetParam")) {
+        value = cmd.getOptionValue("elasticNetParam");
+        params.elasticNetParam = Double.parseDouble(value);
+      }
+      if (cmd.hasOption("testInput")) {
+        value = cmd.getOptionValue("testInput");
+        params.testInput = value;
+      }
+      if (cmd.hasOption("fracTest")) {
+        value = cmd.getOptionValue("fracTest");
+        params.fracTest = Double.parseDouble(value);
+      }
+
+    } catch (ParseException e) {
+      printHelpAndQuit(options);
+    }
+    return params;
+  }
+
+  private static Options generateCommandlineOptions() {
+    Option input = OptionBuilder.withArgName("input")
+      .hasArg()
+      .isRequired()
+      .withDescription("input path to labeled examples. This path must be specified")
+      .create("input");
+    Option testInput = OptionBuilder.withArgName("testInput")
+      .hasArg()
+      .withDescription("input path to test examples")
+      .create("testInput");
+    Option fracTest = OptionBuilder.withArgName("testInput")
+      .hasArg()
+      .withDescription("fraction of data to hold out for testing." +
+        " If given option testInput, this option is ignored. default: 0.2")
+      .create("fracTest");
+    Option maxIter = OptionBuilder.withArgName("maxIter")
+      .hasArg()
+      .withDescription("maximum number of iterations for Logistic Regression. default:100")
+      .create("maxIter");
+    Option tol = OptionBuilder.withArgName("tol")
+      .hasArg()
+      .withDescription("the convergence tolerance of iterations " +
+        "for Logistic Regression. default: 1E-6")
+      .create("tol");
+    Option fitIntercept = OptionBuilder.withArgName("fitIntercept")
+      .hasArg()
+      .withDescription("fit intercept for logistic regression. default true")
+      .create("fitIntercept");
+    Option regParam = OptionBuilder.withArgName( "regParam" )
+      .hasArg()
+      .withDescription("the regularization parameter for Logistic Regression.")
+      .create("regParam");
+    Option elasticNetParam = OptionBuilder.withArgName("elasticNetParam" )
+      .hasArg()
+      .withDescription("the ElasticNet mixing parameter for Logistic Regression.")
+      .create("elasticNetParam");
+
+    Options options = new Options()
+      .addOption(input)
+      .addOption(testInput)
+      .addOption(fracTest)
+      .addOption(maxIter)
+      .addOption(tol)
+      .addOption(fitIntercept)
+      .addOption(regParam)
+      .addOption(elasticNetParam);
+
+    return options;
+  }
+
+  private static void printHelpAndQuit(Options options) {
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.printHelp("JavaOneVsRestExample", options);
+    System.exit(-1);
+  }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
new file mode 100644
index 0000000000000..b99d0a1246011
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import java.util.concurrent.TimeUnit.{NANOSECONDS => NANO}
+
+import scopt.OptionParser
+
+import org.apache.spark.{SparkContext, SparkConf}
+import org.apache.spark.examples.mllib.AbstractParams
+import org.apache.spark.ml.classification.{OneVsRest, LogisticRegression}
+import org.apache.spark.ml.util.MetadataUtils
+import org.apache.spark.mllib.evaluation.MulticlassMetrics
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for Multiclass to Binary Reduction with One Vs Rest.
+ * The example uses Logistic Regression as the base classifier. All parameters that
+ * can be specified on the base classifier can be passed in to the runner options.
+ * Run with
+ * {{{
+ * ./bin/run-example ml.OneVsRestExample [options]
+ * }}}
+ * For local mode, run
+ * {{{
+ * ./bin/spark-submit --class org.apache.spark.examples.ml.OneVsRestExample --driver-memory 1g
+ *   [examples JAR path] [options]
+ * }}}
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ */
+object OneVsRestExample {
+
+  case class Params private[ml] (
+      input: String = null,
+      testInput: Option[String] = None,
+      maxIter: Int = 100,
+      tol: Double = 1E-6,
+      fitIntercept: Boolean = true,
+      regParam: Option[Double] = None,
+      elasticNetParam: Option[Double] = None,
+      fracTest: Double = 0.2) extends AbstractParams[Params]
+
+  def main(args: Array[String]) {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("OneVsRest Example") {
+      head("OneVsRest Example: multiclass to binary reduction using OneVsRest")
+      opt[String]("input")
+        .text("input path to labeled examples. This path must be specified")
+        .required()
+        .action((x, c) => c.copy(input = x))
+      opt[Double]("fracTest")
+        .text(s"fraction of data to hold out for testing.  If given option testInput, " +
+        s"this option is ignored. default: ${defaultParams.fracTest}")
+        .action((x, c) => c.copy(fracTest = x))
+      opt[String]("testInput")
+        .text("input path to test dataset.  If given, option fracTest is ignored")
+        .action((x,c) => c.copy(testInput = Some(x)))
+      opt[Int]("maxIter")
+        .text(s"maximum number of iterations for Logistic Regression." +
+          s" default: ${defaultParams.maxIter}")
+        .action((x, c) => c.copy(maxIter = x))
+      opt[Double]("tol")
+        .text(s"the convergence tolerance of iterations for Logistic Regression." +
+          s" default: ${defaultParams.tol}")
+        .action((x, c) => c.copy(tol = x))
+      opt[Boolean]("fitIntercept")
+        .text(s"fit intercept for Logistic Regression." +
+        s" default: ${defaultParams.fitIntercept}")
+        .action((x, c) => c.copy(fitIntercept = x))
+      opt[Double]("regParam")
+        .text(s"the regularization parameter for Logistic Regression.")
+        .action((x,c) => c.copy(regParam = Some(x)))
+      opt[Double]("elasticNetParam")
+        .text(s"the ElasticNet mixing parameter for Logistic Regression.")
+        .action((x,c) => c.copy(elasticNetParam = Some(x)))
+      checkConfig { params =>
+        if (params.fracTest < 0 || params.fracTest >= 1) {
+          failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1).")
+        } else {
+          success
+        }
+      }
+    }
+    parser.parse(args, defaultParams).map { params =>
+      run(params)
+    }.getOrElse {
+      sys.exit(1)
+    }
+  }
+
+  private def run(params: Params) {
+    val conf = new SparkConf().setAppName(s"OneVsRestExample with $params")
+    val sc = new SparkContext(conf)
+    val inputData = MLUtils.loadLibSVMFile(sc, params.input)
+    val sqlContext = new SQLContext(sc)
+    import sqlContext.implicits._
+
+    // compute the train/test split: if testInput is not provided use part of input.
+    val data = params.testInput match {
+      case Some(t) => {
+        // compute the number of features in the training set.
+        val numFeatures = inputData.first().features.size
+        val testData = MLUtils.loadLibSVMFile(sc, t, numFeatures)
+        Array[RDD[LabeledPoint]](inputData, testData)
+      }
+      case None => {
+        val f = params.fracTest
+        inputData.randomSplit(Array(1 - f, f), seed = 12345)
+      }
+    }
+    val Array(train, test) = data.map(_.toDF().cache())
+
+    // instantiate the base classifier
+    val classifier = new LogisticRegression()
+      .setMaxIter(params.maxIter)
+      .setTol(params.tol)
+      .setFitIntercept(params.fitIntercept)
+
+    // Set regParam, elasticNetParam if specified in params
+    params.regParam.foreach(classifier.setRegParam)
+    params.elasticNetParam.foreach(classifier.setElasticNetParam)
+
+    // instantiate the One Vs Rest Classifier.
+
+    val ovr = new OneVsRest()
+    ovr.setClassifier(classifier)
+
+    // train the multiclass model.
+    val (trainingDuration, ovrModel) = time(ovr.fit(train))
+
+    // score the model on test data.
+    val (predictionDuration, predictions) = time(ovrModel.transform(test))
+
+    // evaluate the model
+    val predictionsAndLabels = predictions.select("prediction", "label")
+      .map(row => (row.getDouble(0), row.getDouble(1)))
+
+    val metrics = new MulticlassMetrics(predictionsAndLabels)
+
+    val confusionMatrix = metrics.confusionMatrix
+
+    // compute the false positive rate per label
+    val predictionColSchema = predictions.schema("prediction")
+    val numClasses = MetadataUtils.getNumClasses(predictionColSchema).get
+    val fprs = Range(0, numClasses).map(p => (p, metrics.falsePositiveRate(p.toDouble)))
+
+    println(s" Training Time ${trainingDuration} sec\n")
+
+    println(s" Prediction Time ${predictionDuration} sec\n")
+
+    println(s" Confusion Matrix\n ${confusionMatrix.toString}\n")
+
+    println("label\tfpr")
+
+    println(fprs.map {case (label, fpr) => label + "\t" + fpr}.mkString("\n"))
+
+    sc.stop()
+  }
+
+  private def time[R](block: => R): (Long, R) = {
+    val t0 = System.nanoTime()
+    val result = block    // call-by-name
+    val t1 = System.nanoTime()
+    (NANO.toSeconds(t1 - t0), result)
+  }
+}

From adfd366814499c0540a15dd6017091ba8c0f05da Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 15 May 2015 20:05:26 -0700
Subject: [PATCH 015/525] [SPARK-7073] [SQL] [PySpark] Clean up SQL data type
 hierarchy in Python

Author: Davies Liu <davies@databricks.com>

Closes #6206 from davies/sql_type and squashes the following commits:

33d6860 [Davies Liu] [SPARK-7073] [SQL] [PySpark] Clean up SQL data type hierarchy in Python
---
 python/pyspark/sql/_types.py | 76 ++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/python/pyspark/sql/_types.py b/python/pyspark/sql/_types.py
index 629c3a94513b8..9e7e9f04bc35d 100644
--- a/python/pyspark/sql/_types.py
+++ b/python/pyspark/sql/_types.py
@@ -73,56 +73,74 @@ def json(self):
 
 # This singleton pattern does not work with pickle, you will get
 # another object after pickle and unpickle
-class PrimitiveTypeSingleton(type):
-    """Metaclass for PrimitiveType"""
+class DataTypeSingleton(type):
+    """Metaclass for DataType"""
 
     _instances = {}
 
     def __call__(cls):
         if cls not in cls._instances:
-            cls._instances[cls] = super(PrimitiveTypeSingleton, cls).__call__()
+            cls._instances[cls] = super(DataTypeSingleton, cls).__call__()
         return cls._instances[cls]
 
 
-class PrimitiveType(DataType):
-    """Spark SQL PrimitiveType"""
+class NullType(DataType):
+    """Null type.
 
-    __metaclass__ = PrimitiveTypeSingleton
+    The data type representing None, used for the types that cannot be inferred.
+    """
 
+    __metaclass__ = DataTypeSingleton
 
-class NullType(PrimitiveType):
-    """Null type.
 
-    The data type representing None, used for the types that cannot be inferred.
+class AtomicType(DataType):
+    """An internal type used to represent everything that is not
+    null, UDTs, arrays, structs, and maps."""
+
+    __metaclass__ = DataTypeSingleton
+
+
+class NumericType(AtomicType):
+    """Numeric data types.
     """
 
 
-class StringType(PrimitiveType):
+class IntegralType(NumericType):
+    """Integral data types.
+    """
+
+
+class FractionalType(NumericType):
+    """Fractional data types.
+    """
+
+
+class StringType(AtomicType):
     """String data type.
     """
 
 
-class BinaryType(PrimitiveType):
+class BinaryType(AtomicType):
     """Binary (byte array) data type.
     """
 
 
-class BooleanType(PrimitiveType):
+class BooleanType(AtomicType):
     """Boolean data type.
     """
 
 
-class DateType(PrimitiveType):
+class DateType(AtomicType):
     """Date (datetime.date) data type.
     """
 
 
-class TimestampType(PrimitiveType):
+class TimestampType(AtomicType):
     """Timestamp (datetime.datetime) data type.
     """
 
 
-class DecimalType(DataType):
+class DecimalType(FractionalType):
     """Decimal (decimal.Decimal) data type.
     """
 
@@ -150,31 +168,31 @@ def __repr__(self):
             return "DecimalType()"
 
 
-class DoubleType(PrimitiveType):
+class DoubleType(FractionalType):
     """Double data type, representing double precision floats.
     """
 
 
-class FloatType(PrimitiveType):
+class FloatType(FractionalType):
     """Float data type, representing single precision floats.
     """
 
 
-class ByteType(PrimitiveType):
+class ByteType(IntegralType):
     """Byte data type, i.e. a signed integer in a single byte.
     """
     def simpleString(self):
         return 'tinyint'
 
 
-class IntegerType(PrimitiveType):
+class IntegerType(IntegralType):
     """Int data type, i.e. a signed 32-bit integer.
     """
     def simpleString(self):
         return 'int'
 
 
-class LongType(PrimitiveType):
+class LongType(IntegralType):
     """Long data type, i.e. a signed 64-bit integer.
 
     If the values are beyond the range of [-9223372036854775808, 9223372036854775807],
@@ -184,7 +202,7 @@ def simpleString(self):
         return 'bigint'
 
 
-class ShortType(PrimitiveType):
+class ShortType(IntegralType):
     """Short data type, i.e. a signed 16-bit integer.
     """
     def simpleString(self):
@@ -426,11 +444,9 @@ def __eq__(self, other):
         return type(self) == type(other)
 
 
-_all_primitive_types = dict((v.typeName(), v)
-                            for v in list(globals().values())
-                            if (type(v) is type or type(v) is PrimitiveTypeSingleton)
-                            and v.__base__ == PrimitiveType)
-
+_atomic_types = [StringType, BinaryType, BooleanType, DecimalType, FloatType, DoubleType,
+                 ByteType, ShortType, IntegerType, LongType, DateType, TimestampType]
+_all_atomic_types = dict((t.typeName(), t) for t in _atomic_types)
 _all_complex_types = dict((v.typeName(), v)
                           for v in [ArrayType, MapType, StructType])
 
@@ -444,7 +460,7 @@ def _parse_datatype_json_string(json_string):
     ...     scala_datatype = sqlContext._ssql_ctx.parseDataType(datatype.json())
     ...     python_datatype = _parse_datatype_json_string(scala_datatype.json())
     ...     assert datatype == python_datatype
-    >>> for cls in _all_primitive_types.values():
+    >>> for cls in _all_atomic_types.values():
     ...     check_datatype(cls())
 
     >>> # Simple ArrayType.
@@ -494,8 +510,8 @@ def _parse_datatype_json_string(json_string):
 
 def _parse_datatype_json_value(json_value):
     if not isinstance(json_value, dict):
-        if json_value in _all_primitive_types.keys():
-            return _all_primitive_types[json_value]()
+        if json_value in _all_atomic_types.keys():
+            return _all_atomic_types[json_value]()
         elif json_value == 'decimal':
             return DecimalType()
         elif _FIXED_DECIMAL.match(json_value):
@@ -1125,7 +1141,7 @@ def Dict(d):
         return lambda datum: dataType.deserialize(datum)
 
     elif not isinstance(dataType, StructType):
-        # no wrapper for primitive types
+        # no wrapper for atomic types
         return lambda x: x
 
     class Row(tuple):

From d7b69946cb21cd2781c9ad3e691e54b28efbbf3d Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 15 May 2015 20:09:15 -0700
Subject: [PATCH 016/525] [SPARK-7543] [SQL] [PySpark] split dataframe.py into
 multiple files

dataframe.py is splited into column.py, group.py and dataframe.py:
```
   360 column.py
  1223 dataframe.py
   183 group.py
```

Author: Davies Liu <davies@databricks.com>

Closes #6201 from davies/split_df and squashes the following commits:

fc8f5ab [Davies Liu] split dataframe.py into multiple files
---
 python/pyspark/sql/__init__.py  |   5 +-
 python/pyspark/sql/column.py    | 360 +++++++++++++++++++++++++
 python/pyspark/sql/dataframe.py | 449 +-------------------------------
 python/pyspark/sql/functions.py |   2 +-
 python/pyspark/sql/group.py     | 183 +++++++++++++
 python/run-tests                |   2 +
 6 files changed, 552 insertions(+), 449 deletions(-)
 create mode 100644 python/pyspark/sql/column.py
 create mode 100644 python/pyspark/sql/group.py

diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 7192c89b3dc7f..19805e291e91b 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -55,8 +55,9 @@
 
 from pyspark.sql.types import Row
 from pyspark.sql.context import SQLContext, HiveContext
-from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD, DataFrameNaFunctions
-from pyspark.sql.dataframe import DataFrameStatFunctions
+from pyspark.sql.column import Column
+from pyspark.sql.dataframe import DataFrame, SchemaRDD, DataFrameNaFunctions, DataFrameStatFunctions
+from pyspark.sql.group import GroupedData
 
 __all__ = [
     'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
new file mode 100644
index 0000000000000..fc7ad674daa5b
--- /dev/null
+++ b/python/pyspark/sql/column.py
@@ -0,0 +1,360 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+
+if sys.version >= '3':
+    basestring = str
+    long = int
+
+from pyspark.context import SparkContext
+from pyspark.rdd import ignore_unicode_prefix
+from pyspark.sql.types import *
+
+__all__ = ["DataFrame", "Column", "SchemaRDD", "DataFrameNaFunctions",
+           "DataFrameStatFunctions"]
+
+
+def _create_column_from_literal(literal):
+    sc = SparkContext._active_spark_context
+    return sc._jvm.functions.lit(literal)
+
+
+def _create_column_from_name(name):
+    sc = SparkContext._active_spark_context
+    return sc._jvm.functions.col(name)
+
+
+def _to_java_column(col):
+    if isinstance(col, Column):
+        jcol = col._jc
+    else:
+        jcol = _create_column_from_name(col)
+    return jcol
+
+
+def _to_seq(sc, cols, converter=None):
+    """
+    Convert a list of Column (or names) into a JVM Seq of Column.
+
+    An optional `converter` could be used to convert items in `cols`
+    into JVM Column objects.
+    """
+    if converter:
+        cols = [converter(c) for c in cols]
+    return sc._jvm.PythonUtils.toSeq(cols)
+
+
+def _unary_op(name, doc="unary operator"):
+    """ Create a method for given unary operator """
+    def _(self):
+        jc = getattr(self._jc, name)()
+        return Column(jc)
+    _.__doc__ = doc
+    return _
+
+
+def _func_op(name, doc=''):
+    def _(self):
+        sc = SparkContext._active_spark_context
+        jc = getattr(sc._jvm.functions, name)(self._jc)
+        return Column(jc)
+    _.__doc__ = doc
+    return _
+
+
+def _bin_op(name, doc="binary operator"):
+    """ Create a method for given binary operator
+    """
+    def _(self, other):
+        jc = other._jc if isinstance(other, Column) else other
+        njc = getattr(self._jc, name)(jc)
+        return Column(njc)
+    _.__doc__ = doc
+    return _
+
+
+def _reverse_op(name, doc="binary operator"):
+    """ Create a method for binary operator (this object is on right side)
+    """
+    def _(self, other):
+        jother = _create_column_from_literal(other)
+        jc = getattr(jother, name)(self._jc)
+        return Column(jc)
+    _.__doc__ = doc
+    return _
+
+
+class Column(object):
+
+    """
+    A column in a DataFrame.
+
+    :class:`Column` instances can be created by::
+
+        # 1. Select a column out of a DataFrame
+
+        df.colName
+        df["colName"]
+
+        # 2. Create from an expression
+        df.colName + 1
+        1 / df.colName
+    """
+
+    def __init__(self, jc):
+        self._jc = jc
+
+    # arithmetic operators
+    __neg__ = _func_op("negate")
+    __add__ = _bin_op("plus")
+    __sub__ = _bin_op("minus")
+    __mul__ = _bin_op("multiply")
+    __div__ = _bin_op("divide")
+    __truediv__ = _bin_op("divide")
+    __mod__ = _bin_op("mod")
+    __radd__ = _bin_op("plus")
+    __rsub__ = _reverse_op("minus")
+    __rmul__ = _bin_op("multiply")
+    __rdiv__ = _reverse_op("divide")
+    __rtruediv__ = _reverse_op("divide")
+    __rmod__ = _reverse_op("mod")
+
+    # logistic operators
+    __eq__ = _bin_op("equalTo")
+    __ne__ = _bin_op("notEqual")
+    __lt__ = _bin_op("lt")
+    __le__ = _bin_op("leq")
+    __ge__ = _bin_op("geq")
+    __gt__ = _bin_op("gt")
+
+    # `and`, `or`, `not` cannot be overloaded in Python,
+    # so use bitwise operators as boolean operators
+    __and__ = _bin_op('and')
+    __or__ = _bin_op('or')
+    __invert__ = _func_op('not')
+    __rand__ = _bin_op("and")
+    __ror__ = _bin_op("or")
+
+    # container operators
+    __contains__ = _bin_op("contains")
+    __getitem__ = _bin_op("apply")
+
+    # bitwise operators
+    bitwiseOR = _bin_op("bitwiseOR")
+    bitwiseAND = _bin_op("bitwiseAND")
+    bitwiseXOR = _bin_op("bitwiseXOR")
+
+    def getItem(self, key):
+        """An expression that gets an item at position `ordinal` out of a list,
+         or gets an item by key out of a dict.
+
+        >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
+        >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
+        >>> df.select(df.l[0], df.d["key"]).show()
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
+        """
+        return self[key]
+
+    def getField(self, name):
+        """An expression that gets a field by name in a StructField.
+
+        >>> from pyspark.sql import Row
+        >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
+        >>> df.select(df.r.getField("b")).show()
+        +----+
+        |r[b]|
+        +----+
+        |   b|
+        +----+
+        >>> df.select(df.r.a).show()
+        +----+
+        |r[a]|
+        +----+
+        |   1|
+        +----+
+        """
+        return self[name]
+
+    def __getattr__(self, item):
+        if item.startswith("__"):
+            raise AttributeError(item)
+        return self.getField(item)
+
+    # string methods
+    rlike = _bin_op("rlike")
+    like = _bin_op("like")
+    startswith = _bin_op("startsWith")
+    endswith = _bin_op("endsWith")
+
+    @ignore_unicode_prefix
+    def substr(self, startPos, length):
+        """
+        Return a :class:`Column` which is a substring of the column
+
+        :param startPos: start position (int or Column)
+        :param length:  length of the substring (int or Column)
+
+        >>> df.select(df.name.substr(1, 3).alias("col")).collect()
+        [Row(col=u'Ali'), Row(col=u'Bob')]
+        """
+        if type(startPos) != type(length):
+            raise TypeError("Can not mix the type")
+        if isinstance(startPos, (int, long)):
+            jc = self._jc.substr(startPos, length)
+        elif isinstance(startPos, Column):
+            jc = self._jc.substr(startPos._jc, length._jc)
+        else:
+            raise TypeError("Unexpected type: %s" % type(startPos))
+        return Column(jc)
+
+    __getslice__ = substr
+
+    @ignore_unicode_prefix
+    def inSet(self, *cols):
+        """ A boolean expression that is evaluated to true if the value of this
+        expression is contained by the evaluated values of the arguments.
+
+        >>> df[df.name.inSet("Bob", "Mike")].collect()
+        [Row(age=5, name=u'Bob')]
+        >>> df[df.age.inSet([1, 2, 3])].collect()
+        [Row(age=2, name=u'Alice')]
+        """
+        if len(cols) == 1 and isinstance(cols[0], (list, set)):
+            cols = cols[0]
+        cols = [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols]
+        sc = SparkContext._active_spark_context
+        jc = getattr(self._jc, "in")(_to_seq(sc, cols))
+        return Column(jc)
+
+    # order
+    asc = _unary_op("asc", "Returns a sort expression based on the"
+                           " ascending order of the given column name.")
+    desc = _unary_op("desc", "Returns a sort expression based on the"
+                             " descending order of the given column name.")
+
+    isNull = _unary_op("isNull", "True if the current expression is null.")
+    isNotNull = _unary_op("isNotNull", "True if the current expression is not null.")
+
+    def alias(self, *alias):
+        """Returns this column aliased with a new name or names (in the case of expressions that
+        return more than one column, such as explode).
+
+        >>> df.select(df.age.alias("age2")).collect()
+        [Row(age2=2), Row(age2=5)]
+        """
+
+        if len(alias) == 1:
+            return Column(getattr(self._jc, "as")(alias[0]))
+        else:
+            sc = SparkContext._active_spark_context
+            return Column(getattr(self._jc, "as")(_to_seq(sc, list(alias))))
+
+    @ignore_unicode_prefix
+    def cast(self, dataType):
+        """ Convert the column into type `dataType`
+
+        >>> df.select(df.age.cast("string").alias('ages')).collect()
+        [Row(ages=u'2'), Row(ages=u'5')]
+        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
+        [Row(ages=u'2'), Row(ages=u'5')]
+        """
+        if isinstance(dataType, basestring):
+            jc = self._jc.cast(dataType)
+        elif isinstance(dataType, DataType):
+            sc = SparkContext._active_spark_context
+            ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
+            jdt = ssql_ctx.parseDataType(dataType.json())
+            jc = self._jc.cast(jdt)
+        else:
+            raise TypeError("unexpected type: %s" % type(dataType))
+        return Column(jc)
+
+    @ignore_unicode_prefix
+    def between(self, lowerBound, upperBound):
+        """ A boolean expression that is evaluated to true if the value of this
+        expression is between the given columns.
+        """
+        return (self >= lowerBound) & (self <= upperBound)
+
+    @ignore_unicode_prefix
+    def when(self, condition, value):
+        """Evaluates a list of conditions and returns one of multiple possible result expressions.
+        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
+
+        See :func:`pyspark.sql.functions.when` for example usage.
+
+        :param condition: a boolean :class:`Column` expression.
+        :param value: a literal value, or a :class:`Column` expression.
+
+        """
+        sc = SparkContext._active_spark_context
+        if not isinstance(condition, Column):
+            raise TypeError("condition should be a Column")
+        v = value._jc if isinstance(value, Column) else value
+        jc = sc._jvm.functions.when(condition._jc, v)
+        return Column(jc)
+
+    @ignore_unicode_prefix
+    def otherwise(self, value):
+        """Evaluates a list of conditions and returns one of multiple possible result expressions.
+        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
+
+        See :func:`pyspark.sql.functions.when` for example usage.
+
+        :param value: a literal value, or a :class:`Column` expression.
+        """
+        v = value._jc if isinstance(value, Column) else value
+        jc = self._jc.otherwise(value)
+        return Column(jc)
+
+    def __repr__(self):
+        return 'Column<%s>' % self._jc.toString().encode('utf8')
+
+
+def _test():
+    import doctest
+    from pyspark.context import SparkContext
+    from pyspark.sql import SQLContext
+    import pyspark.sql.column
+    globs = pyspark.sql.column.__dict__.copy()
+    sc = SparkContext('local[4]', 'PythonTest')
+    globs['sc'] = sc
+    globs['sqlContext'] = SQLContext(sc)
+    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
+        .toDF(StructType([StructField('age', IntegerType()),
+                          StructField('name', StringType())]))
+
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.sql.column, globs=globs,
+        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
+    globs['sc'].stop()
+    if failure_count:
+        exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 2ed95ac8e2505..96d927b9ba35c 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -25,17 +25,15 @@
 else:
     from itertools import imap as map
 
-from pyspark.context import SparkContext
 from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix
 from pyspark.serializers import BatchedSerializer, PickleSerializer, UTF8Deserializer
 from pyspark.storagelevel import StorageLevel
 from pyspark.traceback_utils import SCCallSiteSync
 from pyspark.sql.types import *
 from pyspark.sql.types import _create_cls, _parse_datatype_json_string
+from pyspark.sql.column import Column, _to_seq, _to_java_column
 
-
-__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions",
-           "DataFrameStatFunctions"]
+__all__ = ["DataFrame", "SchemaRDD", "DataFrameNaFunctions", "DataFrameStatFunctions"]
 
 
 class DataFrame(object):
@@ -757,6 +755,7 @@ def groupBy(self, *cols):
         [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
         """
         jdf = self._jdf.groupBy(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
         return GroupedData(jdf, self.sql_ctx)
 
     def agg(self, *exprs):
@@ -1141,169 +1140,6 @@ class SchemaRDD(DataFrame):
     """
 
 
-def dfapi(f):
-    def _api(self):
-        name = f.__name__
-        jdf = getattr(self._jdf, name)()
-        return DataFrame(jdf, self.sql_ctx)
-    _api.__name__ = f.__name__
-    _api.__doc__ = f.__doc__
-    return _api
-
-
-def df_varargs_api(f):
-    def _api(self, *args):
-        name = f.__name__
-        jdf = getattr(self._jdf, name)(_to_seq(self.sql_ctx._sc, args))
-        return DataFrame(jdf, self.sql_ctx)
-    _api.__name__ = f.__name__
-    _api.__doc__ = f.__doc__
-    return _api
-
-
-class GroupedData(object):
-    """
-    A set of methods for aggregations on a :class:`DataFrame`,
-    created by :func:`DataFrame.groupBy`.
-    """
-
-    def __init__(self, jdf, sql_ctx):
-        self._jdf = jdf
-        self.sql_ctx = sql_ctx
-
-    @ignore_unicode_prefix
-    def agg(self, *exprs):
-        """Compute aggregates and returns the result as a :class:`DataFrame`.
-
-        The available aggregate functions are `avg`, `max`, `min`, `sum`, `count`.
-
-        If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
-        is the column to perform aggregation on, and the value is the aggregate function.
-
-        Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.
-
-        :param exprs: a dict mapping from column name (string) to aggregate functions (string),
-            or a list of :class:`Column`.
-
-        >>> gdf = df.groupBy(df.name)
-        >>> gdf.agg({"*": "count"}).collect()
-        [Row(name=u'Alice', COUNT(1)=1), Row(name=u'Bob', COUNT(1)=1)]
-
-        >>> from pyspark.sql import functions as F
-        >>> gdf.agg(F.min(df.age)).collect()
-        [Row(name=u'Alice', MIN(age)=2), Row(name=u'Bob', MIN(age)=5)]
-        """
-        assert exprs, "exprs should not be empty"
-        if len(exprs) == 1 and isinstance(exprs[0], dict):
-            jdf = self._jdf.agg(exprs[0])
-        else:
-            # Columns
-            assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column"
-            jdf = self._jdf.agg(exprs[0]._jc,
-                                _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
-        return DataFrame(jdf, self.sql_ctx)
-
-    @dfapi
-    def count(self):
-        """Counts the number of records for each group.
-
-        >>> df.groupBy(df.age).count().collect()
-        [Row(age=2, count=1), Row(age=5, count=1)]
-        """
-
-    @df_varargs_api
-    def mean(self, *cols):
-        """Computes average values for each numeric columns for each group.
-
-        :func:`mean` is an alias for :func:`avg`.
-
-        :param cols: list of column names (string). Non-numeric columns are ignored.
-
-        >>> df.groupBy().mean('age').collect()
-        [Row(AVG(age)=3.5)]
-        >>> df3.groupBy().mean('age', 'height').collect()
-        [Row(AVG(age)=3.5, AVG(height)=82.5)]
-        """
-
-    @df_varargs_api
-    def avg(self, *cols):
-        """Computes average values for each numeric columns for each group.
-
-        :func:`mean` is an alias for :func:`avg`.
-
-        :param cols: list of column names (string). Non-numeric columns are ignored.
-
-        >>> df.groupBy().avg('age').collect()
-        [Row(AVG(age)=3.5)]
-        >>> df3.groupBy().avg('age', 'height').collect()
-        [Row(AVG(age)=3.5, AVG(height)=82.5)]
-        """
-
-    @df_varargs_api
-    def max(self, *cols):
-        """Computes the max value for each numeric columns for each group.
-
-        >>> df.groupBy().max('age').collect()
-        [Row(MAX(age)=5)]
-        >>> df3.groupBy().max('age', 'height').collect()
-        [Row(MAX(age)=5, MAX(height)=85)]
-        """
-
-    @df_varargs_api
-    def min(self, *cols):
-        """Computes the min value for each numeric column for each group.
-
-        :param cols: list of column names (string). Non-numeric columns are ignored.
-
-        >>> df.groupBy().min('age').collect()
-        [Row(MIN(age)=2)]
-        >>> df3.groupBy().min('age', 'height').collect()
-        [Row(MIN(age)=2, MIN(height)=80)]
-        """
-
-    @df_varargs_api
-    def sum(self, *cols):
-        """Compute the sum for each numeric columns for each group.
-
-        :param cols: list of column names (string). Non-numeric columns are ignored.
-
-        >>> df.groupBy().sum('age').collect()
-        [Row(SUM(age)=7)]
-        >>> df3.groupBy().sum('age', 'height').collect()
-        [Row(SUM(age)=7, SUM(height)=165)]
-        """
-
-
-def _create_column_from_literal(literal):
-    sc = SparkContext._active_spark_context
-    return sc._jvm.functions.lit(literal)
-
-
-def _create_column_from_name(name):
-    sc = SparkContext._active_spark_context
-    return sc._jvm.functions.col(name)
-
-
-def _to_java_column(col):
-    if isinstance(col, Column):
-        jcol = col._jc
-    else:
-        jcol = _create_column_from_name(col)
-    return jcol
-
-
-def _to_seq(sc, cols, converter=None):
-    """
-    Convert a list of Column (or names) into a JVM Seq of Column.
-
-    An optional `converter` could be used to convert items in `cols`
-    into JVM Column objects.
-    """
-    if converter:
-        cols = [converter(c) for c in cols]
-    return sc._jvm.PythonUtils.toSeq(cols)
-
-
 def _to_scala_map(sc, jm):
     """
     Convert a dict into a JVM Map.
@@ -1311,282 +1147,6 @@ def _to_scala_map(sc, jm):
     return sc._jvm.PythonUtils.toScalaMap(jm)
 
 
-def _unary_op(name, doc="unary operator"):
-    """ Create a method for given unary operator """
-    def _(self):
-        jc = getattr(self._jc, name)()
-        return Column(jc)
-    _.__doc__ = doc
-    return _
-
-
-def _func_op(name, doc=''):
-    def _(self):
-        sc = SparkContext._active_spark_context
-        jc = getattr(sc._jvm.functions, name)(self._jc)
-        return Column(jc)
-    _.__doc__ = doc
-    return _
-
-
-def _bin_op(name, doc="binary operator"):
-    """ Create a method for given binary operator
-    """
-    def _(self, other):
-        jc = other._jc if isinstance(other, Column) else other
-        njc = getattr(self._jc, name)(jc)
-        return Column(njc)
-    _.__doc__ = doc
-    return _
-
-
-def _reverse_op(name, doc="binary operator"):
-    """ Create a method for binary operator (this object is on right side)
-    """
-    def _(self, other):
-        jother = _create_column_from_literal(other)
-        jc = getattr(jother, name)(self._jc)
-        return Column(jc)
-    _.__doc__ = doc
-    return _
-
-
-class Column(object):
-
-    """
-    A column in a DataFrame.
-
-    :class:`Column` instances can be created by::
-
-        # 1. Select a column out of a DataFrame
-
-        df.colName
-        df["colName"]
-
-        # 2. Create from an expression
-        df.colName + 1
-        1 / df.colName
-    """
-
-    def __init__(self, jc):
-        self._jc = jc
-
-    # arithmetic operators
-    __neg__ = _func_op("negate")
-    __add__ = _bin_op("plus")
-    __sub__ = _bin_op("minus")
-    __mul__ = _bin_op("multiply")
-    __div__ = _bin_op("divide")
-    __truediv__ = _bin_op("divide")
-    __mod__ = _bin_op("mod")
-    __radd__ = _bin_op("plus")
-    __rsub__ = _reverse_op("minus")
-    __rmul__ = _bin_op("multiply")
-    __rdiv__ = _reverse_op("divide")
-    __rtruediv__ = _reverse_op("divide")
-    __rmod__ = _reverse_op("mod")
-
-    # logistic operators
-    __eq__ = _bin_op("equalTo")
-    __ne__ = _bin_op("notEqual")
-    __lt__ = _bin_op("lt")
-    __le__ = _bin_op("leq")
-    __ge__ = _bin_op("geq")
-    __gt__ = _bin_op("gt")
-
-    # `and`, `or`, `not` cannot be overloaded in Python,
-    # so use bitwise operators as boolean operators
-    __and__ = _bin_op('and')
-    __or__ = _bin_op('or')
-    __invert__ = _func_op('not')
-    __rand__ = _bin_op("and")
-    __ror__ = _bin_op("or")
-
-    # container operators
-    __contains__ = _bin_op("contains")
-    __getitem__ = _bin_op("apply")
-
-    # bitwise operators
-    bitwiseOR = _bin_op("bitwiseOR")
-    bitwiseAND = _bin_op("bitwiseAND")
-    bitwiseXOR = _bin_op("bitwiseXOR")
-
-    def getItem(self, key):
-        """An expression that gets an item at position `ordinal` out of a list,
-         or gets an item by key out of a dict.
-
-        >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
-        >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
-        +----+------+
-        |l[0]|d[key]|
-        +----+------+
-        |   1| value|
-        +----+------+
-        >>> df.select(df.l[0], df.d["key"]).show()
-        +----+------+
-        |l[0]|d[key]|
-        +----+------+
-        |   1| value|
-        +----+------+
-        """
-        return self[key]
-
-    def getField(self, name):
-        """An expression that gets a field by name in a StructField.
-
-        >>> from pyspark.sql import Row
-        >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
-        >>> df.select(df.r.getField("b")).show()
-        +----+
-        |r[b]|
-        +----+
-        |   b|
-        +----+
-        >>> df.select(df.r.a).show()
-        +----+
-        |r[a]|
-        +----+
-        |   1|
-        +----+
-        """
-        return self[name]
-
-    def __getattr__(self, item):
-        if item.startswith("__"):
-            raise AttributeError(item)
-        return self.getField(item)
-
-    # string methods
-    rlike = _bin_op("rlike")
-    like = _bin_op("like")
-    startswith = _bin_op("startsWith")
-    endswith = _bin_op("endsWith")
-
-    @ignore_unicode_prefix
-    def substr(self, startPos, length):
-        """
-        Return a :class:`Column` which is a substring of the column
-
-        :param startPos: start position (int or Column)
-        :param length:  length of the substring (int or Column)
-
-        >>> df.select(df.name.substr(1, 3).alias("col")).collect()
-        [Row(col=u'Ali'), Row(col=u'Bob')]
-        """
-        if type(startPos) != type(length):
-            raise TypeError("Can not mix the type")
-        if isinstance(startPos, (int, long)):
-            jc = self._jc.substr(startPos, length)
-        elif isinstance(startPos, Column):
-            jc = self._jc.substr(startPos._jc, length._jc)
-        else:
-            raise TypeError("Unexpected type: %s" % type(startPos))
-        return Column(jc)
-
-    __getslice__ = substr
-
-    @ignore_unicode_prefix
-    def inSet(self, *cols):
-        """ A boolean expression that is evaluated to true if the value of this
-        expression is contained by the evaluated values of the arguments.
-
-        >>> df[df.name.inSet("Bob", "Mike")].collect()
-        [Row(age=5, name=u'Bob')]
-        >>> df[df.age.inSet([1, 2, 3])].collect()
-        [Row(age=2, name=u'Alice')]
-        """
-        if len(cols) == 1 and isinstance(cols[0], (list, set)):
-            cols = cols[0]
-        cols = [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols]
-        sc = SparkContext._active_spark_context
-        jc = getattr(self._jc, "in")(_to_seq(sc, cols))
-        return Column(jc)
-
-    # order
-    asc = _unary_op("asc", "Returns a sort expression based on the"
-                           " ascending order of the given column name.")
-    desc = _unary_op("desc", "Returns a sort expression based on the"
-                             " descending order of the given column name.")
-
-    isNull = _unary_op("isNull", "True if the current expression is null.")
-    isNotNull = _unary_op("isNotNull", "True if the current expression is not null.")
-
-    def alias(self, *alias):
-        """Returns this column aliased with a new name or names (in the case of expressions that
-        return more than one column, such as explode).
-
-        >>> df.select(df.age.alias("age2")).collect()
-        [Row(age2=2), Row(age2=5)]
-        """
-
-        if len(alias) == 1:
-            return Column(getattr(self._jc, "as")(alias[0]))
-        else:
-            sc = SparkContext._active_spark_context
-            return Column(getattr(self._jc, "as")(_to_seq(sc, list(alias))))
-
-    @ignore_unicode_prefix
-    def cast(self, dataType):
-        """ Convert the column into type `dataType`
-
-        >>> df.select(df.age.cast("string").alias('ages')).collect()
-        [Row(ages=u'2'), Row(ages=u'5')]
-        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
-        [Row(ages=u'2'), Row(ages=u'5')]
-        """
-        if isinstance(dataType, basestring):
-            jc = self._jc.cast(dataType)
-        elif isinstance(dataType, DataType):
-            sc = SparkContext._active_spark_context
-            ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
-            jdt = ssql_ctx.parseDataType(dataType.json())
-            jc = self._jc.cast(jdt)
-        else:
-            raise TypeError("unexpected type: %s" % type(dataType))
-        return Column(jc)
-
-    @ignore_unicode_prefix
-    def between(self, lowerBound, upperBound):
-        """ A boolean expression that is evaluated to true if the value of this
-        expression is between the given columns.
-        """
-        return (self >= lowerBound) & (self <= upperBound)
-
-    @ignore_unicode_prefix
-    def when(self, condition, value):
-        """Evaluates a list of conditions and returns one of multiple possible result expressions.
-        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
-
-        See :func:`pyspark.sql.functions.when` for example usage.
-
-        :param condition: a boolean :class:`Column` expression.
-        :param value: a literal value, or a :class:`Column` expression.
-
-        """
-        sc = SparkContext._active_spark_context
-        if not isinstance(condition, Column):
-            raise TypeError("condition should be a Column")
-        v = value._jc if isinstance(value, Column) else value
-        jc = sc._jvm.functions.when(condition._jc, v)
-        return Column(jc)
-
-    @ignore_unicode_prefix
-    def otherwise(self, value):
-        """Evaluates a list of conditions and returns one of multiple possible result expressions.
-        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
-
-        See :func:`pyspark.sql.functions.when` for example usage.
-
-        :param value: a literal value, or a :class:`Column` expression.
-        """
-        v = value._jc if isinstance(value, Column) else value
-        jc = self._jc.otherwise(value)
-        return Column(jc)
-
-    def __repr__(self):
-        return 'Column<%s>' % self._jc.toString().encode('utf8')
-
-
 class DataFrameNaFunctions(object):
     """Functionality for working with missing data in :class:`DataFrame`.
     """
@@ -1646,9 +1206,6 @@ def _test():
         .toDF(StructType([StructField('age', IntegerType()),
                           StructField('name', StringType())]))
     globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF()
-    globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
-                                  Row(name='Bob', age=5, height=85)]).toDF()
-
     globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80),
                                   Row(name='Bob', age=5, height=None),
                                   Row(name='Tom', age=None, height=None),
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 6cd6974b0e5bb..8d0e766ecd3b4 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -27,7 +27,7 @@
 from pyspark.rdd import _prepare_for_python_RDD, ignore_unicode_prefix
 from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
 from pyspark.sql.types import StringType
-from pyspark.sql.dataframe import Column, _to_java_column, _to_seq
+from pyspark.sql.column import Column, _to_java_column, _to_seq
 
 
 __all__ = [
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
new file mode 100644
index 0000000000000..9f7c743c051d3
--- /dev/null
+++ b/python/pyspark/sql/group.py
@@ -0,0 +1,183 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark.rdd import ignore_unicode_prefix
+from pyspark.sql.column import Column, _to_seq
+from pyspark.sql.dataframe import DataFrame
+from pyspark.sql.types import *
+
+__all__ = ["GroupedData"]
+
+
+def dfapi(f):
+    def _api(self):
+        name = f.__name__
+        jdf = getattr(self._jdf, name)()
+        return DataFrame(jdf, self.sql_ctx)
+    _api.__name__ = f.__name__
+    _api.__doc__ = f.__doc__
+    return _api
+
+
+def df_varargs_api(f):
+    def _api(self, *args):
+        name = f.__name__
+        jdf = getattr(self._jdf, name)(_to_seq(self.sql_ctx._sc, args))
+        return DataFrame(jdf, self.sql_ctx)
+    _api.__name__ = f.__name__
+    _api.__doc__ = f.__doc__
+    return _api
+
+
+class GroupedData(object):
+    """
+    A set of methods for aggregations on a :class:`DataFrame`,
+    created by :func:`DataFrame.groupBy`.
+    """
+
+    def __init__(self, jdf, sql_ctx):
+        self._jdf = jdf
+        self.sql_ctx = sql_ctx
+
+    @ignore_unicode_prefix
+    def agg(self, *exprs):
+        """Compute aggregates and returns the result as a :class:`DataFrame`.
+
+        The available aggregate functions are `avg`, `max`, `min`, `sum`, `count`.
+
+        If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
+        is the column to perform aggregation on, and the value is the aggregate function.
+
+        Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.
+
+        :param exprs: a dict mapping from column name (string) to aggregate functions (string),
+            or a list of :class:`Column`.
+
+        >>> gdf = df.groupBy(df.name)
+        >>> gdf.agg({"*": "count"}).collect()
+        [Row(name=u'Alice', COUNT(1)=1), Row(name=u'Bob', COUNT(1)=1)]
+
+        >>> from pyspark.sql import functions as F
+        >>> gdf.agg(F.min(df.age)).collect()
+        [Row(name=u'Alice', MIN(age)=2), Row(name=u'Bob', MIN(age)=5)]
+        """
+        assert exprs, "exprs should not be empty"
+        if len(exprs) == 1 and isinstance(exprs[0], dict):
+            jdf = self._jdf.agg(exprs[0])
+        else:
+            # Columns
+            assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column"
+            jdf = self._jdf.agg(exprs[0]._jc,
+                                _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
+        return DataFrame(jdf, self.sql_ctx)
+
+    @dfapi
+    def count(self):
+        """Counts the number of records for each group.
+
+        >>> df.groupBy(df.age).count().collect()
+        [Row(age=2, count=1), Row(age=5, count=1)]
+        """
+
+    @df_varargs_api
+    def mean(self, *cols):
+        """Computes average values for each numeric columns for each group.
+
+        :func:`mean` is an alias for :func:`avg`.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df.groupBy().mean('age').collect()
+        [Row(AVG(age)=3.5)]
+        >>> df3.groupBy().mean('age', 'height').collect()
+        [Row(AVG(age)=3.5, AVG(height)=82.5)]
+        """
+
+    @df_varargs_api
+    def avg(self, *cols):
+        """Computes average values for each numeric columns for each group.
+
+        :func:`mean` is an alias for :func:`avg`.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df.groupBy().avg('age').collect()
+        [Row(AVG(age)=3.5)]
+        >>> df3.groupBy().avg('age', 'height').collect()
+        [Row(AVG(age)=3.5, AVG(height)=82.5)]
+        """
+
+    @df_varargs_api
+    def max(self, *cols):
+        """Computes the max value for each numeric columns for each group.
+
+        >>> df.groupBy().max('age').collect()
+        [Row(MAX(age)=5)]
+        >>> df3.groupBy().max('age', 'height').collect()
+        [Row(MAX(age)=5, MAX(height)=85)]
+        """
+
+    @df_varargs_api
+    def min(self, *cols):
+        """Computes the min value for each numeric column for each group.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df.groupBy().min('age').collect()
+        [Row(MIN(age)=2)]
+        >>> df3.groupBy().min('age', 'height').collect()
+        [Row(MIN(age)=2, MIN(height)=80)]
+        """
+
+    @df_varargs_api
+    def sum(self, *cols):
+        """Compute the sum for each numeric columns for each group.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df.groupBy().sum('age').collect()
+        [Row(SUM(age)=7)]
+        >>> df3.groupBy().sum('age', 'height').collect()
+        [Row(SUM(age)=7, SUM(height)=165)]
+        """
+
+
+def _test():
+    import doctest
+    from pyspark.context import SparkContext
+    from pyspark.sql import Row, SQLContext
+    import pyspark.sql.group
+    globs = pyspark.sql.group.__dict__.copy()
+    sc = SparkContext('local[4]', 'PythonTest')
+    globs['sc'] = sc
+    globs['sqlContext'] = SQLContext(sc)
+    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
+        .toDF(StructType([StructField('age', IntegerType()),
+                          StructField('name', StringType())]))
+    globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
+                                   Row(name='Bob', age=5, height=85)]).toDF()
+
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.sql.group, globs=globs,
+        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
+    globs['sc'].stop()
+    if failure_count:
+        exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/python/run-tests b/python/run-tests
index f2757a3967e81..ffde2fb24b369 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -72,7 +72,9 @@ function run_sql_tests() {
     echo "Run sql tests ..."
     run_test "pyspark/sql/_types.py"
     run_test "pyspark/sql/context.py"
+    run_test "pyspark/sql/column.py"
     run_test "pyspark/sql/dataframe.py"
+    run_test "pyspark/sql/group.py"
     run_test "pyspark/sql/functions.py"
     run_test "pyspark/sql/tests.py"
 }

From deb411335a09b91eb1f75421d77e1c3686719621 Mon Sep 17 00:00:00 2001
From: AiHe <ai.he@ussuning.com>
Date: Fri, 15 May 2015 20:42:35 -0700
Subject: [PATCH 017/525] [SPARK-7473] [MLLIB] Add reservoir sample in
 RandomForest

reservoir feature sample by using existing api

Author: AiHe <ai.he@ussuning.com>

Closes #5988 from AiHe/reservoir and squashes the following commits:

e7a41ac [AiHe] remove non-robust testing case
28ffb9a [AiHe] set seed as rng.nextLong
37459e1 [AiHe] set fixed seed
1e98a4c [AiHe] [MLLIB][tree] Add reservoir sample in RandomForest
---
 .../scala/org/apache/spark/mllib/tree/RandomForest.scala    | 6 +++---
 .../org/apache/spark/mllib/tree/RandomForestSuite.scala     | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 055e60c7d9c95..b347c450c1aa8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -36,6 +36,7 @@ import org.apache.spark.mllib.tree.model._
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
+import org.apache.spark.util.random.SamplingUtils
 
 /**
  * :: Experimental ::
@@ -473,9 +474,8 @@ object RandomForest extends Serializable with Logging {
       val (treeIndex, node) = nodeQueue.head
       // Choose subset of features for node (if subsampling).
       val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
-        // TODO: Use more efficient subsampling?  (use selection-and-rejection or reservoir)
-        Some(rng.shuffle(Range(0, metadata.numFeatures).toList)
-          .take(metadata.numFeaturesPerNode).toArray)
+        Some(SamplingUtils.reservoirSampleAndCount(Range(0, 
+          metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong)._1)
       } else {
         None
       }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
index ee3bc98486862..4ed66953cb628 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
@@ -196,7 +196,6 @@ class RandomForestSuite extends FunSuite with MLlibTestSparkContext {
       numClasses = 3, categoricalFeaturesInfo = categoricalFeaturesInfo)
     val model = RandomForest.trainClassifier(input, strategy, numTrees = 2,
       featureSubsetStrategy = "sqrt", seed = 12345)
-    EnsembleTestHelper.validateClassifier(model, arr, 1.0)
   }
 
   test("subsampling rate in RandomForest"){

From 578bfeeff514228f6fd4b07a536815fbb3510f7e Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 15 May 2015 22:00:31 -0700
Subject: [PATCH 018/525] [SPARK-7654][SQL] DataFrameReader and DataFrameWriter
 for input/output API

This patch introduces DataFrameWriter and DataFrameReader.

DataFrameReader interface, accessible through SQLContext.read, contains methods that create DataFrames. These methods used to reside in SQLContext. Example usage:
```scala
sqlContext.read.json("...")
sqlContext.read.parquet("...")
```

DataFrameWriter interface, accessible through DataFrame.write, implements a builder pattern to avoid the proliferation of options in writing DataFrame out. It currently implements:
- mode
- format (e.g. "parquet", "json")
- options (generic options passed down into data sources)
- partitionBy (partitioning columns)
Example usage:
```scala
df.write.mode("append").format("json").partitionBy("date").saveAsTable("myJsonTable")
```

TODO:

- [ ] Documentation update
- [ ] Move JDBC into reader / writer?
- [ ] Deprecate the old interfaces
- [ ] Move the generic load interface into reader.
- [ ] Update example code and documentation

Author: Reynold Xin <rxin@databricks.com>

Closes #6175 from rxin/reader-writer and squashes the following commits:

b146c95 [Reynold Xin] Deprecation of old APIs.
bd8abdf [Reynold Xin] Fixed merge conflict.
26abea2 [Reynold Xin] Added general load methods.
244fbec [Reynold Xin] Added equivalent to example.
4f15d92 [Reynold Xin] Added documentation for partitionBy.
7e91611 [Reynold Xin] [SPARK-7654][SQL] DataFrameReader and DataFrameWriter for input/output API.
---
 .../spark/examples/sql/JavaSparkSQL.java      |   4 +-
 .../spark/examples/mllib/DatasetExample.scala |   2 +-
 .../spark/examples/sql/RDDRelation.scala      |   2 +-
 .../org/apache/spark/sql/DataFrame.scala      | 172 +++-------
 .../apache/spark/sql/DataFrameReader.scala    | 218 ++++++++++++
 .../apache/spark/sql/DataFrameWriter.scala    | 198 +++++++++++
 .../org/apache/spark/sql/SQLContext.scala     | 158 +++------
 .../spark/sql/parquet/ParquetTest.scala       |   8 +-
 .../spark/sql/sources/JavaSaveLoadSuite.java  |   8 +-
 .../org/apache/spark/sql/DataFrameSuite.scala |   4 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  17 +-
 .../spark/sql/UserDefinedTypeSuite.scala      |   4 +-
 .../org/apache/spark/sql/json/JsonSuite.scala |  50 +--
 .../sql/parquet/ParquetFilterSuite.scala      |   6 +-
 .../spark/sql/parquet/ParquetIOSuite.scala    |  41 ++-
 .../ParquetPartitionDiscoverySuite.scala      |  16 +-
 .../sources/CreateTableAsSelectSuite.scala    |   2 +-
 .../spark/sql/sources/InsertSuite.scala       |  10 +-
 .../spark/sql/sources/SaveLoadSuite.scala     |  26 +-
 .../spark/sql/hive/HiveStrategies.scala       |   4 +-
 .../spark/sql/hive/HiveParquetSuite.scala     |   8 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala  |  18 +-
 .../apache/spark/sql/hive/parquetSuites.scala |  16 +-
 .../sql/sources/hadoopFsRelationSuites.scala  | 321 ++++++++----------
 24 files changed, 772 insertions(+), 541 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala

diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
index 8159ffbe2d269..173633ce059e3 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
@@ -99,7 +99,7 @@ public String call(Row row) {
     // Read in the parquet file created above.
     // Parquet files are self-describing so the schema is preserved.
     // The result of loading a parquet file is also a DataFrame.
-    DataFrame parquetFile = sqlContext.parquetFile("people.parquet");
+    DataFrame parquetFile = sqlContext.read().parquet("people.parquet");
 
     //Parquet files can also be registered as tables and then used in SQL statements.
     parquetFile.registerTempTable("parquetFile");
@@ -120,7 +120,7 @@ public String call(Row row) {
     // The path can be either a single text file or a directory storing text files.
     String path = "examples/src/main/resources/people.json";
     // Create a DataFrame from the file(s) pointed by path
-    DataFrame peopleFromJsonFile = sqlContext.jsonFile(path);
+    DataFrame peopleFromJsonFile = sqlContext.read().json(path);
 
     // Because the schema of a JSON dataset is automatically inferred, to write queries,
     // it is better to take a look at what is the schema.
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
index e943d6c889fab..c95cca7d656e8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
@@ -106,7 +106,7 @@ object DatasetExample {
     df.saveAsParquetFile(outputDir)
 
     println(s"Loading Parquet file with UDT from $outputDir.")
-    val newDataset = sqlContext.parquetFile(outputDir)
+    val newDataset = sqlContext.read.parquet(outputDir)
 
     println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")
     val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v }
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
index 6331d1c0060f8..acc89199d5849 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
@@ -61,7 +61,7 @@ object RDDRelation {
     df.saveAsParquetFile("pair.parquet")
 
     // Read in parquet file.  Parquet files are self-describing so the schmema is preserved.
-    val parquetFile = sqlContext.parquetFile("pair.parquet")
+    val parquetFile = sqlContext.read.parquet("pair.parquet")
 
     // Queries can be run using the DSL on parequet files just like the original RDD.
     parquetFile.where($"key" === 1).select($"value".as("a")).collect().foreach(println)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 2e20c3d3f4ed2..55ef357a99f71 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1289,6 +1289,16 @@ class DataFrame private[sql](
     sqlContext.registerDataFrameAsTable(this, tableName)
   }
 
+  /**
+   * :: Experimental ::
+   * Interface for saving the content of the [[DataFrame]] out into external storage.
+   *
+   * @group output
+   * @since 1.4.0
+   */
+  @Experimental
+  def write: DataFrameWriter = new DataFrameWriter(this)
+
   /**
    * Saves the contents of this [[DataFrame]] as a parquet file, preserving the schema.
    * Files that are written out using this method can be read back in as a [[DataFrame]]
@@ -1296,16 +1306,16 @@ class DataFrame private[sql](
    * @group output
    * @since 1.3.0
    */
+  @deprecated("Use write.parquet(path)", "1.4.0")
   def saveAsParquetFile(path: String): Unit = {
     if (sqlContext.conf.parquetUseDataSourceApi) {
-      save("org.apache.spark.sql.parquet", SaveMode.ErrorIfExists, Map("path" -> path))
+      write.format("parquet").mode(SaveMode.ErrorIfExists).save(path)
     } else {
       sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd
     }
   }
 
   /**
-   * :: Experimental ::
    * Creates a table from the the contents of this DataFrame.
    * It will use the default data source configured by spark.sql.sources.default.
    * This will fail if the table already exists.
@@ -1320,13 +1330,12 @@ class DataFrame private[sql](
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String): Unit = {
-    saveAsTable(tableName, SaveMode.ErrorIfExists)
+    write.mode(SaveMode.ErrorIfExists).saveAsTable(tableName)
   }
 
   /**
-   * :: Experimental ::
    * Creates a table from the the contents of this DataFrame, using the default data source
    * configured by spark.sql.sources.default and [[SaveMode.ErrorIfExists]] as the save mode.
    *
@@ -1340,20 +1349,18 @@ class DataFrame private[sql](
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.mode(mode).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, mode: SaveMode): Unit = {
     if (sqlContext.catalog.tableExists(Seq(tableName)) && mode == SaveMode.Append) {
       // If table already exists and the save mode is Append,
       // we will just call insertInto to append the contents of this DataFrame.
       insertInto(tableName, overwrite = false)
     } else {
-      val dataSourceName = sqlContext.conf.defaultDataSourceName
-      saveAsTable(tableName, dataSourceName, mode)
+      write.mode(mode).saveAsTable(tableName)
     }
   }
 
   /**
-   * :: Experimental ::
    * Creates a table at the given path from the the contents of this DataFrame
    * based on a given data source and a set of options,
    * using [[SaveMode.ErrorIfExists]] as the save mode.
@@ -1368,9 +1375,9 @@ class DataFrame private[sql](
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.format(source).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, source: String): Unit = {
-    saveAsTable(tableName, source, SaveMode.ErrorIfExists)
+    write.format(source).saveAsTable(tableName)
   }
 
   /**
@@ -1388,13 +1395,12 @@ class DataFrame private[sql](
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.format(source).mode(mode).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, source: String, mode: SaveMode): Unit = {
-    saveAsTable(tableName, source, mode, Map.empty[String, String])
+    write.format(source).mode(mode).saveAsTable(tableName)
   }
 
   /**
-   * :: Experimental ::
    * Creates a table at the given path from the the contents of this DataFrame
    * based on a given data source, [[SaveMode]] specified by mode, and a set of options.
    *
@@ -1408,40 +1414,17 @@ class DataFrame private[sql](
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.format(source).mode(mode).options(options).saveAsTable(tableName)",
+    "1.4.0")
   def saveAsTable(
       tableName: String,
       source: String,
       mode: SaveMode,
       options: java.util.Map[String, String]): Unit = {
-    saveAsTable(tableName, source, mode, options.toMap)
-  }
-
-  /**
-   * :: Experimental ::
-   * Creates a table at the given path from the the contents of this DataFrame
-   * based on a given data source, [[SaveMode]] specified by mode, a set of options, and a list of
-   * partition columns.
-   *
-   * Note that this currently only works with DataFrames that are created from a HiveContext as
-   * there is no notion of a persisted catalog in a standard SQL context.  Instead you can write
-   * an RDD out to a parquet file, and then register that file as a table.  This "table" can then
-   * be the target of an `insertInto`.
-   * @group output
-   * @since 1.4.0
-   */
-  @Experimental
-  def saveAsTable(
-      tableName: String,
-      source: String,
-      mode: SaveMode,
-      options: java.util.Map[String, String],
-      partitionColumns: java.util.List[String]): Unit = {
-    saveAsTable(tableName, source, mode, options.toMap, partitionColumns)
+    write.format(source).mode(mode).options(options).saveAsTable(tableName)
   }
 
   /**
-   * :: Experimental ::
    * (Scala-specific)
    * Creates a table from the the contents of this DataFrame based on a given data source,
    * [[SaveMode]] specified by mode, and a set of options.
@@ -1456,167 +1439,88 @@ class DataFrame private[sql](
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.format(source).mode(mode).options(options).saveAsTable(tableName)",
+    "1.4.0")
   def saveAsTable(
       tableName: String,
       source: String,
       mode: SaveMode,
       options: Map[String, String]): Unit = {
-    val cmd =
-      CreateTableUsingAsSelect(
-        tableName,
-        source,
-        temporary = false,
-        Array.empty[String],
-        mode,
-        options,
-        logicalPlan)
-
-    sqlContext.executePlan(cmd).toRdd
+    write.format(source).mode(mode).options(options).saveAsTable(tableName)
   }
 
   /**
-   * :: Experimental ::
-   * Creates a table at the given path from the the contents of this DataFrame
-   * based on a given data source, [[SaveMode]] specified by mode, a set of options, and a list of
-   * partition columns.
-   *
-   * Note that this currently only works with DataFrames that are created from a HiveContext as
-   * there is no notion of a persisted catalog in a standard SQL context.  Instead you can write
-   * an RDD out to a parquet file, and then register that file as a table.  This "table" can then
-   * be the target of an `insertInto`.
-   * @group output
-   * @since 1.4.0
-   */
-  @Experimental
-  def saveAsTable(
-      tableName: String,
-      source: String,
-      mode: SaveMode,
-      options: Map[String, String],
-      partitionColumns: Seq[String]): Unit = {
-    sqlContext.executePlan(
-      CreateTableUsingAsSelect(
-        tableName,
-        source,
-        temporary = false,
-        partitionColumns.toArray,
-        mode,
-        options,
-        logicalPlan)).toRdd
-  }
-
-  /**
-   * :: Experimental ::
    * Saves the contents of this DataFrame to the given path,
    * using the default data source configured by spark.sql.sources.default and
    * [[SaveMode.ErrorIfExists]] as the save mode.
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.save(path)", "1.4.0")
   def save(path: String): Unit = {
-    save(path, SaveMode.ErrorIfExists)
+    write.save(path)
   }
 
   /**
-   * :: Experimental ::
    * Saves the contents of this DataFrame to the given path and [[SaveMode]] specified by mode,
    * using the default data source configured by spark.sql.sources.default.
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.mode(mode).save(path)", "1.4.0")
   def save(path: String, mode: SaveMode): Unit = {
-    val dataSourceName = sqlContext.conf.defaultDataSourceName
-    save(path, dataSourceName, mode)
+    write.mode(mode).save(path)
   }
 
   /**
-   * :: Experimental ::
    * Saves the contents of this DataFrame to the given path based on the given data source,
    * using [[SaveMode.ErrorIfExists]] as the save mode.
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.format(source).save(path)", "1.4.0")
   def save(path: String, source: String): Unit = {
-    save(source, SaveMode.ErrorIfExists, Map("path" -> path))
+    write.format(source).save(path)
   }
 
   /**
-   * :: Experimental ::
    * Saves the contents of this DataFrame to the given path based on the given data source and
    * [[SaveMode]] specified by mode.
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.format(source).mode(mode).save(path)", "1.4.0")
   def save(path: String, source: String, mode: SaveMode): Unit = {
-    save(source, mode, Map("path" -> path))
+    write.format(source).mode(mode).save(path)
   }
 
   /**
-   * :: Experimental ::
    * Saves the contents of this DataFrame based on the given data source,
    * [[SaveMode]] specified by mode, and a set of options.
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.format(source).mode(mode).options(options).save()", "1.4.0")
   def save(
       source: String,
       mode: SaveMode,
       options: java.util.Map[String, String]): Unit = {
-    save(source, mode, options.toMap)
+    write.format(source).mode(mode).options(options).save()
   }
 
   /**
-   * :: Experimental ::
-   * Saves the contents of this DataFrame to the given path based on the given data source,
-   * [[SaveMode]] specified by mode, and partition columns specified by `partitionColumns`.
-   * @group output
-   * @since 1.4.0
-   */
-  @Experimental
-  def save(
-      source: String,
-      mode: SaveMode,
-      options: java.util.Map[String, String],
-      partitionColumns: java.util.List[String]): Unit = {
-    save(source, mode, options.toMap, partitionColumns)
-  }
-
-  /**
-   * :: Experimental ::
    * (Scala-specific)
    * Saves the contents of this DataFrame based on the given data source,
    * [[SaveMode]] specified by mode, and a set of options
    * @group output
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use write.format(source).mode(mode).options(options).save()", "1.4.0")
   def save(
       source: String,
       mode: SaveMode,
       options: Map[String, String]): Unit = {
-    ResolvedDataSource(sqlContext, source, Array.empty[String], mode, options, this)
-  }
-
-  /**
-   * :: Experimental ::
-   * Saves the contents of this DataFrame to the given path based on the given data source,
-   * [[SaveMode]] specified by mode, and partition columns specified by `partitionColumns`.
-   * @group output
-   * @since 1.4.0
-   */
-  @Experimental
-  def save(
-      source: String,
-      mode: SaveMode,
-      options: Map[String, String],
-      partitionColumns: Seq[String]): Unit = {
-    ResolvedDataSource(sqlContext, source, partitionColumns.toArray, mode, options, this)
+    write.format(source).mode(mode).options(options).save()
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
new file mode 100644
index 0000000000000..4d63faad6fb7c
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -0,0 +1,218 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaRDD
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.json.{JsonRDD, JSONRelation}
+import org.apache.spark.sql.parquet.ParquetRelation2
+import org.apache.spark.sql.sources.{LogicalRelation, ResolvedDataSource}
+import org.apache.spark.sql.types.StructType
+
+/**
+ * :: Experimental ::
+ * Interface used to load a [[DataFrame]] from external storage systems (e.g. file systems,
+ * key-value stores, etc).
+ *
+ * @since 1.4.0
+ */
+@Experimental
+class DataFrameReader private[sql](sqlContext: SQLContext) {
+
+  /**
+   * Specifies the input data source format.
+   *
+   * @since 1.4.0
+   */
+  def format(source: String): DataFrameReader = {
+    this.source = source
+    this
+  }
+
+  /**
+   * Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema
+   * automatically from data. By specifying the schema here, the underlying data source can
+   * skip the schema inference step, and thus speed up data loading.
+   *
+   * @since 1.4.0
+   */
+  def schema(schema: StructType): DataFrameReader = {
+    this.userSpecifiedSchema = Option(schema)
+    this
+  }
+
+  /**
+   * Adds an input option for the underlying data source.
+   *
+   * @since 1.4.0
+   */
+  def option(key: String, value: String): DataFrameReader = {
+    this.extraOptions += (key -> value)
+    this
+  }
+
+  /**
+   * (Scala-specific) Adds input options for the underlying data source.
+   *
+   * @since 1.4.0
+   */
+  def options(options: scala.collection.Map[String, String]): DataFrameReader = {
+    this.extraOptions ++= options
+    this
+  }
+
+  /**
+   * Adds input options for the underlying data source.
+   *
+   * @since 1.4.0
+   */
+  def options(options: java.util.Map[String, String]): DataFrameReader = {
+    this.options(scala.collection.JavaConversions.mapAsScalaMap(options))
+    this
+  }
+
+  /**
+   * Specifies the input partitioning. If specified, the underlying data source does not need to
+   * discover the data partitioning scheme, and thus can speed up very large inputs.
+   *
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def partitionBy(colNames: String*): DataFrameReader = {
+    this.partitioningColumns = Option(colNames)
+    this
+  }
+
+  /**
+   * Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by
+   * a local or distributed file system).
+   *
+   * @since 1.4.0
+   */
+  def load(path: String): DataFrame = {
+    option("path", path).load()
+  }
+
+  /**
+   * Loads input in as a [[DataFrame]], for data sources that don't require a path (e.g. external
+   * key-value stores).
+   *
+   * @since 1.4.0
+   */
+  def load(): DataFrame = {
+    val resolved = ResolvedDataSource(
+      sqlContext,
+      userSpecifiedSchema = userSpecifiedSchema,
+      partitionColumns = partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
+      provider = source,
+      options = extraOptions.toMap)
+    DataFrame(sqlContext, LogicalRelation(resolved.relation))
+  }
+
+  /**
+   * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]].
+   *
+   * This function goes through the input once to determine the input schema. If you know the
+   * schema in advance, use the version that specifies the schema to avoid the extra scan.
+   *
+   * @param path input path
+   * @since 1.4.0
+   */
+  def json(path: String): DataFrame = format("json").load(path)
+
+  /**
+   * Loads an `JavaRDD[String]` storing JSON objects (one object per record) and
+   * returns the result as a [[DataFrame]].
+   *
+   * Unless the schema is specified using [[schema]] function, this function goes through the
+   * input once to determine the input schema.
+   *
+   * @param jsonRDD input RDD with one JSON object per record
+   * @since 1.4.0
+   */
+  def json(jsonRDD: JavaRDD[String]): DataFrame = json(jsonRDD.rdd)
+
+  /**
+   * Loads an `RDD[String]` storing JSON objects (one object per record) and
+   * returns the result as a [[DataFrame]].
+   *
+   * Unless the schema is specified using [[schema]] function, this function goes through the
+   * input once to determine the input schema.
+   *
+   * @param jsonRDD input RDD with one JSON object per record
+   * @since 1.4.0
+   */
+  def json(jsonRDD: RDD[String]): DataFrame = {
+    val samplingRatio = extraOptions.getOrElse("samplingRatio", "1.0").toDouble
+    if (sqlContext.conf.useJacksonStreamingAPI) {
+      sqlContext.baseRelationToDataFrame(
+        new JSONRelation(() => jsonRDD, None, samplingRatio, userSpecifiedSchema)(sqlContext))
+    } else {
+      val columnNameOfCorruptJsonRecord = sqlContext.conf.columnNameOfCorruptRecord
+      val appliedSchema = userSpecifiedSchema.getOrElse(
+        JsonRDD.nullTypeToStringType(
+          JsonRDD.inferSchema(jsonRDD, 1.0, columnNameOfCorruptJsonRecord)))
+      val rowRDD = JsonRDD.jsonStringToRow(jsonRDD, appliedSchema, columnNameOfCorruptJsonRecord)
+      sqlContext.createDataFrame(rowRDD, appliedSchema, needsConversion = false)
+    }
+  }
+
+  /**
+   * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty
+   * [[DataFrame]] if no paths are passed in.
+   *
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def parquet(paths: String*): DataFrame = {
+    if (paths.isEmpty) {
+      sqlContext.emptyDataFrame
+    } else {
+      val globbedPaths = paths.map(new Path(_)).flatMap(SparkHadoopUtil.get.globPath).toArray
+      sqlContext.baseRelationToDataFrame(
+        new ParquetRelation2(
+          globbedPaths.map(_.toString), None, None, Map.empty[String, String])(sqlContext))
+    }
+  }
+
+  /**
+   * Returns the specified table as a [[DataFrame]].
+   *
+   * @since 1.4.0
+   */
+  def table(tableName: String): DataFrame = {
+    DataFrame(sqlContext, sqlContext.catalog.lookupRelation(Seq(tableName)))
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Builder pattern config options
+  ///////////////////////////////////////////////////////////////////////////////////////
+
+  private var source: String = sqlContext.conf.defaultDataSourceName
+
+  private var userSpecifiedSchema: Option[StructType] = None
+
+  private var extraOptions = new scala.collection.mutable.HashMap[String, String]
+
+  private var partitioningColumns: Option[Seq[String]] = None
+
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
new file mode 100644
index 0000000000000..b1fc18ac3cb54
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -0,0 +1,198 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.sources.{ResolvedDataSource, CreateTableUsingAsSelect}
+
+
+/**
+ * :: Experimental ::
+ * Interface used to write a [[DataFrame]] to external storage systems (e.g. file systems,
+ * key-value stores, etc).
+ *
+ * @since 1.4.0
+ */
+@Experimental
+final class DataFrameWriter private[sql](df: DataFrame) {
+
+  /**
+   * Specifies the behavior when data or table already exists. Options include:
+   *   - `SaveMode.Overwrite`: overwrite the existing data.
+   *   - `SaveMode.Append`: append the data.
+   *   - `SaveMode.Ignore`: ignore the operation (i.e. no-op).
+   *   - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
+   *
+   * @since 1.4.0
+   */
+  def mode(saveMode: SaveMode): DataFrameWriter = {
+    this.mode = saveMode
+    this
+  }
+
+  /**
+   * Specifies the behavior when data or table already exists. Options include:
+   *   - `overwrite`: overwrite the existing data.
+   *   - `append`: append the data.
+   *   - `ignore`: ignore the operation (i.e. no-op).
+   *   - `error`: default option, throw an exception at runtime.
+   *
+   * @since 1.4.0
+   */
+  def mode(saveMode: String): DataFrameWriter = {
+    saveMode.toLowerCase match {
+      case "overwrite" => SaveMode.Overwrite
+      case "append" => SaveMode.Append
+      case "ignore" => SaveMode.Ignore
+      case "error" | "default" => SaveMode.ErrorIfExists
+      case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. " +
+        "Accepted modes are 'overwrite', 'append', 'ignore', 'error'.")
+    }
+    this
+  }
+
+  /**
+   * Specifies the underlying output data source. Built-in options include "parquet", "json", etc.
+   *
+   * @since 1.4.0
+   */
+  def format(source: String): DataFrameWriter = {
+    this.source = source
+    this
+  }
+
+  /**
+   * Adds an output option for the underlying data source.
+   *
+   * @since 1.4.0
+   */
+  def option(key: String, value: String): DataFrameWriter = {
+    this.extraOptions += (key -> value)
+    this
+  }
+
+  /**
+   * (Scala-specific) Adds output options for the underlying data source.
+   *
+   * @since 1.4.0
+   */
+  def options(options: scala.collection.Map[String, String]): DataFrameWriter = {
+    this.extraOptions ++= options
+    this
+  }
+
+  /**
+   * Adds output options for the underlying data source.
+   *
+   * @since 1.4.0
+   */
+  def options(options: java.util.Map[String, String]): DataFrameWriter = {
+    this.options(scala.collection.JavaConversions.mapAsScalaMap(options))
+    this
+  }
+
+  /**
+   * Partitions the output by the given columns on the file system. If specified, the output is
+   * laid out on the file system similar to Hive's partitioning scheme.
+   *
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def partitionBy(colNames: String*): DataFrameWriter = {
+    this.partitioningColumns = Option(colNames)
+    this
+  }
+
+  /**
+   * Saves the content of the [[DataFrame]] at the specified path.
+   *
+   * @since 1.4.0
+   */
+  def save(path: String): Unit = {
+    this.extraOptions += ("path" -> path)
+    save()
+  }
+
+  /**
+   * Saves the content of the [[DataFrame]] as the specified table.
+   *
+   * @since 1.4.0
+   */
+  def save(): Unit = {
+    ResolvedDataSource(
+      df.sqlContext,
+      source,
+      partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
+      mode,
+      extraOptions.toMap,
+      df)
+  }
+
+  /**
+   * Saves the content of the [[DataFrame]] as the specified table.
+   *
+   * @since 1.4.0
+   */
+  def saveAsTable(tableName: String): Unit = {
+    val cmd =
+      CreateTableUsingAsSelect(
+        tableName,
+        source,
+        temporary = false,
+        partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
+        mode,
+        extraOptions.toMap,
+        df.logicalPlan)
+    df.sqlContext.executePlan(cmd).toRdd
+  }
+
+  /**
+   * Saves the content of the [[DataFrame]] in JSON format at the specified path.
+   * This is equivalent to:
+   * {{{
+   *   format("json").save(path)
+   * }}}
+   *
+   * @since 1.4.0
+   */
+  def json(path: String): Unit = format("json").save(path)
+
+  /**
+   * Saves the content of the [[DataFrame]] in Parquet format at the specified path.
+   * This is equivalent to:
+   * {{{
+   *   format("parquet").save(path)
+   * }}}
+   *
+   * @since 1.4.0
+   */
+  def parquet(path: String): Unit = format("parquet").save(path)
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Builder pattern config options
+  ///////////////////////////////////////////////////////////////////////////////////////
+
+  private var source: String = df.sqlContext.conf.defaultDataSourceName
+
+  private var mode: SaveMode = SaveMode.ErrorIfExists
+
+  private var extraOptions = new scala.collection.mutable.HashMap[String, String]
+
+  private var partitioningColumns: Option[Seq[String]] = None
+
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 9fb355eb81939..34a50e522c4ca 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -27,11 +27,9 @@ import scala.reflect.runtime.universe.TypeTag
 import scala.util.control.NonFatal
 
 import com.google.common.reflect.TypeToken
-import org.apache.hadoop.fs.Path
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
-import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst._
 import org.apache.spark.sql.catalyst.analysis._
@@ -43,8 +41,6 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor
 import org.apache.spark.sql.catalyst.ParserDialect
 import org.apache.spark.sql.execution.{Filter, _}
 import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation}
-import org.apache.spark.sql.json._
-import org.apache.spark.sql.parquet.ParquetRelation2
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -596,6 +592,20 @@ class SQLContext(@transient val sparkContext: SparkContext)
     createDataFrame(rdd, beanClass)
   }
 
+  /**
+   * :: Experimental ::
+   * Returns a [[DataFrameReader]] that can be used to read data in as a [[DataFrame]].
+   * {{{
+   *   sqlContext.read.parquet("/path/to/file.parquet")
+   *   sqlContext.read.schema(schema).json("/path/to/file.json")
+   * }}}
+   *
+   * @group genericdata
+   * @since 1.4.0
+   */
+  @Experimental
+  def read: DataFrameReader = new DataFrameReader(this)
+
   /**
    * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty
    * [[DataFrame]] if no paths are passed in.
@@ -603,15 +613,13 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group specificdata
    * @since 1.3.0
    */
+  @deprecated("Use read.parquet()", "1.4.0")
   @scala.annotation.varargs
   def parquetFile(paths: String*): DataFrame = {
     if (paths.isEmpty) {
       emptyDataFrame
     } else if (conf.parquetUseDataSourceApi) {
-      val globbedPaths = paths.map(new Path(_)).flatMap(SparkHadoopUtil.get.globPath).toArray
-      baseRelationToDataFrame(
-        new ParquetRelation2(
-          globbedPaths.map(_.toString), None, None, Map.empty[String, String])(this))
+      read.parquet(paths : _*)
     } else {
       DataFrame(this, parquet.ParquetRelation(
         paths.mkString(","), Some(sparkContext.hadoopConfiguration), this))
@@ -625,28 +633,31 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group specificdata
    * @since 1.3.0
    */
-  def jsonFile(path: String): DataFrame = jsonFile(path, 1.0)
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonFile(path: String): DataFrame = {
+    read.json(path)
+  }
 
   /**
-   * :: Experimental ::
    * Loads a JSON file (one object per line) and applies the given schema,
    * returning the result as a [[DataFrame]].
    *
    * @group specificdata
    * @since 1.3.0
    */
-  @Experimental
-  def jsonFile(path: String, schema: StructType): DataFrame =
-    load("json", schema, Map("path" -> path))
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonFile(path: String, schema: StructType): DataFrame = {
+    read.schema(schema).json(path)
+  }
 
   /**
-   * :: Experimental ::
    * @group specificdata
    * @since 1.3.0
    */
-  @Experimental
-  def jsonFile(path: String, samplingRatio: Double): DataFrame =
-    load("json", Map("path" -> path, "samplingRatio" -> samplingRatio.toString))
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonFile(path: String, samplingRatio: Double): DataFrame = {
+    read.option("samplingRatio", samplingRatio.toString).json(path)
+  }
 
   /**
    * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a
@@ -656,8 +667,8 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group specificdata
    * @since 1.3.0
    */
-  def jsonRDD(json: RDD[String]): DataFrame = jsonRDD(json, 1.0)
-
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonRDD(json: RDD[String]): DataFrame = read.json(json)
 
   /**
    * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a
@@ -667,196 +678,131 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group specificdata
    * @since 1.3.0
    */
-  def jsonRDD(json: JavaRDD[String]): DataFrame = jsonRDD(json.rdd, 1.0)
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonRDD(json: JavaRDD[String]): DataFrame = read.json(json)
 
   /**
-   * :: Experimental ::
    * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema,
    * returning the result as a [[DataFrame]].
    *
    * @group specificdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: RDD[String], schema: StructType): DataFrame = {
-    if (conf.useJacksonStreamingAPI) {
-      baseRelationToDataFrame(new JSONRelation(() => json, None, 1.0, Some(schema))(this))
-    } else {
-      val columnNameOfCorruptJsonRecord = conf.columnNameOfCorruptRecord
-      val appliedSchema =
-        Option(schema).getOrElse(
-          JsonRDD.nullTypeToStringType(
-            JsonRDD.inferSchema(json, 1.0, columnNameOfCorruptJsonRecord)))
-      val rowRDD = JsonRDD.jsonStringToRow(json, appliedSchema, columnNameOfCorruptJsonRecord)
-      createDataFrame(rowRDD, appliedSchema, needsConversion = false)
-    }
+    read.schema(schema).json(json)
   }
 
   /**
-   * :: Experimental ::
    * Loads an JavaRDD<String> storing JSON objects (one object per record) and applies the given
    * schema, returning the result as a [[DataFrame]].
    *
    * @group specificdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: JavaRDD[String], schema: StructType): DataFrame = {
-    jsonRDD(json.rdd, schema)
+    read.schema(schema).json(json)
   }
 
   /**
-   * :: Experimental ::
    * Loads an RDD[String] storing JSON objects (one object per record) inferring the
    * schema, returning the result as a [[DataFrame]].
    *
    * @group specificdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = {
-    if (conf.useJacksonStreamingAPI) {
-      baseRelationToDataFrame(new JSONRelation(() => json, None, samplingRatio, None)(this))
-    } else {
-      val columnNameOfCorruptJsonRecord = conf.columnNameOfCorruptRecord
-      val appliedSchema =
-        JsonRDD.nullTypeToStringType(
-          JsonRDD.inferSchema(json, samplingRatio, columnNameOfCorruptJsonRecord))
-      val rowRDD = JsonRDD.jsonStringToRow(json, appliedSchema, columnNameOfCorruptJsonRecord)
-      createDataFrame(rowRDD, appliedSchema, needsConversion = false)
-    }
+    read.option("samplingRatio", samplingRatio.toString).json(json)
   }
 
   /**
-   * :: Experimental ::
    * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the
    * schema, returning the result as a [[DataFrame]].
    *
    * @group specificdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: JavaRDD[String], samplingRatio: Double): DataFrame = {
-    jsonRDD(json.rdd, samplingRatio);
+    read.option("samplingRatio", samplingRatio.toString).json(json)
   }
 
   /**
-   * :: Experimental ::
    * Returns the dataset stored at path as a DataFrame,
    * using the default data source configured by spark.sql.sources.default.
    *
    * @group genericdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.load(path)", "1.4.0")
   def load(path: String): DataFrame = {
-    val dataSourceName = conf.defaultDataSourceName
-    load(path, dataSourceName)
+    read.load(path)
   }
 
   /**
-   * :: Experimental ::
    * Returns the dataset stored at path as a DataFrame, using the given data source.
    *
    * @group genericdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.format(source).load(path)", "1.4.0")
   def load(path: String, source: String): DataFrame = {
-    load(source, Map("path" -> path))
+    read.format(source).load(path)
   }
 
   /**
-   * :: Experimental ::
    * (Java-specific) Returns the dataset specified by the given data source and
    * a set of options as a DataFrame.
    *
    * @group genericdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.format(source).options(options).load()", "1.4.0")
   def load(source: String, options: java.util.Map[String, String]): DataFrame = {
-    load(source, options.toMap)
+    read.options(options).format(source).load()
   }
 
   /**
-   * :: Experimental ::
    * (Scala-specific) Returns the dataset specified by the given data source and
    * a set of options as a DataFrame.
    *
    * @group genericdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.format(source).options(options).load()", "1.4.0")
   def load(source: String, options: Map[String, String]): DataFrame = {
-    val resolved = ResolvedDataSource(this, None, Array.empty[String], source, options)
-    DataFrame(this, LogicalRelation(resolved.relation))
-  }
-
-  /**
-   * :: Experimental ::
-   * (Java-specific) Returns the dataset specified by the given data source and
-   * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
-   *
-   * @group genericdata
-   * @since 1.3.0
-   */
-  @Experimental
-  def load(
-      source: String,
-      schema: StructType,
-      options: java.util.Map[String, String]): DataFrame = {
-    load(source, schema, options.toMap)
+    read.options(options).format(source).load()
   }
 
   /**
-   * :: Experimental ::
    * (Java-specific) Returns the dataset specified by the given data source and
    * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
    *
    * @group genericdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.format(source).schema(schema).options(options).load()", "1.4.0")
   def load(
       source: String,
       schema: StructType,
-      partitionColumns: Array[String],
       options: java.util.Map[String, String]): DataFrame = {
-    load(source, schema, partitionColumns, options.toMap)
-  }
-
-  /**
-   * :: Experimental ::
-   * (Scala-specific) Returns the dataset specified by the given data source and
-   * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
-   * @group genericdata
-   * @since 1.3.0
-   */
-  @Experimental
-  def load(
-      source: String,
-      schema: StructType,
-      options: Map[String, String]): DataFrame = {
-    val resolved = ResolvedDataSource(this, Some(schema), Array.empty[String], source, options)
-    DataFrame(this, LogicalRelation(resolved.relation))
+    read.format(source).schema(schema).options(options).load()
   }
 
   /**
-   * :: Experimental ::
    * (Scala-specific) Returns the dataset specified by the given data source and
    * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
    * @group genericdata
    * @since 1.3.0
    */
-  @Experimental
+  @deprecated("Use read.format(source).schema(schema).options(options).load()", "1.4.0")
   def load(
       source: String,
       schema: StructType,
-      partitionColumns: Array[String],
       options: Map[String, String]): DataFrame = {
-    val resolved = ResolvedDataSource(this, Some(schema), partitionColumns, source, options)
-    DataFrame(this, LogicalRelation(resolved.relation))
+    read.format(source).schema(schema).options(options).load()
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala
index 9d17516e0ef7d..7a73b6f1ac601 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala
@@ -90,7 +90,7 @@ private[sql] trait ParquetTest {
       (data: Seq[T])
       (f: String => Unit): Unit = {
     withTempPath { file =>
-      sparkContext.parallelize(data).toDF().saveAsParquetFile(file.getCanonicalPath)
+      sparkContext.parallelize(data).toDF().write.parquet(file.getCanonicalPath)
       f(file.getCanonicalPath)
     }
   }
@@ -102,7 +102,7 @@ private[sql] trait ParquetTest {
   protected def withParquetDataFrame[T <: Product: ClassTag: TypeTag]
       (data: Seq[T])
       (f: DataFrame => Unit): Unit = {
-    withParquetFile(data)(path => f(sqlContext.parquetFile(path)))
+    withParquetFile(data)(path => f(sqlContext.read.parquet(path)))
   }
 
   /**
@@ -128,12 +128,12 @@ private[sql] trait ParquetTest {
 
   protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
       data: Seq[T], path: File): Unit = {
-    data.toDF().save(path.getCanonicalPath, "org.apache.spark.sql.parquet", SaveMode.Overwrite)
+    data.toDF().write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
   }
 
   protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
       df: DataFrame, path: File): Unit = {
-    df.save(path.getCanonicalPath, "org.apache.spark.sql.parquet", SaveMode.Overwrite)
+    df.write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
   }
 
   protected def makePartitionDir(
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
index b76f7d421f643..6a0bcefe7aa88 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
@@ -75,9 +75,9 @@ public void setUp() throws IOException {
   public void saveAndLoad() {
     Map<String, String> options = new HashMap<String, String>();
     options.put("path", path.toString());
-    df.save("org.apache.spark.sql.json", SaveMode.ErrorIfExists, options);
+    df.save("json", SaveMode.ErrorIfExists, options);
 
-    DataFrame loadedDF = sqlContext.load("org.apache.spark.sql.json", options);
+    DataFrame loadedDF = sqlContext.read().format("json").options(options).load();
 
     checkAnswer(loadedDF, df.collectAsList());
   }
@@ -86,12 +86,12 @@ public void saveAndLoad() {
   public void saveAndLoadWithSchema() {
     Map<String, String> options = new HashMap<String, String>();
     options.put("path", path.toString());
-    df.save("org.apache.spark.sql.json", SaveMode.ErrorIfExists, options);
+    df.save("json", SaveMode.ErrorIfExists, options);
 
     List<StructField> fields = new ArrayList<StructField>();
     fields.add(DataTypes.createStructField("b", DataTypes.StringType, true));
     StructType schema = DataTypes.createStructType(fields);
-    DataFrame loadedDF = sqlContext.load("org.apache.spark.sql.json", schema, options);
+    DataFrame loadedDF = sqlContext.load("json", schema, options);
 
     checkAnswer(loadedDF, sqlContext.sql("SELECT b FROM jsonTable").collectAsList());
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 1d5f6b3aad6fd..054b23dba84c5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -460,14 +460,14 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("SPARK-7551: support backticks for DataFrame attribute resolution") {
-    val df = TestSQLContext.jsonRDD(TestSQLContext.sparkContext.makeRDD(
+    val df = TestSQLContext.read.json(TestSQLContext.sparkContext.makeRDD(
       """{"a.b": {"c": {"d..e": {"f": 1}}}}""" :: Nil))
     checkAnswer(
       df.select(df("`a.b`.c.`d..e`.`f`")),
       Row(1)
     )
 
-    val df2 = TestSQLContext.jsonRDD(TestSQLContext.sparkContext.makeRDD(
+    val df2 = TestSQLContext.read.json(TestSQLContext.sparkContext.makeRDD(
       """{"a  b": {"c": {"d  e": {"f": 1}}}}""" :: Nil))
     checkAnswer(
       df2.select(df2("`a  b`.c.d  e.f")),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 479ad9fe621d0..c5c4f448a7224 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -105,7 +105,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("grouping on nested fields") {
-    jsonRDD(sparkContext.parallelize("""{"nested": {"attribute": 1}, "value": 2}""" :: Nil))
+    read.json(sparkContext.parallelize("""{"nested": {"attribute": 1}, "value": 2}""" :: Nil))
      .registerTempTable("rows")
 
     checkAnswer(
@@ -122,7 +122,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("SPARK-6201 IN type conversion") {
-    jsonRDD(sparkContext.parallelize(Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}")))
+    read.json(
+      sparkContext.parallelize(Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}")))
       .registerTempTable("d")
 
     checkAnswer(
@@ -1199,7 +1200,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   test("SPARK-3483 Special chars in column names") {
     val data = sparkContext.parallelize(
       Seq("""{"key?number1": "value1", "key.number2": "value2"}"""))
-    jsonRDD(data).registerTempTable("records")
+    read.json(data).registerTempTable("records")
     sql("SELECT `key?number1`, `key.number2` FROM records")
   }
 
@@ -1240,11 +1241,11 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("SPARK-4322 Grouping field with struct field as sub expression") {
-    jsonRDD(sparkContext.makeRDD("""{"a": {"b": [{"c": 1}]}}""" :: Nil)).registerTempTable("data")
+    read.json(sparkContext.makeRDD("""{"a": {"b": [{"c": 1}]}}""" :: Nil)).registerTempTable("data")
     checkAnswer(sql("SELECT a.b[0].c FROM data GROUP BY a.b[0].c"), Row(1))
     dropTempTable("data")
 
-    jsonRDD(sparkContext.makeRDD("""{"a": {"b": 1}}""" :: Nil)).registerTempTable("data")
+    read.json(sparkContext.makeRDD("""{"a": {"b": 1}}""" :: Nil)).registerTempTable("data")
     checkAnswer(sql("SELECT a.b + 1 FROM data GROUP BY a.b + 1"), Row(2))
     dropTempTable("data")
   }
@@ -1292,7 +1293,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("SPARK-6145: ORDER BY test for nested fields") {
-    jsonRDD(sparkContext.makeRDD("""{"a": {"b": 1, "a": {"a": 1}}, "c": [{"d": 1}]}""" :: Nil))
+    read.json(sparkContext.makeRDD("""{"a": {"b": 1, "a": {"a": 1}}, "c": [{"d": 1}]}""" :: Nil))
       .registerTempTable("nestedOrder")
 
     checkAnswer(sql("SELECT 1 FROM nestedOrder ORDER BY a.b"), Row(1))
@@ -1304,14 +1305,14 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("SPARK-6145: special cases") {
-    jsonRDD(sparkContext.makeRDD(
+    read.json(sparkContext.makeRDD(
       """{"a": {"b": [1]}, "b": [{"a": 1}], "c0": {"a": 1}}""" :: Nil)).registerTempTable("t")
     checkAnswer(sql("SELECT a.b[0] FROM t ORDER BY c0.a"), Row(1))
     checkAnswer(sql("SELECT b[0].a FROM t ORDER BY c0.a"), Row(1))
   }
 
   test("SPARK-6898: complete support for special chars in column names") {
-    jsonRDD(sparkContext.makeRDD(
+    read.json(sparkContext.makeRDD(
       """{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""" :: Nil))
       .registerTempTable("t")
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
index 2672e20deadc5..dc2d43a197f40 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
@@ -105,13 +105,13 @@ class UserDefinedTypeSuite extends QueryTest {
   test("UDTs with Parquet") {
     val tempDir = Utils.createTempDir()
     tempDir.delete()
-    pointsRDD.saveAsParquetFile(tempDir.getCanonicalPath)
+    pointsRDD.write.parquet(tempDir.getCanonicalPath)
   }
 
   test("Repartition UDTs with Parquet") {
     val tempDir = Utils.createTempDir()
     tempDir.delete()
-    pointsRDD.repartition(1).saveAsParquetFile(tempDir.getCanonicalPath)
+    pointsRDD.repartition(1).write.parquet(tempDir.getCanonicalPath)
   }
 
   // Tests to make sure that all operators correctly convert types on the way out.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index b06e3385980f7..6f747e5846f74 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -215,7 +215,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Complex field and type inferring with null in sampling") {
-    val jsonDF = jsonRDD(jsonNullStruct)
+    val jsonDF = read.json(jsonNullStruct)
     val expectedSchema = StructType(
       StructField("headers", StructType(
         StructField("Charset", StringType, true) ::
@@ -234,7 +234,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Primitive field and type inferring") {
-    val jsonDF = jsonRDD(primitiveFieldAndType)
+    val jsonDF = read.json(primitiveFieldAndType)
 
     val expectedSchema = StructType(
       StructField("bigInteger", DecimalType.Unlimited, true) ::
@@ -262,7 +262,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Complex field and type inferring") {
-    val jsonDF = jsonRDD(complexFieldAndType1)
+    val jsonDF = read.json(complexFieldAndType1)
 
     val expectedSchema = StructType(
       StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) ::
@@ -361,7 +361,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("GetField operation on complex data type") {
-    val jsonDF = jsonRDD(complexFieldAndType1)
+    val jsonDF = read.json(complexFieldAndType1)
     jsonDF.registerTempTable("jsonTable")
 
     checkAnswer(
@@ -377,7 +377,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Type conflict in primitive field values") {
-    val jsonDF = jsonRDD(primitiveFieldValueTypeConflict)
+    val jsonDF = read.json(primitiveFieldValueTypeConflict)
 
     val expectedSchema = StructType(
       StructField("num_bool", StringType, true) ::
@@ -451,7 +451,7 @@ class JsonSuite extends QueryTest {
   }
 
   ignore("Type conflict in primitive field values (Ignored)") {
-    val jsonDF = jsonRDD(primitiveFieldValueTypeConflict)
+    val jsonDF = read.json(primitiveFieldValueTypeConflict)
     jsonDF.registerTempTable("jsonTable")
 
     // Right now, the analyzer does not promote strings in a boolean expression.
@@ -504,7 +504,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Type conflict in complex field values") {
-    val jsonDF = jsonRDD(complexFieldValueTypeConflict)
+    val jsonDF = read.json(complexFieldValueTypeConflict)
 
     val expectedSchema = StructType(
       StructField("array", ArrayType(LongType, true), true) ::
@@ -528,7 +528,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Type conflict in array elements") {
-    val jsonDF = jsonRDD(arrayElementTypeConflict)
+    val jsonDF = read.json(arrayElementTypeConflict)
 
     val expectedSchema = StructType(
       StructField("array1", ArrayType(StringType, true), true) ::
@@ -556,7 +556,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Handling missing fields") {
-    val jsonDF = jsonRDD(missingFields)
+    val jsonDF = read.json(missingFields)
 
     val expectedSchema = StructType(
       StructField("a", BooleanType, true) ::
@@ -576,7 +576,7 @@ class JsonSuite extends QueryTest {
     dir.delete()
     val path = dir.getCanonicalPath
     sparkContext.parallelize(1 to 100).map(i => s"""{"a": 1, "b": "str$i"}""").saveAsTextFile(path)
-    val jsonDF = jsonFile(path, 0.49)
+    val jsonDF = read.option("samplingRatio", "0.49").json(path)
 
     val analyzed = jsonDF.queryExecution.analyzed
     assert(
@@ -591,7 +591,7 @@ class JsonSuite extends QueryTest {
 
     val schema = StructType(StructField("a", LongType, true) :: Nil)
     val logicalRelation =
-      jsonFile(path, schema).queryExecution.analyzed.asInstanceOf[LogicalRelation]
+      read.schema(schema).json(path).queryExecution.analyzed.asInstanceOf[LogicalRelation]
     val relationWithSchema = logicalRelation.relation.asInstanceOf[JSONRelation]
     assert(relationWithSchema.path === Some(path))
     assert(relationWithSchema.schema === schema)
@@ -603,7 +603,7 @@ class JsonSuite extends QueryTest {
     dir.delete()
     val path = dir.getCanonicalPath
     primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).saveAsTextFile(path)
-    val jsonDF = jsonFile(path)
+    val jsonDF = read.json(path)
 
     val expectedSchema = StructType(
       StructField("bigInteger", DecimalType.Unlimited, true) ::
@@ -672,7 +672,7 @@ class JsonSuite extends QueryTest {
       StructField("null", StringType, true) ::
       StructField("string", StringType, true) :: Nil)
 
-    val jsonDF1 = jsonFile(path, schema)
+    val jsonDF1 = read.schema(schema).json(path)
 
     assert(schema === jsonDF1.schema)
 
@@ -689,7 +689,7 @@ class JsonSuite extends QueryTest {
       "this is a simple string.")
     )
 
-    val jsonDF2 = jsonRDD(primitiveFieldAndType, schema)
+    val jsonDF2 = read.schema(schema).json(primitiveFieldAndType)
 
     assert(schema === jsonDF2.schema)
 
@@ -710,7 +710,7 @@ class JsonSuite extends QueryTest {
   test("Applying schemas with MapType") {
     val schemaWithSimpleMap = StructType(
       StructField("map", MapType(StringType, IntegerType, true), false) :: Nil)
-    val jsonWithSimpleMap = jsonRDD(mapType1, schemaWithSimpleMap)
+    val jsonWithSimpleMap = read.schema(schemaWithSimpleMap).json(mapType1)
 
     jsonWithSimpleMap.registerTempTable("jsonWithSimpleMap")
 
@@ -738,7 +738,7 @@ class JsonSuite extends QueryTest {
     val schemaWithComplexMap = StructType(
       StructField("map", MapType(StringType, innerStruct, true), false) :: Nil)
 
-    val jsonWithComplexMap = jsonRDD(mapType2, schemaWithComplexMap)
+    val jsonWithComplexMap = read.schema(schemaWithComplexMap).json(mapType2)
 
     jsonWithComplexMap.registerTempTable("jsonWithComplexMap")
 
@@ -764,7 +764,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("SPARK-2096 Correctly parse dot notations") {
-    val jsonDF = jsonRDD(complexFieldAndType2)
+    val jsonDF = read.json(complexFieldAndType2)
     jsonDF.registerTempTable("jsonTable")
 
     checkAnswer(
@@ -782,7 +782,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("SPARK-3390 Complex arrays") {
-    val jsonDF = jsonRDD(complexFieldAndType2)
+    val jsonDF = read.json(complexFieldAndType2)
     jsonDF.registerTempTable("jsonTable")
 
     checkAnswer(
@@ -805,7 +805,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("SPARK-3308 Read top level JSON arrays") {
-    val jsonDF = jsonRDD(jsonArray)
+    val jsonDF = read.json(jsonArray)
     jsonDF.registerTempTable("jsonTable")
 
     checkAnswer(
@@ -826,7 +826,7 @@ class JsonSuite extends QueryTest {
     val oldColumnNameOfCorruptRecord = TestSQLContext.conf.columnNameOfCorruptRecord
     TestSQLContext.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed")
 
-    val jsonDF = jsonRDD(corruptRecords)
+    val jsonDF = read.json(corruptRecords)
     jsonDF.registerTempTable("jsonTable")
 
     val schema = StructType(
@@ -880,7 +880,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("SPARK-4068: nulls in arrays") {
-    val jsonDF = jsonRDD(nullsInArrays)
+    val jsonDF = read.json(nullsInArrays)
     jsonDF.registerTempTable("jsonTable")
 
     val schema = StructType(
@@ -957,8 +957,8 @@ class JsonSuite extends QueryTest {
     assert(result2(1) === "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}")
     assert(result2(3) === "{\"f1\":{\"f11\":4,\"f12\":true},\"f2\":{\"D4\":2147483644}}")
 
-    val jsonDF = jsonRDD(primitiveFieldAndType)
-    val primTable = jsonRDD(jsonDF.toJSON)
+    val jsonDF = read.json(primitiveFieldAndType)
+    val primTable = read.json(jsonDF.toJSON)
     primTable.registerTempTable("primativeTable")
     checkAnswer(
         sql("select * from primativeTable"),
@@ -970,8 +970,8 @@ class JsonSuite extends QueryTest {
         "this is a simple string.")
       )
 
-    val complexJsonDF = jsonRDD(complexFieldAndType1)
-    val compTable = jsonRDD(complexJsonDF.toJSON)
+    val complexJsonDF = read.json(complexFieldAndType1)
+    val compTable = read.json(complexJsonDF.toJSON)
     compTable.registerTempTable("complexTable")
     // Access elements of a primitive array.
     checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
index 5ad439584716f..bdc2ebabc5e9a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
@@ -328,12 +328,12 @@ class ParquetDataSourceOnFilterSuite extends ParquetFilterSuiteBase with BeforeA
     withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED -> "true") {
       withTempPath { dir =>
         val path = s"${dir.getCanonicalPath}/part=1"
-        (1 to 3).map(i => (i, i.toString)).toDF("a", "b").saveAsParquetFile(path)
+        (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path)
 
         // If the "part = 1" filter gets pushed down, this query will throw an exception since
         // "part" is not a valid column in the actual Parquet file
         checkAnswer(
-          sqlContext.parquetFile(path).filter("part = 1"),
+          sqlContext.read.parquet(path).filter("part = 1"),
           (1 to 3).map(i => Row(i, i.toString, 1)))
       }
     }
@@ -357,7 +357,7 @@ class ParquetDataSourceOffFilterSuite extends ParquetFilterSuiteBase with Before
     withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED -> "true") {
       withTempPath { dir =>
         val path = s"${dir.getCanonicalPath}/part=1"
-        (1 to 3).map(i => (i, i.toString)).toDF("a", "b").saveAsParquetFile(path)
+        (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path)
 
         // If the "part = 1" filter gets pushed down, this query will throw an exception since
         // "part" is not a valid column in the actual Parquet file
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index 008443df216aa..dd48bb350f26d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -114,24 +114,24 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) {
       withTempPath { dir =>
         val data = makeDecimalRDD(DecimalType(precision, scale))
-        data.saveAsParquetFile(dir.getCanonicalPath)
-        checkAnswer(parquetFile(dir.getCanonicalPath), data.collect().toSeq)
+        data.write.parquet(dir.getCanonicalPath)
+        checkAnswer(read.parquet(dir.getCanonicalPath), data.collect().toSeq)
       }
     }
 
     // Decimals with precision above 18 are not yet supported
     intercept[Throwable] {
       withTempPath { dir =>
-        makeDecimalRDD(DecimalType(19, 10)).saveAsParquetFile(dir.getCanonicalPath)
-        parquetFile(dir.getCanonicalPath).collect()
+        makeDecimalRDD(DecimalType(19, 10)).write.parquet(dir.getCanonicalPath)
+        read.parquet(dir.getCanonicalPath).collect()
       }
     }
 
     // Unlimited-length decimals are not yet supported
     intercept[Throwable] {
       withTempPath { dir =>
-        makeDecimalRDD(DecimalType.Unlimited).saveAsParquetFile(dir.getCanonicalPath)
-        parquetFile(dir.getCanonicalPath).collect()
+        makeDecimalRDD(DecimalType.Unlimited).write.parquet(dir.getCanonicalPath)
+        read.parquet(dir.getCanonicalPath).collect()
       }
     }
   }
@@ -146,8 +146,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
 
     withTempPath { dir =>
       val data = makeDateRDD()
-      data.saveAsParquetFile(dir.getCanonicalPath)
-      checkAnswer(parquetFile(dir.getCanonicalPath), data.collect().toSeq)
+      data.write.parquet(dir.getCanonicalPath)
+      checkAnswer(read.parquet(dir.getCanonicalPath), data.collect().toSeq)
     }
   }
 
@@ -283,7 +283,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     withTempDir { dir =>
       val path = new Path(dir.toURI.toString, "part-r-0.parquet")
       makeRawParquetFile(path)
-      checkAnswer(parquetFile(path.toString), (0 until 10).map { i =>
+      checkAnswer(read.parquet(path.toString), (0 until 10).map { i =>
         Row(i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble)
       })
     }
@@ -311,8 +311,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
   test("save - overwrite") {
     withParquetFile((1 to 10).map(i => (i, i.toString))) { file =>
       val newData = (11 to 20).map(i => (i, i.toString))
-      newData.toDF().save("org.apache.spark.sql.parquet", SaveMode.Overwrite, Map("path" -> file))
-      checkAnswer(parquetFile(file), newData.map(Row.fromTuple))
+      newData.toDF().write.format("parquet").mode(SaveMode.Overwrite).save(file)
+      checkAnswer(read.parquet(file), newData.map(Row.fromTuple))
     }
   }
 
@@ -320,8 +320,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     val data = (1 to 10).map(i => (i, i.toString))
     withParquetFile(data) { file =>
       val newData = (11 to 20).map(i => (i, i.toString))
-      newData.toDF().save("org.apache.spark.sql.parquet", SaveMode.Ignore, Map("path" -> file))
-      checkAnswer(parquetFile(file), data.map(Row.fromTuple))
+      newData.toDF().write.format("parquet").mode(SaveMode.Ignore).save(file)
+      checkAnswer(read.parquet(file), data.map(Row.fromTuple))
     }
   }
 
@@ -330,8 +330,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     withParquetFile(data) { file =>
       val newData = (11 to 20).map(i => (i, i.toString))
       val errorMessage = intercept[Throwable] {
-        newData.toDF().save(
-          "org.apache.spark.sql.parquet", SaveMode.ErrorIfExists, Map("path" -> file))
+        newData.toDF().write.format("parquet").mode(SaveMode.ErrorIfExists).save(file)
       }.getMessage
       assert(errorMessage.contains("already exists"))
     }
@@ -341,8 +340,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     val data = (1 to 10).map(i => (i, i.toString))
     withParquetFile(data) { file =>
       val newData = (11 to 20).map(i => (i, i.toString))
-      newData.toDF().save("org.apache.spark.sql.parquet", SaveMode.Append, Map("path" -> file))
-      checkAnswer(parquetFile(file), (data ++ newData).map(Row.fromTuple))
+      newData.toDF().write.format("parquet").mode(SaveMode.Append).save(file)
+      checkAnswer(read.parquet(file), (data ++ newData).map(Row.fromTuple))
     }
   }
 
@@ -374,7 +373,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
         path,
         new Footer(path, new ParquetMetadata(fileMetadata, Nil)) :: Nil)
 
-      assertResult(parquetFile(path.toString).schema) {
+      assertResult(read.parquet(path.toString).schema) {
         StructType(
           StructField("a", BooleanType, nullable = false) ::
           StructField("b", IntegerType, nullable = false) ::
@@ -392,7 +391,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
       sqlContext.udf.register("div0", (x: Int) => x / 0)
       withTempPath { dir =>
         intercept[org.apache.spark.SparkException] {
-          sqlContext.sql("select div0(1)").saveAsParquetFile(dir.getCanonicalPath)
+          sqlContext.sql("select div0(1)").write.parquet(dir.getCanonicalPath)
         }
         val path = new Path(dir.getCanonicalPath, "_temporary")
         val fs = path.getFileSystem(configuration)
@@ -421,10 +420,10 @@ class ParquetDataSourceOnIOSuite extends ParquetIOSuiteBase with BeforeAndAfterA
     // In 1.3.0, save to fs other than file: without configuring core-site.xml would get:
     // IllegalArgumentException: Wrong FS: hdfs://..., expected: file:///
     intercept[Throwable] {
-      sqlContext.parquetFile("file:///nonexistent")
+      sqlContext.read.parquet("file:///nonexistent")
     }
     val errorMessage = intercept[Throwable] {
-      sqlContext.parquetFile("hdfs://nonexistent")
+      sqlContext.read.parquet("hdfs://nonexistent")
     }.toString
     assert(errorMessage.contains("UnknownHostException"))
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 138e19766dc88..8079c460713da 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -155,7 +155,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      parquetFile(base.getCanonicalPath).registerTempTable("t")
+      read.parquet(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -202,7 +202,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      parquetFile(base.getCanonicalPath).registerTempTable("t")
+      read.parquet(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -250,10 +250,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      val parquetRelation = load(
-        "org.apache.spark.sql.parquet",
-        Map("path" -> base.getCanonicalPath))
-
+      val parquetRelation = read.format("org.apache.spark.sql.parquet").load(base.getCanonicalPath)
       parquetRelation.registerTempTable("t")
 
       withTempTable("t") {
@@ -293,10 +290,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      val parquetRelation = load(
-        "org.apache.spark.sql.parquet",
-        Map("path" -> base.getCanonicalPath))
-
+      val parquetRelation = read.format("org.apache.spark.sql.parquet").load(base.getCanonicalPath)
       parquetRelation.registerTempTable("t")
 
       withTempTable("t") {
@@ -328,7 +322,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         (1 to 10).map(i => (i, i.toString)).toDF("intField", "stringField"),
         makePartitionDir(base, defaultPartitionName, "pi" -> 2))
 
-      load(base.getCanonicalPath, "org.apache.spark.sql.parquet").registerTempTable("t")
+      read.format("org.apache.spark.sql.parquet").load(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
index 4e54b2eb8df7a..d2d1011b8e917 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
@@ -33,7 +33,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll {
   override def beforeAll(): Unit = {
     path = Utils.createTempDir()
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    jsonRDD(rdd).registerTempTable("jt")
+    read.json(rdd).registerTempTable("jt")
   }
 
   override def afterAll(): Unit = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index d1d427e1790bd..6f375ef36237d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -33,7 +33,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
   override def beforeAll: Unit = {
     path = Utils.createTempDir()
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    jsonRDD(rdd).registerTempTable("jt")
+    read.json(rdd).registerTempTable("jt")
     sql(
       s"""
         |CREATE TEMPORARY TABLE jsonTable (a int, b string)
@@ -109,7 +109,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
 
     // Writing the table to less part files.
     val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""), 5)
-    jsonRDD(rdd1).registerTempTable("jt1")
+    read.json(rdd1).registerTempTable("jt1")
     sql(
       s"""
          |INSERT OVERWRITE TABLE jsonTable SELECT a, b FROM jt1
@@ -121,7 +121,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
 
     // Writing the table to more part files.
     val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""), 10)
-    jsonRDD(rdd2).registerTempTable("jt2")
+    read.json(rdd2).registerTempTable("jt2")
     sql(
       s"""
          |INSERT OVERWRITE TABLE jsonTable SELECT a, b FROM jt2
@@ -154,13 +154,13 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
   }
 
   test("save directly to the path of a JSON table") {
-    table("jt").selectExpr("a * 5 as a", "b").save(path.toString, "json", SaveMode.Overwrite)
+    table("jt").selectExpr("a * 5 as a", "b").write.mode(SaveMode.Overwrite).json(path.toString)
     checkAnswer(
       sql("SELECT a, b FROM jsonTable"),
       (1 to 10).map(i => Row(i * 5, s"str$i"))
     )
 
-    table("jt").save(path.toString, "json", SaveMode.Overwrite)
+    table("jt").write.mode(SaveMode.Overwrite).json(path.toString)
     checkAnswer(
       sql("SELECT a, b FROM jsonTable"),
       (1 to 10).map(i => Row(i, s"str$i"))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
index 6567d1acd7644..7a28e9af3673c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
@@ -42,7 +42,7 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll {
     path.delete()
 
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    df = jsonRDD(rdd)
+    df = read.json(rdd)
     df.registerTempTable("jsonTable")
   }
 
@@ -57,41 +57,41 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll {
 
   def checkLoad(): Unit = {
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
-    checkAnswer(load(path.toString), df.collect())
+    checkAnswer(read.load(path.toString), df.collect())
 
     // Test if we can pick up the data source name passed in load.
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    checkAnswer(load(path.toString, "org.apache.spark.sql.json"), df.collect())
-    checkAnswer(load("org.apache.spark.sql.json", Map("path" -> path.toString)), df.collect())
+    checkAnswer(read.format("json").load(path.toString), df.collect())
+    checkAnswer(read.format("json").load(path.toString), df.collect())
     val schema = StructType(StructField("b", StringType, true) :: Nil)
     checkAnswer(
-      load("org.apache.spark.sql.json", schema, Map("path" -> path.toString)),
+      read.format("json").schema(schema).load(path.toString),
       sql("SELECT b FROM jsonTable").collect())
   }
 
   test("save with path and load") {
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
-    df.save(path.toString)
+    df.write.save(path.toString)
     checkLoad()
   }
 
   test("save with path and datasource, and load") {
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    df.save(path.toString, "org.apache.spark.sql.json")
+    df.write.json(path.toString)
     checkLoad()
   }
 
   test("save with data source and options, and load") {
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    df.save("org.apache.spark.sql.json", SaveMode.ErrorIfExists, Map("path" -> path.toString))
+    df.write.mode(SaveMode.ErrorIfExists).json(path.toString)
     checkLoad()
   }
 
   test("save and save again") {
-    df.save(path.toString, "org.apache.spark.sql.json")
+    df.write.json(path.toString)
 
     var message = intercept[RuntimeException] {
-      df.save(path.toString, "org.apache.spark.sql.json")
+      df.write.json(path.toString)
     }.getMessage
 
     assert(
@@ -100,14 +100,14 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll {
 
     if (path.exists()) Utils.deleteRecursively(path)
 
-    df.save(path.toString, "org.apache.spark.sql.json")
+    df.write.json(path.toString)
     checkLoad()
 
-    df.save("org.apache.spark.sql.json", SaveMode.Overwrite, Map("path" -> path.toString))
+    df.write.mode(SaveMode.Overwrite).json(path.toString)
     checkLoad()
 
     message = intercept[RuntimeException] {
-      df.save("org.apache.spark.sql.json", SaveMode.Append, Map("path" -> path.toString))
+      df.write.mode(SaveMode.Append).json(path.toString)
     }.getMessage
 
     assert(
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index d46a127d47d31..c6b65106452bf 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -140,7 +140,7 @@ private[hive] trait HiveStrategies {
               PhysicalRDD(plan.output, sparkContext.emptyRDD[Row]) :: Nil
             } else {
               hiveContext
-                .parquetFile(partitionLocations: _*)
+                .read.parquet(partitionLocations: _*)
                 .addPartitioningAttributes(relation.partitionKeys)
                 .lowerCase
                 .where(unresolvedOtherPredicates)
@@ -152,7 +152,7 @@ private[hive] trait HiveStrategies {
 
           } else {
             hiveContext
-              .parquetFile(relation.hiveQlTable.getDataLocation.toString)
+              .read.parquet(relation.hiveQlTable.getDataLocation.toString)
               .lowerCase
               .where(unresolvedOtherPredicates)
               .select(unresolvedProjection: _*)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
index 7ff5719adb3ab..5a5ea10e3c82e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
@@ -55,8 +55,8 @@ class HiveParquetSuite extends QueryTest with ParquetTest {
 
     test(s"$prefix: Converting Hive to Parquet Table via saveAsParquetFile") {
       withTempPath { dir =>
-        sql("SELECT * FROM src").saveAsParquetFile(dir.getCanonicalPath)
-        parquetFile(dir.getCanonicalPath).registerTempTable("p")
+        sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath)
+        read.parquet(dir.getCanonicalPath).registerTempTable("p")
         withTempTable("p") {
           checkAnswer(
             sql("SELECT * FROM src ORDER BY key"),
@@ -68,8 +68,8 @@ class HiveParquetSuite extends QueryTest with ParquetTest {
     test(s"$prefix: INSERT OVERWRITE TABLE Parquet table") {
       withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") {
         withTempPath { file =>
-          sql("SELECT * FROM t LIMIT 1").saveAsParquetFile(file.getCanonicalPath)
-          parquetFile(file.getCanonicalPath).registerTempTable("p")
+          sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath)
+          read.parquet(file.getCanonicalPath).registerTempTable("p")
           withTempTable("p") {
             // let's do three overwrites for good measure
             sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 1bf1c1be3e3d3..58b0b80c31e2e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -60,7 +60,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
 
     checkAnswer(
       sql("SELECT * FROM jsonTable"),
-      jsonFile(filePath).collect().toSeq)
+      read.json(filePath).collect().toSeq)
   }
 
   test ("persistent JSON table with a user specified schema") {
@@ -77,7 +77,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
         |)
       """.stripMargin)
 
-    jsonFile(filePath).registerTempTable("expectedJsonTable")
+    read.json(filePath).registerTempTable("expectedJsonTable")
 
     checkAnswer(
       sql("SELECT a, b, `c_!@(3)`, `<d>`.`d!`, `<d>`.`=` FROM jsonTable"),
@@ -104,7 +104,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
 
     assert(expectedSchema === table("jsonTable").schema)
 
-    jsonFile(filePath).registerTempTable("expectedJsonTable")
+    read.json(filePath).registerTempTable("expectedJsonTable")
 
     checkAnswer(
       sql("SELECT b, `<d>`.`=` FROM jsonTable"),
@@ -123,7 +123,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
 
     checkAnswer(
       sql("SELECT * FROM jsonTable"),
-      jsonFile(filePath).collect().toSeq)
+      read.json(filePath).collect().toSeq)
   }
 
   test("drop table") {
@@ -138,7 +138,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
 
     checkAnswer(
       sql("SELECT * FROM jsonTable"),
-      jsonFile(filePath).collect().toSeq)
+      read.json(filePath).collect().toSeq)
 
     sql("DROP TABLE jsonTable")
 
@@ -241,7 +241,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
         |)
       """.stripMargin)
 
-    jsonFile(filePath).registerTempTable("expectedJsonTable")
+    read.json(filePath).registerTempTable("expectedJsonTable")
 
     checkAnswer(
       sql("SELECT * FROM jsonTable"),
@@ -474,7 +474,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     // Drop table will also delete the data.
     sql("DROP TABLE savedJsonTable")
     intercept[InvalidInputException] {
-      jsonFile(catalog.hiveDefaultTableFilePath("savedJsonTable"))
+      read.json(catalog.hiveDefaultTableFilePath("savedJsonTable"))
     }
 
     // Create an external table by specifying the path.
@@ -491,7 +491,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     // Data should not be deleted after we drop the table.
     sql("DROP TABLE savedJsonTable")
     checkAnswer(
-      jsonFile(tempPath.toString),
+      read.json(tempPath.toString),
       df.collect())
 
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource)
@@ -526,7 +526,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     // Data should not be deleted.
     sql("DROP TABLE createdJsonTable")
     checkAnswer(
-      jsonFile(tempPath.toString),
+      read.json(tempPath.toString),
       df.collect())
 
     // Try to specify the schema.
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index b6be09e2f8837..a0075f1e44ca8 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -688,11 +688,11 @@ class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {
 
     val df = Seq(1,2,3).map(i => (i, i.toString)).toDF("int", "str")
     val df2 = df.as('x).join(df.as('y), $"x.str" === $"y.str").groupBy("y.str").max("y.int")
-    intercept[Throwable](df2.saveAsParquetFile(filePath))
+    intercept[Throwable](df2.write.parquet(filePath))
 
     val df3 = df2.toDF("str", "max_int")
-    df3.saveAsParquetFile(filePath2)
-    val df4 = parquetFile(filePath2)
+    df3.write.parquet(filePath2)
+    val df4 = read.parquet(filePath2)
     checkAnswer(df4, Row("1", 1) :: Row("2", 2) :: Row("3", 3) :: Nil)
     assert(df4.columns === Array("str", "max_int"))
   }
@@ -731,14 +731,14 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetData(i, s"part-$p"))
         .toDF()
-        .saveAsParquetFile(partDir.getCanonicalPath)
+        .write.parquet(partDir.getCanonicalPath)
     }
 
     sparkContext
       .makeRDD(1 to 10)
       .map(i => ParquetData(i, s"part-1"))
       .toDF()
-      .saveAsParquetFile(new File(normalTableDir, "normal").getCanonicalPath)
+      .write.parquet(new File(normalTableDir, "normal").getCanonicalPath)
 
     partitionedTableDirWithKey = Utils.createTempDir()
 
@@ -747,7 +747,7 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetDataWithKey(p, i, s"part-$p"))
         .toDF()
-        .saveAsParquetFile(partDir.getCanonicalPath)
+        .write.parquet(partDir.getCanonicalPath)
     }
 
     partitionedTableDirWithKeyAndComplexTypes = Utils.createTempDir()
@@ -757,7 +757,7 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithKeyAndComplexTypes(
           p, i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
-      }.toDF().saveAsParquetFile(partDir.getCanonicalPath)
+      }.toDF().write.parquet(partDir.getCanonicalPath)
     }
 
     partitionedTableDirWithComplexTypes = Utils.createTempDir()
@@ -766,7 +766,7 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll
       val partDir = new File(partitionedTableDirWithComplexTypes, s"p=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithComplexTypes(i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
-      }.toDF().saveAsParquetFile(partDir.getCanonicalPath)
+      }.toDF().write.parquet(partDir.getCanonicalPath)
     }
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index cf6afd25ae5a0..f44b3c521e647 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -92,44 +92,27 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
 
   test("save()/load() - non-partitioned table - Overwrite") {
     withTempPath { file =>
-      testDF.save(
-        path = file.getCanonicalPath,
-        source = dataSourceName,
-        mode = SaveMode.Overwrite)
-
-      testDF.save(
-        path = file.getCanonicalPath,
-        source = dataSourceName,
-        mode = SaveMode.Overwrite)
+      testDF.write.mode(SaveMode.Overwrite).format(dataSourceName).save(file.getCanonicalPath)
+      testDF.write.mode(SaveMode.Overwrite).format(dataSourceName).save(file.getCanonicalPath)
 
       checkAnswer(
-        load(
-          source = dataSourceName,
-          options = Map(
-            "path" -> file.getCanonicalPath,
-            "dataSchema" -> dataSchema.json)),
+        read.format(dataSourceName)
+          .option("path", file.getCanonicalPath)
+          .option("dataSchema", dataSchema.json)
+          .load(),
         testDF.collect())
     }
   }
 
   test("save()/load() - non-partitioned table - Append") {
     withTempPath { file =>
-      testDF.save(
-        path = file.getCanonicalPath,
-        source = dataSourceName,
-        mode = SaveMode.Overwrite)
-
-      testDF.save(
-        path = file.getCanonicalPath,
-        source = dataSourceName,
-        mode = SaveMode.Append)
+      testDF.write.mode(SaveMode.Overwrite).format(dataSourceName).save(file.getCanonicalPath)
+      testDF.write.mode(SaveMode.Append).format(dataSourceName).save(file.getCanonicalPath)
 
       checkAnswer(
-        load(
-          source = dataSourceName,
-          options = Map(
-            "path" -> file.getCanonicalPath,
-            "dataSchema" -> dataSchema.json)).orderBy("a"),
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchema.json)
+          .load(file.getCanonicalPath).orderBy("a"),
         testDF.unionAll(testDF).orderBy("a").collect())
     }
   }
@@ -147,10 +130,7 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
 
   test("save()/load() - non-partitioned table - Ignore") {
     withTempDir { file =>
-      testDF.save(
-        path = file.getCanonicalPath,
-        source = dataSourceName,
-        mode = SaveMode.Ignore)
+      testDF.write.mode(SaveMode.Ignore).format(dataSourceName).save(file.getCanonicalPath)
 
       val path = new Path(file.getCanonicalPath)
       val fs = path.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
@@ -160,89 +140,81 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
 
   test("save()/load() - partitioned table - simple queries") {
     withTempPath { file =>
-      partitionedTestDF.save(
-        source = dataSourceName,
-        mode = SaveMode.ErrorIfExists,
-        options = Map("path" -> file.getCanonicalPath),
-        partitionColumns = Seq("p1", "p2"))
+      partitionedTestDF.write
+        .format(dataSourceName)
+        .mode(SaveMode.ErrorIfExists)
+        .partitionBy("p1", "p2")
+        .save(file.getCanonicalPath)
 
       checkQueries(
-        load(
-          source = dataSourceName,
-          options = Map(
-            "path" -> file.getCanonicalPath,
-            "dataSchema" -> dataSchema.json)))
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchema.json)
+          .load(file.getCanonicalPath))
     }
   }
 
   test("save()/load() - partitioned table - Overwrite") {
     withTempPath { file =>
-      partitionedTestDF.save(
-        source = dataSourceName,
-        mode = SaveMode.Overwrite,
-        options = Map("path" -> file.getCanonicalPath),
-        partitionColumns = Seq("p1", "p2"))
-
-      partitionedTestDF.save(
-        source = dataSourceName,
-        mode = SaveMode.Overwrite,
-        options = Map("path" -> file.getCanonicalPath),
-        partitionColumns = Seq("p1", "p2"))
+      partitionedTestDF.write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .partitionBy("p1", "p2")
+        .save(file.getCanonicalPath)
+
+      partitionedTestDF.write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .partitionBy("p1", "p2")
+        .save(file.getCanonicalPath)
 
       checkAnswer(
-        load(
-          source = dataSourceName,
-          options = Map(
-            "path" -> file.getCanonicalPath,
-            "dataSchema" -> dataSchema.json)),
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchema.json)
+          .load(file.getCanonicalPath),
         partitionedTestDF.collect())
     }
   }
 
   test("save()/load() - partitioned table - Append") {
     withTempPath { file =>
-      partitionedTestDF.save(
-        source = dataSourceName,
-        mode = SaveMode.Overwrite,
-        options = Map("path" -> file.getCanonicalPath),
-        partitionColumns = Seq("p1", "p2"))
-
-      partitionedTestDF.save(
-        source = dataSourceName,
-        mode = SaveMode.Append,
-        options = Map("path" -> file.getCanonicalPath),
-        partitionColumns = Seq("p1", "p2"))
+      partitionedTestDF.write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .partitionBy("p1", "p2")
+        .save(file.getCanonicalPath)
+
+      partitionedTestDF.write
+        .format(dataSourceName)
+        .mode(SaveMode.Append)
+        .partitionBy("p1", "p2")
+        .save(file.getCanonicalPath)
 
       checkAnswer(
-        load(
-          source = dataSourceName,
-          options = Map(
-            "path" -> file.getCanonicalPath,
-            "dataSchema" -> dataSchema.json)),
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchema.json)
+          .load(file.getCanonicalPath),
         partitionedTestDF.unionAll(partitionedTestDF).collect())
     }
   }
 
   test("save()/load() - partitioned table - Append - new partition values") {
     withTempPath { file =>
-      partitionedTestDF1.save(
-        source = dataSourceName,
-        mode = SaveMode.Overwrite,
-        options = Map("path" -> file.getCanonicalPath),
-        partitionColumns = Seq("p1", "p2"))
-
-      partitionedTestDF2.save(
-        source = dataSourceName,
-        mode = SaveMode.Append,
-        options = Map("path" -> file.getCanonicalPath),
-        partitionColumns = Seq("p1", "p2"))
+      partitionedTestDF1.write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .partitionBy("p1", "p2")
+        .save(file.getCanonicalPath)
+
+      partitionedTestDF2.write
+        .format(dataSourceName)
+        .mode(SaveMode.Append)
+        .partitionBy("p1", "p2")
+        .save(file.getCanonicalPath)
 
       checkAnswer(
-        load(
-          source = dataSourceName,
-          options = Map(
-            "path" -> file.getCanonicalPath,
-            "dataSchema" -> dataSchema.json)),
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchema.json)
+          .load(file.getCanonicalPath),
         partitionedTestDF.collect())
     }
   }
@@ -250,11 +222,11 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
   test("save()/load() - partitioned table - ErrorIfExists") {
     withTempDir { file =>
       intercept[RuntimeException] {
-        partitionedTestDF.save(
-          source = dataSourceName,
-          mode = SaveMode.ErrorIfExists,
-          options = Map("path" -> file.getCanonicalPath),
-          partitionColumns = Seq("p1", "p2"))
+        partitionedTestDF.write
+          .format(dataSourceName)
+          .mode(SaveMode.ErrorIfExists)
+          .partitionBy("p1", "p2")
+          .save(file.getCanonicalPath)
       }
     }
   }
@@ -343,19 +315,19 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
   }
 
   test("saveAsTable()/load() - partitioned table - Overwrite") {
-    partitionedTestDF.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Overwrite,
-      options = Map("dataSchema" -> dataSchema.json),
-      partitionColumns = Seq("p1", "p2"))
-
-    partitionedTestDF.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Overwrite,
-      options = Map("dataSchema" -> dataSchema.json),
-      partitionColumns = Seq("p1", "p2"))
+    partitionedTestDF.write
+      .format(dataSourceName)
+      .mode(SaveMode.Overwrite)
+      .option("dataSchema", dataSchema.json)
+      .partitionBy("p1", "p2")
+      .saveAsTable("t")
+
+    partitionedTestDF.write
+      .format(dataSourceName)
+      .mode(SaveMode.Overwrite)
+      .option("dataSchema", dataSchema.json)
+      .partitionBy("p1", "p2")
+      .saveAsTable("t")
 
     withTable("t") {
       checkAnswer(table("t"), partitionedTestDF.collect())
@@ -363,19 +335,19 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
   }
 
   test("saveAsTable()/load() - partitioned table - Append") {
-    partitionedTestDF.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Overwrite,
-      options = Map("dataSchema" -> dataSchema.json),
-      partitionColumns = Seq("p1", "p2"))
-
-    partitionedTestDF.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Append,
-      options = Map("dataSchema" -> dataSchema.json),
-      partitionColumns = Seq("p1", "p2"))
+    partitionedTestDF.write
+      .format(dataSourceName)
+      .mode(SaveMode.Overwrite)
+      .option("dataSchema", dataSchema.json)
+      .partitionBy("p1", "p2")
+      .saveAsTable("t")
+
+    partitionedTestDF.write
+      .format(dataSourceName)
+      .mode(SaveMode.Append)
+      .option("dataSchema", dataSchema.json)
+      .partitionBy("p1", "p2")
+      .saveAsTable("t")
 
     withTable("t") {
       checkAnswer(table("t"), partitionedTestDF.unionAll(partitionedTestDF).collect())
@@ -383,19 +355,19 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
   }
 
   test("saveAsTable()/load() - partitioned table - Append - new partition values") {
-    partitionedTestDF1.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Overwrite,
-      options = Map("dataSchema" -> dataSchema.json),
-      partitionColumns = Seq("p1", "p2"))
-
-    partitionedTestDF2.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Append,
-      options = Map("dataSchema" -> dataSchema.json),
-      partitionColumns = Seq("p1", "p2"))
+    partitionedTestDF1.write
+      .format(dataSourceName)
+      .mode(SaveMode.Overwrite)
+      .option("dataSchema", dataSchema.json)
+      .partitionBy("p1", "p2")
+      .saveAsTable("t")
+
+    partitionedTestDF2.write
+      .format(dataSourceName)
+      .mode(SaveMode.Append)
+      .option("dataSchema", dataSchema.json)
+      .partitionBy("p1", "p2")
+      .saveAsTable("t")
 
     withTable("t") {
       checkAnswer(table("t"), partitionedTestDF.collect())
@@ -403,31 +375,31 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
   }
 
   test("saveAsTable()/load() - partitioned table - Append - mismatched partition columns") {
-    partitionedTestDF1.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Overwrite,
-      options = Map("dataSchema" -> dataSchema.json),
-      partitionColumns = Seq("p1", "p2"))
+    partitionedTestDF1.write
+      .format(dataSourceName)
+      .mode(SaveMode.Overwrite)
+      .option("dataSchema", dataSchema.json)
+      .partitionBy("p1", "p2")
+      .saveAsTable("t")
 
     // Using only a subset of all partition columns
     intercept[Throwable] {
-      partitionedTestDF2.saveAsTable(
-        tableName = "t",
-        source = dataSourceName,
-        mode = SaveMode.Append,
-        options = Map("dataSchema" -> dataSchema.json),
-        partitionColumns = Seq("p1"))
+      partitionedTestDF2.write
+        .format(dataSourceName)
+        .mode(SaveMode.Append)
+        .option("dataSchema", dataSchema.json)
+        .partitionBy("p1")
+        .saveAsTable("t")
     }
 
     // Using different order of partition columns
     intercept[Throwable] {
-      partitionedTestDF2.saveAsTable(
-        tableName = "t",
-        source = dataSourceName,
-        mode = SaveMode.Append,
-        options = Map("dataSchema" -> dataSchema.json),
-        partitionColumns = Seq("p2", "p1"))
+      partitionedTestDF2.write
+        .format(dataSourceName)
+        .mode(SaveMode.Append)
+        .option("dataSchema", dataSchema.json)
+        .partitionBy("p2", "p1")
+        .saveAsTable("t")
     }
   }
 
@@ -436,12 +408,12 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
 
     withTempTable("t") {
       intercept[AnalysisException] {
-        partitionedTestDF.saveAsTable(
-          tableName = "t",
-          source = dataSourceName,
-          mode = SaveMode.ErrorIfExists,
-          options = Map("dataSchema" -> dataSchema.json),
-          partitionColumns = Seq("p1", "p2"))
+        partitionedTestDF.write
+          .format(dataSourceName)
+          .mode(SaveMode.ErrorIfExists)
+          .option("dataSchema", dataSchema.json)
+          .partitionBy("p1", "p2")
+          .saveAsTable("t")
       }
     }
   }
@@ -450,12 +422,12 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
     Seq.empty[(Int, String)].toDF().registerTempTable("t")
 
     withTempTable("t") {
-      partitionedTestDF.saveAsTable(
-        tableName = "t",
-        source = dataSourceName,
-        mode = SaveMode.Ignore,
-        options = Map("dataSchema" -> dataSchema.json),
-        partitionColumns = Seq("p1", "p2"))
+      partitionedTestDF.write
+        .format(dataSourceName)
+        .mode(SaveMode.Ignore)
+        .option("dataSchema", dataSchema.json)
+        .partitionBy("p1", "p2")
+        .saveAsTable("t")
 
       assert(table("t").collect().isEmpty)
     }
@@ -463,17 +435,16 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
 
   test("Hadoop style globbing") {
     withTempPath { file =>
-      partitionedTestDF.save(
-        source = dataSourceName,
-        mode = SaveMode.Overwrite,
-        options = Map("path" -> file.getCanonicalPath),
-        partitionColumns = Seq("p1", "p2"))
-
-      val df = load(
-        source = dataSourceName,
-        options = Map(
-          "path" -> s"${file.getCanonicalPath}/p1=*/p2=???",
-          "dataSchema" -> dataSchema.json))
+      partitionedTestDF.write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .partitionBy("p1", "p2")
+        .save(file.getCanonicalPath)
+
+      val df = read
+        .format(dataSourceName)
+        .option("dataSchema", dataSchema.json)
+        .load(s"${file.getCanonicalPath}/p1=*/p2=???")
 
       val expectedPaths = Set(
         s"${file.getCanonicalFile}/p1=1/p2=foo",

From d41ae4344c07064de03a120804830886e1614d92 Mon Sep 17 00:00:00 2001
From: FavioVazquez <favio.vazquezp@gmail.com>
Date: Sat, 16 May 2015 08:07:03 +0100
Subject: [PATCH 019/525] [SPARK-7671] Fix wrong URLs in MLlib Data Types
 Documentation

There is a mistake in the URL of Matrices in the MLlib Data Types documentation (Local matrix scala section), the URL points to https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.linalg.Matrices which is a mistake, since Matrices is an object that implements factory methods for Matrix that does not have a companion class. The correct link should point to https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.linalg.Matrices$

There is another mistake, in the Local Vector section in Scala, Java and Python

In the Scala section the URL of Vectors points to the trait Vector (https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.linalg.Vector) and not to the factory methods implemented in Vectors.

The correct link should be: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$

In the Java section the URL of Vectors points to the Interface Vector (https://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/Vector.html) and not to the Class Vectors

The correct link should be:
https://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/Vectors.html

In the Python section the URL of Vectors points to the class Vector (https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vector) and not the Class Vectors

The correct link should be:
https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vectors

Author: FavioVazquez <favio.vazquezp@gmail.com>

Closes #6196 from FavioVazquez/fix-typo-matrices-mllib-datatypes and squashes the following commits:

3e9efd5 [FavioVazquez] - Fixed wrong URLs in the MLlib Data Types Documentation
9af7074 [FavioVazquez] Merge remote-tracking branch 'upstream/master'
edab1ef [FavioVazquez] Merge remote-tracking branch 'upstream/master'
b2e2f8c [FavioVazquez] Merge remote-tracking branch 'upstream/master'
---
 docs/mllib-data-types.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md
index 4f2a2f71048f7..acec0426dc69b 100644
--- a/docs/mllib-data-types.md
+++ b/docs/mllib-data-types.md
@@ -31,7 +31,7 @@ The base class of local vectors is
 implementations: [`DenseVector`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseVector) and
 [`SparseVector`](api/scala/index.html#org.apache.spark.mllib.linalg.SparseVector).  We recommend
 using the factory methods implemented in
-[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) to create local vectors.
+[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) to create local vectors.
 
 {% highlight scala %}
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
@@ -57,7 +57,7 @@ The base class of local vectors is
 implementations: [`DenseVector`](api/java/org/apache/spark/mllib/linalg/DenseVector.html) and
 [`SparseVector`](api/java/org/apache/spark/mllib/linalg/SparseVector.html).  We recommend
 using the factory methods implemented in
-[`Vectors`](api/java/org/apache/spark/mllib/linalg/Vector.html) to create local vectors.
+[`Vectors`](api/java/org/apache/spark/mllib/linalg/Vectors.html) to create local vectors.
 
 {% highlight java %}
 import org.apache.spark.mllib.linalg.Vector;
@@ -84,7 +84,7 @@ and the following as sparse vectors:
   with a single column
 
 We recommend using NumPy arrays over lists for efficiency, and using the factory methods implemented
-in [`Vectors`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vector) to create sparse vectors.
+in [`Vectors`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vectors) to create sparse vectors.
 
 {% highlight python %}
 import numpy as np
@@ -241,7 +241,7 @@ The base class of local matrices is
 [`Matrix`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrix), and we provide one
 implementation: [`DenseMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseMatrix).
 We recommend using the factory methods implemented
-in [`Matrices`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrices) to create local
+in [`Matrices`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrices$) to create local
 matrices.
 
 {% highlight scala %}

From 1fd33815f47478f5f2e8b55b90757819b8cb5247 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Sat, 16 May 2015 08:18:41 +0100
Subject: [PATCH 020/525] [SPARK-4556] [BUILD] binary distribution assembly
 can't run in local mode

Add note on building a runnable distribution with make-distribution.sh

Author: Sean Owen <sowen@cloudera.com>

Closes #6186 from srowen/SPARK-4556 and squashes the following commits:

4002966 [Sean Owen] Add pointer to --help flag
9fa7883 [Sean Owen] Add note on building a runnable distribution with make-distribution.sh
---
 docs/building-spark.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/building-spark.md b/docs/building-spark.md
index 6e310ff424784..4dbccb9e6e46c 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -34,6 +34,16 @@ and in `project/SparkBuild.scala` add:
 
 to the `sharedSettings` val. See also [this PR](https://github.com/apache/spark/pull/2883/files) if you are unsure of where to add these lines.
 
+# Building a Runnable Distribution
+
+To create a Spark distribution like those distributed by the 
+[Spark Downloads](http://spark.apache.org/downloads.html) page, and that is laid out so as 
+to be runnable, use `make-distribution.sh` in the project root directory. It can be configured 
+with Maven profile settings and so on like the direct Maven build. Example:
+
+    ./make-distribution.sh --name custom-spark --tgz -Phadoop-2.4 -Pyarn
+    
+For more information on usage, run `./make-distribution.sh --help`
 
 # Setting up Maven's Memory Usage
 

From 0ac8b01a07840f199bbc79fb845762284aead6de Mon Sep 17 00:00:00 2001
From: Nishkam Ravi <nravi@cloudera.com>
Date: Sat, 16 May 2015 08:24:21 +0100
Subject: [PATCH 021/525] [SPARK-7672] [CORE] Use int conversion in translating
 kryoserializer.buffer.mb to kryoserializer.buffer

In translating spark.kryoserializer.buffer.mb to spark.kryoserializer.buffer, use of toDouble will lead to "Fractional values not supported" error even when spark.kryoserializer.buffer.mb is an integer.
ilganeli, andrewor14

Author: Nishkam Ravi <nravi@cloudera.com>
Author: nishkamravi2 <nishkamravi@gmail.com>
Author: nravi <nravi@c1704.halxg.cloudera.com>

Closes #6198 from nishkamravi2/master_nravi and squashes the following commits:

171a53c [nishkamravi2] Update SparkConfSuite.scala
5261bf6 [Nishkam Ravi] Add a test for deprecated config spark.kryoserializer.buffer.mb
5190f79 [Nishkam Ravi] In translating from deprecated spark.kryoserializer.buffer.mb to spark.kryoserializer.buffer use int conversion since fractions are not permissible
059ce82 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
eaa13b5 [nishkamravi2] Update Client.scala
981afd2 [Nishkam Ravi] Check for read permission before initiating copy
1b81383 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
0f1abd0 [nishkamravi2] Update Utils.scala
474e3bf [nishkamravi2] Update DiskBlockManager.scala
97c383e [nishkamravi2] Update Utils.scala
8691e0c [Nishkam Ravi] Add a try/catch block around Utils.removeShutdownHook
2be1e76 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
1c13b79 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
bad4349 [nishkamravi2] Update Main.java
36a6f87 [Nishkam Ravi] Minor changes and bug fixes
b7f4ae7 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
4a45d6a [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
458af39 [Nishkam Ravi] Locate the jar using getLocation, obviates the need to pass assembly path as an argument
d9658d6 [Nishkam Ravi] Changes for SPARK-6406
ccdc334 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
3faa7a4 [Nishkam Ravi] Launcher library changes (SPARK-6406)
345206a [Nishkam Ravi] spark-class merge Merge branch 'master_nravi' of https://github.com/nishkamravi2/spark into master_nravi
ac58975 [Nishkam Ravi] spark-class changes
06bfeb0 [nishkamravi2] Update spark-class
35af990 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
32c3ab3 [nishkamravi2] Update AbstractCommandBuilder.java
4bd4489 [nishkamravi2] Update AbstractCommandBuilder.java
746f35b [Nishkam Ravi] "hadoop" string in the assembly name should not be mandatory (everywhere else in spark we mandate spark-assembly*hadoop*.jar)
bfe96e0 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
ee902fa [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
d453197 [nishkamravi2] Update NewHadoopRDD.scala
6f41a1d [nishkamravi2] Update NewHadoopRDD.scala
0ce2c32 [nishkamravi2] Update HadoopRDD.scala
f7e33c2 [Nishkam Ravi] Merge branch 'master_nravi' of https://github.com/nishkamravi2/spark into master_nravi
ba1eb8b [Nishkam Ravi] Try-catch block around the two occurrences of removeShutDownHook. Deletion of semi-redundant occurrences of expensive operation inShutDown.
71d0e17 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
494d8c0 [nishkamravi2] Update DiskBlockManager.scala
3c5ddba [nishkamravi2] Update DiskBlockManager.scala
f0d12de [Nishkam Ravi] Workaround for IllegalStateException caused by recent changes to BlockManager.stop
79ea8b4 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
b446edc [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
5c9a4cb [nishkamravi2] Update TaskSetManagerSuite.scala
535295a [nishkamravi2] Update TaskSetManager.scala
3e1b616 [Nishkam Ravi] Modify test for maxResultSize
9f6583e [Nishkam Ravi] Changes to maxResultSize code (improve error message and add condition to check if maxResultSize > 0)
5f8f9ed [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
636a9ff [nishkamravi2] Update YarnAllocator.scala
8f76c8b [Nishkam Ravi] Doc change for yarn memory overhead
35daa64 [Nishkam Ravi] Slight change in the doc for yarn memory overhead
5ac2ec1 [Nishkam Ravi] Remove out
dac1047 [Nishkam Ravi] Additional documentation for yarn memory overhead issue
42c2c3d [Nishkam Ravi] Additional changes for yarn memory overhead issue
362da5e [Nishkam Ravi] Additional changes for yarn memory overhead
c726bd9 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
f00fa31 [Nishkam Ravi] Improving logging for AM memoryOverhead
1cf2d1e [nishkamravi2] Update YarnAllocator.scala
ebcde10 [Nishkam Ravi] Modify default YARN memory_overhead-- from an additive constant to a multiplier (redone to resolve merge conflicts)
2e69f11 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
efd688a [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark
2b630f9 [nravi] Accept memory input as "30g", "512M" instead of an int value, to be consistent with rest of Spark
3bf8fad [nravi] Merge branch 'master' of https://github.com/apache/spark
5423a03 [nravi] Merge branch 'master' of https://github.com/apache/spark
eb663ca [nravi] Merge branch 'master' of https://github.com/apache/spark
df2aeb1 [nravi] Improved fix for ConcurrentModificationIssue (Spark-1097, Hadoop-10456)
6b840f0 [nravi] Undo the fix for SPARK-1758 (the problem is fixed)
5108700 [nravi] Fix in Spark for the Concurrent thread modification issue (SPARK-1097, HADOOP-10456)
681b36f [nravi] Fix for SPARK-1758: failing test org.apache.spark.JavaAPISuite.wholeTextFiles
---
 core/src/main/scala/org/apache/spark/SparkConf.scala      | 2 +-
 core/src/test/scala/org/apache/spark/SparkConfSuite.scala | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index a8fc90ad2050e..b5e5d6f1465f3 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -509,7 +509,7 @@ private[spark] object SparkConf extends Logging {
       AlternateConfig("spark.reducer.maxMbInFlight", "1.4")),
     "spark.kryoserializer.buffer" ->
         Seq(AlternateConfig("spark.kryoserializer.buffer.mb", "1.4", 
-          translation = s => s"${s.toDouble * 1000}k")),
+          translation = s => s"${(s.toDouble * 1000).toInt}k")),
     "spark.kryoserializer.buffer.max" -> Seq(
       AlternateConfig("spark.kryoserializer.buffer.max.mb", "1.4")),
     "spark.shuffle.file.buffer" -> Seq(
diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
index 68d08e32f9aa4..fafa4ed606b08 100644
--- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
@@ -241,6 +241,9 @@ class SparkConfSuite extends FunSuite with LocalSparkContext with ResetSystemPro
 
     conf.set("spark.yarn.applicationMaster.waitTries", "42")
     assert(conf.getTimeAsSeconds("spark.yarn.am.waitTime") === 420)
+
+    conf.set("spark.kryoserializer.buffer.mb", "1.1")
+    assert(conf.getSizeAsKb("spark.kryoserializer.buffer") === 1100)
   }
 
   test("akka deprecated configs") {

From 47e7ffe36b8a8a246fe9af522aff480d19c0c8a6 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Sat, 16 May 2015 00:44:29 -0700
Subject: [PATCH 022/525] [SPARK-7655][Core][SQL] Remove
 'scala.concurrent.ExecutionContext.Implicits.global' in 'ask' and
 'BroadcastHashJoin'

Because both `AkkaRpcEndpointRef.ask` and `BroadcastHashJoin` uses `scala.concurrent.ExecutionContext.Implicits.global`. However, because the tasks in `BroadcastHashJoin` are usually long-running tasks, which will occupy all threads in `global`. Then `ask` cannot get a chance to process the replies.

For `ask`, actually the tasks are very simple, so we can use `MoreExecutors.sameThreadExecutor()`. For `BroadcastHashJoin`, it's better to use `ThreadUtils.newDaemonCachedThreadPool`.

Author: zsxwing <zsxwing@gmail.com>

Closes #6200 from zsxwing/SPARK-7655-2 and squashes the following commits:

cfdc605 [zsxwing] Remove redundant imort and minor doc fix
cf83153 [zsxwing] Add "sameThread" and "newDaemonCachedThreadPool with maxThreadNumber" to ThreadUtils
08ad0ee [zsxwing] Remove 'scala.concurrent.ExecutionContext.Implicits.global' in 'ask' and 'BroadcastHashJoin'
---
 .../apache/spark/rpc/akka/AkkaRpcEnv.scala    |  8 ++++---
 .../org/apache/spark/util/ThreadUtils.scala   | 24 ++++++++++++++++++-
 .../apache/spark/util/ThreadUtilsSuite.scala  | 12 ++++++++++
 .../execution/joins/BroadcastHashJoin.scala   | 10 ++++++--
 4 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala
index ba0d468f111ef..0161962cde073 100644
--- a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala
+++ b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala
@@ -29,9 +29,11 @@ import akka.actor.{ActorSystem, ExtendedActorSystem, Actor, ActorRef, Props, Add
 import akka.event.Logging.Error
 import akka.pattern.{ask => akkaAsk}
 import akka.remote.{AssociationEvent, AssociatedEvent, DisassociatedEvent, AssociationErrorEvent}
+import com.google.common.util.concurrent.MoreExecutors
+
 import org.apache.spark.{SparkException, Logging, SparkConf}
 import org.apache.spark.rpc._
-import org.apache.spark.util.{ActorLogReceive, AkkaUtils}
+import org.apache.spark.util.{ActorLogReceive, AkkaUtils, ThreadUtils}
 
 /**
  * A RpcEnv implementation based on Akka.
@@ -294,8 +296,8 @@ private[akka] class AkkaRpcEndpointRef(
   }
 
   override def ask[T: ClassTag](message: Any, timeout: FiniteDuration): Future[T] = {
-    import scala.concurrent.ExecutionContext.Implicits.global
     actorRef.ask(AkkaMessage(message, true))(timeout).flatMap {
+      // The function will run in the calling thread, so it should be short and never block.
       case msg @ AkkaMessage(message, reply) =>
         if (reply) {
           logError(s"Receive $msg but the sender cannot reply")
@@ -305,7 +307,7 @@ private[akka] class AkkaRpcEndpointRef(
         }
       case AkkaFailure(e) =>
         Future.failed(e)
-    }.mapTo[T]
+    }(ThreadUtils.sameThread).mapTo[T]
   }
 
   override def toString: String = s"${getClass.getSimpleName}($actorRef)"
diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
index 098a4b79496b2..ca5624a3d8b3d 100644
--- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
@@ -20,10 +20,22 @@ package org.apache.spark.util
 
 import java.util.concurrent._
 
-import com.google.common.util.concurrent.ThreadFactoryBuilder
+import scala.concurrent.{ExecutionContext, ExecutionContextExecutor}
+
+import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder}
 
 private[spark] object ThreadUtils {
 
+  private val sameThreadExecutionContext =
+    ExecutionContext.fromExecutorService(MoreExecutors.sameThreadExecutor())
+
+  /**
+   * An `ExecutionContextExecutor` that runs each task in the thread that invokes `execute/submit`.
+   * The caller should make sure the tasks running in this `ExecutionContextExecutor` are short and
+   * never block.
+   */
+  def sameThread: ExecutionContextExecutor = sameThreadExecutionContext
+
   /**
    * Create a thread factory that names threads with a prefix and also sets the threads to daemon.
    */
@@ -40,6 +52,16 @@ private[spark] object ThreadUtils {
     Executors.newCachedThreadPool(threadFactory).asInstanceOf[ThreadPoolExecutor]
   }
 
+  /**
+   * Create a cached thread pool whose max number of threads is `maxThreadNumber`. Thread names
+   * are formatted as prefix-ID, where ID is a unique, sequentially assigned integer.
+   */
+  def newDaemonCachedThreadPool(prefix: String, maxThreadNumber: Int): ThreadPoolExecutor = {
+    val threadFactory = namedThreadFactory(prefix)
+    new ThreadPoolExecutor(
+      0, maxThreadNumber, 60L, TimeUnit.SECONDS, new SynchronousQueue[Runnable], threadFactory)
+  }
+
   /**
    * Wrapper over newFixedThreadPool. Thread names are formatted as prefix-ID, where ID is a
    * unique, sequentially assigned integer.
diff --git a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala
index a3aa3e953fbec..751d3df9cc8f7 100644
--- a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala
@@ -20,6 +20,9 @@ package org.apache.spark.util
 
 import java.util.concurrent.{CountDownLatch, TimeUnit}
 
+import scala.concurrent.{Await, Future}
+import scala.concurrent.duration._
+
 import org.scalatest.FunSuite
 
 class ThreadUtilsSuite extends FunSuite {
@@ -54,4 +57,13 @@ class ThreadUtilsSuite extends FunSuite {
       executor.shutdownNow()
     }
   }
+
+  test("sameThread") {
+    val callerThreadName = Thread.currentThread().getName()
+    val f = Future {
+      Thread.currentThread().getName()
+    }(ThreadUtils.sameThread)
+    val futureThreadName = Await.result(f, 10.seconds)
+    assert(futureThreadName === callerThreadName)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
index 05dd5681edfac..fe43fc4125c8e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
@@ -18,10 +18,10 @@
 package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.util.ThreadUtils
 
 import scala.concurrent._
 import scala.concurrent.duration._
-import scala.concurrent.ExecutionContext.Implicits.global
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.expressions.{Row, Expression}
@@ -64,7 +64,7 @@ case class BroadcastHashJoin(
     val input: Array[Row] = buildPlan.execute().map(_.copy()).collect()
     val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length)
     sparkContext.broadcast(hashed)
-  }
+  }(BroadcastHashJoin.broadcastHashJoinExecutionContext)
 
   protected override def doExecute(): RDD[Row] = {
     val broadcastRelation = Await.result(broadcastFuture, timeout)
@@ -74,3 +74,9 @@ case class BroadcastHashJoin(
     }
   }
 }
+
+object BroadcastHashJoin {
+
+  private val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService(
+    ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 1024))
+}

From ce6391296a061bc352386080a2ee96bb63fcc4ac Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sat, 16 May 2015 20:55:10 +0800
Subject: [PATCH 023/525] [HOTFIX] [SQL] Fixes DataFrameWriter.mode(String)

We forgot an assignment there.

/cc rxin

Author: Cheng Lian <lian@databricks.com>

Closes #6212 from liancheng/fix-df-writer and squashes the following commits:

711fbb0 [Cheng Lian] Adds a test case
3b72d78 [Cheng Lian] Fixes DataFrameWriter.mode(String)
---
 .../main/scala/org/apache/spark/sql/DataFrameWriter.scala  | 2 +-
 .../scala/org/apache/spark/sql/sources/SaveLoadSuite.scala | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index b1fc18ac3cb54..9f42f0f1f4398 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -55,7 +55,7 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    * @since 1.4.0
    */
   def mode(saveMode: String): DataFrameWriter = {
-    saveMode.toLowerCase match {
+    this.mode = saveMode.toLowerCase match {
       case "overwrite" => SaveMode.Overwrite
       case "append" => SaveMode.Append
       case "ignore" => SaveMode.Ignore
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
index 7a28e9af3673c..274c652dd14d6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
@@ -75,6 +75,13 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll {
     checkLoad()
   }
 
+  test("save with string mode and path, and load") {
+    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
+    path.createNewFile()
+    df.write.mode("overwrite").save(path.toString)
+    checkLoad()
+  }
+
   test("save with path and datasource, and load") {
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
     df.write.json(path.toString)

From 1b4e710e5cdb00febb4c5920d81e77c2e3966a8b Mon Sep 17 00:00:00 2001
From: Matthew Brandyberry <mbrandy@us.ibm.com>
Date: Sat, 16 May 2015 18:17:48 +0100
Subject: [PATCH 024/525] [BUILD] update jblas dependency version to 1.2.4

jblas 1.2.4 includes native library support for PPC64LE.

Author: Matthew Brandyberry <mbrandy@us.ibm.com>

Closes #6199 from mtbrandy/jblas-1.2.4 and squashes the following commits:

9df9301 [Matthew Brandyberry] [BUILD] update jblas dependency version to 1.2.4
---
 LICENSE | 2 +-
 pom.xml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/LICENSE b/LICENSE
index d6b9ccf07d999..9d1b00beff748 100644
--- a/LICENSE
+++ b/LICENSE
@@ -861,7 +861,7 @@ The following components are provided under a BSD-style license. See project lin
 
      (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
      (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model)
-     (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.3 - http://jblas.org/)
+     (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.4 - http://jblas.org/)
      (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
      (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org)
      (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org)
diff --git a/pom.xml b/pom.xml
index 86aa0a9fa134c..1b45cdb67012a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -137,7 +137,7 @@
     <hive.version.short>0.13.1</hive.version.short>
     <derby.version>10.10.1.1</derby.version>
     <parquet.version>1.6.0rc3</parquet.version>
-    <jblas.version>1.2.3</jblas.version>
+    <jblas.version>1.2.4</jblas.version>
     <jetty.version>8.1.14.v20131031</jetty.version>
     <orbit.version>3.0.0.v201112011016</orbit.version>
     <chill.version>0.5.0</chill.version>

From 161d0b4a41f453b21adde46a86e16c2743752799 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 16 May 2015 15:03:57 -0700
Subject: [PATCH 025/525] [SPARK-7654][MLlib] Migrate MLlib to the DataFrame
 reader/writer API.

Author: Reynold Xin <rxin@databricks.com>

Closes #6211 from rxin/mllib-reader and squashes the following commits:

79a2cb9 [Reynold Xin] [SPARK-7654][MLlib] Migrate MLlib to the DataFrame reader/writer API.
---
 .../org/apache/spark/examples/mllib/DatasetExample.scala      | 2 +-
 .../scala/org/apache/spark/examples/sql/RDDRelation.scala     | 2 +-
 .../org/apache/spark/mllib/classification/NaiveBayes.scala    | 4 ++--
 .../mllib/classification/impl/GLMClassificationModel.scala    | 2 +-
 .../apache/spark/mllib/clustering/GaussianMixtureModel.scala  | 2 +-
 .../scala/org/apache/spark/mllib/clustering/KMeansModel.scala | 2 +-
 .../spark/mllib/clustering/PowerIterationClustering.scala     | 4 ++--
 .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala  | 2 +-
 .../spark/mllib/recommendation/MatrixFactorizationModel.scala | 4 ++--
 .../apache/spark/mllib/regression/IsotonicRegression.scala    | 2 +-
 .../spark/mllib/regression/impl/GLMRegressionModel.scala      | 2 +-
 .../org/apache/spark/mllib/tree/model/DecisionTreeModel.scala | 2 +-
 .../apache/spark/mllib/tree/model/treeEnsembleModels.scala    | 2 +-
 13 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
index c95cca7d656e8..520893b26d595 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
@@ -103,7 +103,7 @@ object DatasetExample {
     tmpDir.deleteOnExit()
     val outputDir = new File(tmpDir, "dataset").toString
     println(s"Saving to $outputDir as Parquet file.")
-    df.saveAsParquetFile(outputDir)
+    df.write.parquet(outputDir)
 
     println(s"Loading Parquet file with UDT from $outputDir.")
     val newDataset = sqlContext.read.parquet(outputDir)
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
index acc89199d5849..b11e32047dc34 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
@@ -58,7 +58,7 @@ object RDDRelation {
     df.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println)
 
     // Write out an RDD as a parquet file.
-    df.saveAsParquetFile("pair.parquet")
+    df.write.parquet("pair.parquet")
 
     // Read in parquet file.  Parquet files are self-describing so the schmema is preserved.
     val parquetFile = sqlContext.read.parquet("pair.parquet")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index af24ab616663b..ac0ebeceaa1df 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -140,7 +140,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
 
       // Create Parquet data.
       val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF()
-      dataRDD.saveAsParquetFile(dataPath(path))
+      dataRDD.write.parquet(dataPath(path))
     }
 
     def load(sc: SparkContext, path: String): NaiveBayesModel = {
@@ -186,7 +186,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
 
       // Create Parquet data.
       val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF()
-      dataRDD.saveAsParquetFile(dataPath(path))
+      dataRDD.write.parquet(dataPath(path))
     }
 
     def load(sc: SparkContext, path: String): NaiveBayesModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala
index 3b6790cce47c6..d842ec57b2f52 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala
@@ -62,7 +62,7 @@ private[classification] object GLMClassificationModel {
 
       // Create Parquet data.
       val data = Data(weights, intercept, threshold)
-      sc.parallelize(Seq(data), 1).toDF().saveAsParquetFile(Loader.dataPath(path))
+      sc.parallelize(Seq(data), 1).toDF().write.parquet(Loader.dataPath(path))
     }
 
     /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index c22862c130e77..731b43a1be574 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -126,7 +126,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
       val dataArray = Array.tabulate(weights.length) { i =>
         Data(weights(i), gaussians(i).mu, gaussians(i).sigma)
       }
-      sc.parallelize(dataArray, 1).toDF().saveAsParquetFile(Loader.dataPath(path))
+      sc.parallelize(dataArray, 1).toDF().write.parquet(Loader.dataPath(path))
     }
 
     def load(sc: SparkContext, path: String): GaussianMixtureModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index ba228b11fcec3..252e166e85cef 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -110,7 +110,7 @@ object KMeansModel extends Loader[KMeansModel] {
       val dataRDD = sc.parallelize(model.clusterCenters.zipWithIndex).map { case (point, id) =>
         Cluster(id, point)
       }.toDF()
-      dataRDD.saveAsParquetFile(Loader.dataPath(path))
+      dataRDD.write.parquet(Loader.dataPath(path))
     }
 
     def load(sc: SparkContext, path: String): KMeansModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index aa53e88d59856..1ed01c9d8ba0b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -74,7 +74,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
 
       val dataRDD = model.assignments.toDF()
-      dataRDD.saveAsParquetFile(Loader.dataPath(path))
+      dataRDD.write.parquet(Loader.dataPath(path))
     }
 
     def load(sc: SparkContext, path: String): PowerIterationClusteringModel = {
@@ -86,7 +86,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
       assert(formatVersion == thisFormatVersion)
 
       val k = (metadata \ "k").extract[Int]
-      val assignments = sqlContext.parquetFile(Loader.dataPath(path))
+      val assignments = sqlContext.read.parquet(Loader.dataPath(path))
       Loader.checkSchema[PowerIterationClustering.Assignment](assignments.schema)
 
       val assignmentsRDD = assignments.map {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 98e83112f52ae..731f7576c2335 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -580,7 +580,7 @@ object Word2VecModel extends Loader[Word2VecModel] {
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
 
       val dataArray = model.toSeq.map { case (w, v) => Data(w, v) }
-      sc.parallelize(dataArray.toSeq, 1).toDF().saveAsParquetFile(Loader.dataPath(path))
+      sc.parallelize(dataArray.toSeq, 1).toDF().write.parquet(Loader.dataPath(path))
     }
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
index 88c2148403313..b960fbc5bf5f5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -281,8 +281,8 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] {
       val metadata = compact(render(
         ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ ("rank" -> model.rank)))
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path))
-      model.userFeatures.toDF("id", "features").saveAsParquetFile(userPath(path))
-      model.productFeatures.toDF("id", "features").saveAsParquetFile(productPath(path))
+      model.userFeatures.toDF("id", "features").write.parquet(userPath(path))
+      model.productFeatures.toDF("id", "features").write.parquet(productPath(path))
     }
 
     def load(sc: SparkContext, path: String): MatrixFactorizationModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 4ce541ae5bed9..22b9b22a871f0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -184,7 +184,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
 
       sqlContext.createDataFrame(
         boundaries.toSeq.zip(predictions).map { case (b, p) => Data(b, p) }
-      ).saveAsParquetFile(dataPath(path))
+      ).write.parquet(dataPath(path))
     }
 
     def load(sc: SparkContext, path: String): (Array[Double], Array[Double]) = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala
index b55944f74f623..2aa0e9ef96d48 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala
@@ -60,7 +60,7 @@ private[regression] object GLMRegressionModel {
       val data = Data(weights, intercept)
       val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF()
       // TODO: repartition with 1 partition after SPARK-5532 gets fixed
-      dataRDD.saveAsParquetFile(Loader.dataPath(path))
+      dataRDD.write.parquet(Loader.dataPath(path))
     }
 
     /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
index 331af428533de..a558f84c8d506 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
@@ -223,7 +223,7 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging {
       val dataRDD: DataFrame = sc.parallelize(nodes)
         .map(NodeData.apply(0, _))
         .toDF()
-      dataRDD.saveAsParquetFile(Loader.dataPath(path))
+      dataRDD.write.parquet(Loader.dataPath(path))
     }
 
     def load(sc: SparkContext, path: String, algo: String, numNodes: Int): DecisionTreeModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
index 8341219bfa71c..f9cd0140fe63f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
@@ -414,7 +414,7 @@ private[tree] object TreeEnsembleModel extends Logging {
       val dataRDD = sc.parallelize(model.trees.zipWithIndex).flatMap { case (tree, treeId) =>
         tree.topNode.subtreeIterator.toSeq.map(node => NodeData(treeId, node))
       }.toDF()
-      dataRDD.saveAsParquetFile(Loader.dataPath(path))
+      dataRDD.write.parquet(Loader.dataPath(path))
     }
 
     /**

From 3b6ef2c5391b528ef989e24400fbb0c496c3b245 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Sat, 16 May 2015 21:03:22 -0700
Subject: [PATCH 026/525] [SPARK-7655][Core] Deserializing value should not
 hold the TaskSchedulerImpl lock

We should not call `DirectTaskResult.value` when holding the `TaskSchedulerImpl` lock. It may cost dozens of seconds to deserialize a large object.

Author: zsxwing <zsxwing@gmail.com>

Closes #6195 from zsxwing/SPARK-7655 and squashes the following commits:

21f502e [zsxwing] Add more comments
e25fa88 [zsxwing] Add comments
15010b5 [zsxwing] Deserialize value should not hold the TaskSchedulerImpl lock
---
 .../apache/spark/scheduler/TaskResult.scala   | 23 +++++++++++++++++--
 .../spark/scheduler/TaskResultGetter.scala    |  4 ++++
 .../spark/scheduler/TaskSetManager.scala      |  6 +++++
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
index 1f114a0207f7b..8b2a742b96988 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
@@ -40,6 +40,9 @@ class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long
     var metrics: TaskMetrics)
   extends TaskResult[T] with Externalizable {
 
+  private var valueObjectDeserialized = false
+  private var valueObject: T = _
+
   def this() = this(null.asInstanceOf[ByteBuffer], null, null)
 
   override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
@@ -72,10 +75,26 @@ class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long
       }
     }
     metrics = in.readObject().asInstanceOf[TaskMetrics]
+    valueObjectDeserialized = false
   }
 
+  /**
+   * When `value()` is called at the first time, it needs to deserialize `valueObject` from
+   * `valueBytes`. It may cost dozens of seconds for a large instance. So when calling `value` at
+   * the first time, the caller should avoid to block other threads.
+   *
+   * After the first time, `value()` is trivial and just returns the deserialized `valueObject`.
+   */
   def value(): T = {
-    val resultSer = SparkEnv.get.serializer.newInstance()
-    resultSer.deserialize(valueBytes)
+    if (valueObjectDeserialized) {
+      valueObject
+    } else {
+      // This should not run when holding a lock because it may cost dozens of seconds for a large
+      // value.
+      val resultSer = SparkEnv.get.serializer.newInstance()
+      valueObject = resultSer.deserialize(valueBytes)
+      valueObjectDeserialized = true
+      valueObject
+    }
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala
index 391827c1d2156..46a6f6537e2ee 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala
@@ -54,6 +54,10 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul
               if (!taskSetManager.canFetchMoreResults(serializedData.limit())) {
                 return
               }
+              // deserialize "value" without holding any lock so that it won't block other threads.
+              // We should call it here, so that when it's called again in
+              // "TaskSetManager.handleSuccessfulTask", it does not need to deserialize the value.
+              directResult.value()
               (directResult, serializedData.limit())
             case IndirectTaskResult(blockId, size) =>
               if (!taskSetManager.canFetchMoreResults(size)) {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
index 7dc325283d961..c4487d5b37247 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -620,6 +620,12 @@ private[spark] class TaskSetManager(
     val index = info.index
     info.markSuccessful()
     removeRunningTask(tid)
+    // This method is called by "TaskSchedulerImpl.handleSuccessfulTask" which holds the
+    // "TaskSchedulerImpl" lock until exiting. To avoid the SPARK-7655 issue, we should not
+    // "deserialize" the value when holding a lock to avoid blocking other threads. So we call
+    // "result.value()" in "TaskResultGetter.enqueueSuccessfulTask" before reaching here.
+    // Note: "result.value()" only deserializes the value when it's called at the first time, so
+    // here "result.value()" just returns the value and won't block other threads.
     sched.dagScheduler.taskEnded(
       tasks(index), Success, result.value(), result.accumUpdates, info, result.metrics)
     if (!successful(index)) {

From 517eb37a85e0a28820bcfd5d98c50d02df6521c6 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 16 May 2015 22:01:53 -0700
Subject: [PATCH 027/525] [SPARK-7654][SQL] Move JDBC into DataFrame's
 reader/writer interface.

Also moved all the deprecated functions into one place for SQLContext and DataFrame, and updated tests to use the new API.

Author: Reynold Xin <rxin@databricks.com>

Closes #6210 from rxin/df-writer-reader-jdbc and squashes the following commits:

7465c2c [Reynold Xin] Fixed unit test.
118e609 [Reynold Xin] Updated tests.
3441b57 [Reynold Xin] Updated javadoc.
13cdd1c [Reynold Xin] [SPARK-7654][SQL] Move JDBC into DataFrame's reader/writer interface.
---
 .../spark/examples/sql/JavaSparkSQL.java      |   4 +-
 .../org/apache/spark/sql/DataFrame.scala      | 284 +++-----
 .../apache/spark/sql/DataFrameReader.scala    |  89 ++-
 .../apache/spark/sql/DataFrameWriter.scala    |  53 +-
 .../org/apache/spark/sql/SQLContext.scala     | 682 +++++++-----------
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   |  30 +-
 .../apache/spark/sql/jdbc/JDBCRelation.scala  |  16 +-
 .../org/apache/spark/sql/jdbc/JdbcUtils.scala |  52 ++
 .../org/apache/spark/sql/jdbc/jdbc.scala      |   6 +-
 .../spark/sql/JavaApplySchemaSuite.java       |   4 +-
 .../spark/sql/sources/JavaSaveLoadSuite.java  |  10 +-
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala |  31 +-
 .../spark/sql/jdbc/JDBCWriteSuite.scala       |  54 +-
 .../hive/JavaMetastoreDataSourcesSuite.java   |  20 +-
 .../spark/sql/hive/CachedTableSuite.scala     |   4 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala  |  73 +-
 .../hive/execution/HiveResolutionSuite.scala  |   6 +-
 .../sql/hive/execution/SQLQuerySuite.scala    |   8 +-
 .../apache/spark/sql/hive/parquetSuites.scala |  14 +-
 .../sql/sources/hadoopFsRelationSuites.scala  |  68 +-
 20 files changed, 747 insertions(+), 761 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcUtils.scala

diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
index 173633ce059e3..afee279ec32b1 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
@@ -94,7 +94,7 @@ public String call(Row row) {
 
     System.out.println("=== Data source: Parquet File ===");
     // DataFrames can be saved as parquet files, maintaining the schema information.
-    schemaPeople.saveAsParquetFile("people.parquet");
+    schemaPeople.write().parquet("people.parquet");
 
     // Read in the parquet file created above.
     // Parquet files are self-describing so the schema is preserved.
@@ -151,7 +151,7 @@ public String call(Row row) {
     List<String> jsonData = Arrays.asList(
           "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
     JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
-    DataFrame peopleFromJsonRDD = sqlContext.jsonRDD(anotherPeopleRDD.rdd());
+    DataFrame peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
 
     // Take a look at the schema of this new DataFrame.
     peopleFromJsonRDD.printSchema();
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 55ef357a99f71..27e9af49f0664 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql
 
 import java.io.CharArrayWriter
-import java.sql.DriverManager
 import java.util.Properties
 
 import scala.collection.JavaConversions._
@@ -40,9 +39,8 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
 import org.apache.spark.sql.catalyst.{expressions, CatalystTypeConverters, ScalaReflection, SqlParser}
 import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD}
-import org.apache.spark.sql.jdbc.JDBCWriteDetails
 import org.apache.spark.sql.json.JacksonGenerator
-import org.apache.spark.sql.sources.{CreateTableUsingAsSelect, ResolvedDataSource}
+import org.apache.spark.sql.sources.CreateTableUsingAsSelect
 import org.apache.spark.sql.types._
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
@@ -227,10 +225,6 @@ class DataFrame private[sql](
     }
   }
 
-  /** Left here for backward compatibility. */
-  @deprecated("1.3.0", "use toDF")
-  def toSchemaRDD: DataFrame = this
-
   /**
    * Returns the object itself.
    * @group basic
@@ -1299,12 +1293,119 @@ class DataFrame private[sql](
   @Experimental
   def write: DataFrameWriter = new DataFrameWriter(this)
 
+  /**
+   * :: Experimental ::
+   * Adds the rows from this RDD to the specified table, optionally overwriting the existing data.
+   * @group output
+   * @since 1.3.0
+   */
+  @Experimental
+  def insertInto(tableName: String, overwrite: Boolean): Unit = {
+    sqlContext.executePlan(InsertIntoTable(UnresolvedRelation(Seq(tableName)),
+      Map.empty, logicalPlan, overwrite, ifNotExists = false)).toRdd
+  }
+
+  /**
+   * :: Experimental ::
+   * Adds the rows from this RDD to the specified table.
+   * Throws an exception if the table already exists.
+   * @group output
+   * @since 1.3.0
+   */
+  @Experimental
+  def insertInto(tableName: String): Unit = insertInto(tableName, overwrite = false)
+
+  /**
+   * Returns the content of the [[DataFrame]] as a RDD of JSON strings.
+   * @group rdd
+   * @since 1.3.0
+   */
+  def toJSON: RDD[String] = {
+    val rowSchema = this.schema
+    this.mapPartitions { iter =>
+      val writer = new CharArrayWriter()
+      // create the Generator without separator inserted between 2 records
+      val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null)
+
+      new Iterator[String] {
+        override def hasNext: Boolean = iter.hasNext
+        override def next(): String = {
+          JacksonGenerator(rowSchema, gen)(iter.next())
+          gen.flush()
+
+          val json = writer.toString
+          if (hasNext) {
+            writer.reset()
+          } else {
+            gen.close()
+          }
+
+          json
+        }
+      }
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  // for Python API
+  ////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * Converts a JavaRDD to a PythonRDD.
+   */
+  protected[sql] def javaToPython: JavaRDD[Array[Byte]] = {
+    val fieldTypes = schema.fields.map(_.dataType)
+    val jrdd = rdd.map(EvaluatePython.rowToArray(_, fieldTypes)).toJavaRDD()
+    SerDeUtil.javaToPython(jrdd)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////
+  // Deprecated methods
+  ////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////
+
+  /** Left here for backward compatibility. */
+  @deprecated("use toDF", "1.3.0")
+  def toSchemaRDD: DataFrame = this
+
+  /**
+   * Save this [[DataFrame]] to a JDBC database at `url` under the table name `table`.
+   * This will run a `CREATE TABLE` and a bunch of `INSERT INTO` statements.
+   * If you pass `true` for `allowExisting`, it will drop any table with the
+   * given name; if you pass `false`, it will throw if the table already
+   * exists.
+   * @group output
+   */
+  @deprecated("Use write.jdbc()", "1.4.0")
+  def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit = {
+    val w = if (allowExisting) write.mode(SaveMode.Overwrite) else write
+    w.jdbc(url, table, new Properties)
+  }
+
+  /**
+   * Save this [[DataFrame]] to a JDBC database at `url` under the table name `table`.
+   * Assumes the table already exists and has a compatible schema.  If you
+   * pass `true` for `overwrite`, it will `TRUNCATE` the table before
+   * performing the `INSERT`s.
+   *
+   * The table must already exist on the database.  It must have a schema
+   * that is compatible with the schema of this RDD; inserting the rows of
+   * the RDD in order via the simple statement
+   * `INSERT INTO table VALUES (?, ?, ..., ?)` should not fail.
+   * @group output
+   */
+  @deprecated("Use write.jdbc()", "1.4.0")
+  def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = {
+    val w = if (overwrite) write.mode(SaveMode.Overwrite) else write
+    w.jdbc(url, table, new Properties)
+  }
+
   /**
    * Saves the contents of this [[DataFrame]] as a parquet file, preserving the schema.
    * Files that are written out using this method can be read back in as a [[DataFrame]]
    * using the `parquetFile` function in [[SQLContext]].
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.parquet(path)", "1.4.0")
   def saveAsParquetFile(path: String): Unit = {
@@ -1328,7 +1429,6 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String): Unit = {
@@ -1347,7 +1447,6 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.mode(mode).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, mode: SaveMode): Unit = {
@@ -1373,7 +1472,6 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.format(source).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, source: String): Unit = {
@@ -1393,7 +1491,6 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.format(source).mode(mode).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, source: String, mode: SaveMode): Unit = {
@@ -1412,7 +1509,6 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.format(source).mode(mode).options(options).saveAsTable(tableName)",
     "1.4.0")
@@ -1437,7 +1533,6 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.format(source).mode(mode).options(options).saveAsTable(tableName)",
     "1.4.0")
@@ -1454,7 +1549,6 @@ class DataFrame private[sql](
    * using the default data source configured by spark.sql.sources.default and
    * [[SaveMode.ErrorIfExists]] as the save mode.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.save(path)", "1.4.0")
   def save(path: String): Unit = {
@@ -1465,7 +1559,6 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame to the given path and [[SaveMode]] specified by mode,
    * using the default data source configured by spark.sql.sources.default.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.mode(mode).save(path)", "1.4.0")
   def save(path: String, mode: SaveMode): Unit = {
@@ -1476,7 +1569,6 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame to the given path based on the given data source,
    * using [[SaveMode.ErrorIfExists]] as the save mode.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.format(source).save(path)", "1.4.0")
   def save(path: String, source: String): Unit = {
@@ -1487,7 +1579,6 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame to the given path based on the given data source and
    * [[SaveMode]] specified by mode.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.format(source).mode(mode).save(path)", "1.4.0")
   def save(path: String, source: String, mode: SaveMode): Unit = {
@@ -1498,7 +1589,6 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame based on the given data source,
    * [[SaveMode]] specified by mode, and a set of options.
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.format(source).mode(mode).options(options).save()", "1.4.0")
   def save(
@@ -1513,7 +1603,6 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame based on the given data source,
    * [[SaveMode]] specified by mode, and a set of options
    * @group output
-   * @since 1.3.0
    */
   @deprecated("Use write.format(source).mode(mode).options(options).save()", "1.4.0")
   def save(
@@ -1523,163 +1612,10 @@ class DataFrame private[sql](
     write.format(source).mode(mode).options(options).save()
   }
 
-  /**
-   * :: Experimental ::
-   * Adds the rows from this RDD to the specified table, optionally overwriting the existing data.
-   * @group output
-   * @since 1.3.0
-   */
-  @Experimental
-  def insertInto(tableName: String, overwrite: Boolean): Unit = {
-    sqlContext.executePlan(InsertIntoTable(UnresolvedRelation(Seq(tableName)),
-      Map.empty, logicalPlan, overwrite, ifNotExists = false)).toRdd
-  }
-
-  /**
-   * :: Experimental ::
-   * Adds the rows from this RDD to the specified table.
-   * Throws an exception if the table already exists.
-   * @group output
-   * @since 1.3.0
-   */
-  @Experimental
-  def insertInto(tableName: String): Unit = insertInto(tableName, overwrite = false)
-
-  /**
-   * Returns the content of the [[DataFrame]] as a RDD of JSON strings.
-   * @group rdd
-   * @since 1.3.0
-   */
-  def toJSON: RDD[String] = {
-    val rowSchema = this.schema
-    this.mapPartitions { iter =>
-      val writer = new CharArrayWriter()
-      // create the Generator without separator inserted between 2 records
-      val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null)
-
-      new Iterator[String] {
-        override def hasNext: Boolean = iter.hasNext
-        override def next(): String = {
-          JacksonGenerator(rowSchema, gen)(iter.next())
-          gen.flush()
-
-          val json = writer.toString
-          if (hasNext) {
-            writer.reset()
-          } else {
-            gen.close()
-          }
-
-          json
-        }
-      }
-    }
-  }
-
   ////////////////////////////////////////////////////////////////////////////
-  // JDBC Write Support
   ////////////////////////////////////////////////////////////////////////////
-
-  /**
-   * Save this [[DataFrame]] to a JDBC database at `url` under the table name `table`.
-   * This will run a `CREATE TABLE` and a bunch of `INSERT INTO` statements.
-   * If you pass `true` for `allowExisting`, it will drop any table with the
-   * given name; if you pass `false`, it will throw if the table already
-   * exists.
-   * @group output
-   * @since 1.3.0
-   */
-  def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit = {
-    createJDBCTable(url, table, allowExisting, new Properties())
-  }
-    
-  /**
-   * Save this [[DataFrame]] to a JDBC database at `url` under the table name `table`
-   * using connection properties defined in `properties`.
-   * This will run a `CREATE TABLE` and a bunch of `INSERT INTO` statements.
-   * If you pass `true` for `allowExisting`, it will drop any table with the
-   * given name; if you pass `false`, it will throw if the table already
-   * exists.
-   * @group output
-   * @since 1.4.0
-   */
-  def createJDBCTable(
-      url: String,
-      table: String,
-      allowExisting: Boolean,
-      properties: Properties): Unit = {
-    val conn = DriverManager.getConnection(url, properties)
-    try {
-      if (allowExisting) {
-        val sql = s"DROP TABLE IF EXISTS $table"
-        conn.prepareStatement(sql).executeUpdate()
-      }
-      val schema = JDBCWriteDetails.schemaString(this, url)
-      val sql = s"CREATE TABLE $table ($schema)"
-      conn.prepareStatement(sql).executeUpdate()
-    } finally {
-      conn.close()
-    }
-    JDBCWriteDetails.saveTable(this, url, table, properties)
-  }
-
-  /**
-   * Save this [[DataFrame]] to a JDBC database at `url` under the table name `table`.
-   * Assumes the table already exists and has a compatible schema.  If you
-   * pass `true` for `overwrite`, it will `TRUNCATE` the table before
-   * performing the `INSERT`s.
-   *
-   * The table must already exist on the database.  It must have a schema
-   * that is compatible with the schema of this RDD; inserting the rows of
-   * the RDD in order via the simple statement
-   * `INSERT INTO table VALUES (?, ?, ..., ?)` should not fail.
-   * @group output
-   * @since 1.3.0
-   */
-  def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = {
-    insertIntoJDBC(url, table, overwrite, new Properties())
-  }
-
-  /**
-   * Save this [[DataFrame]] to a JDBC database at `url` under the table name `table`
-   * using connection properties defined in `properties`.
-   * Assumes the table already exists and has a compatible schema.  If you
-   * pass `true` for `overwrite`, it will `TRUNCATE` the table before
-   * performing the `INSERT`s.
-   *
-   * The table must already exist on the database.  It must have a schema
-   * that is compatible with the schema of this RDD; inserting the rows of
-   * the RDD in order via the simple statement
-   * `INSERT INTO table VALUES (?, ?, ..., ?)` should not fail.
-   * @group output
-   * @since 1.4.0
-   */
-  def insertIntoJDBC(
-      url: String,
-      table: String,
-      overwrite: Boolean,
-      properties: Properties): Unit = {
-    if (overwrite) {
-      val conn = DriverManager.getConnection(url, properties)
-      try {
-        val sql = s"TRUNCATE TABLE $table"
-        conn.prepareStatement(sql).executeUpdate()
-      } finally {
-        conn.close()
-      }
-    }
-    JDBCWriteDetails.saveTable(this, url, table, properties)
-  }
+  // End of eeprecated methods
   ////////////////////////////////////////////////////////////////////////////
-  // for Python API
   ////////////////////////////////////////////////////////////////////////////
 
-  /**
-   * Converts a JavaRDD to a PythonRDD.
-   */
-  protected[sql] def javaToPython: JavaRDD[Array[Byte]] = {
-    val fieldTypes = schema.fields.map(_.dataType)
-    val jrdd = rdd.map(EvaluatePython.rowToArray(_, fieldTypes)).toJavaRDD()
-    SerDeUtil.javaToPython(jrdd)
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 4d63faad6fb7c..381c10f48f3c3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -17,12 +17,16 @@
 
 package org.apache.spark.sql
 
+import java.util.Properties
+
 import org.apache.hadoop.fs.Path
+import org.apache.spark.Partition
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation}
 import org.apache.spark.sql.json.{JsonRDD, JSONRelation}
 import org.apache.spark.sql.parquet.ParquetRelation2
 import org.apache.spark.sql.sources.{LogicalRelation, ResolvedDataSource}
@@ -31,7 +35,7 @@ import org.apache.spark.sql.types.StructType
 /**
  * :: Experimental ::
  * Interface used to load a [[DataFrame]] from external storage systems (e.g. file systems,
- * key-value stores, etc).
+ * key-value stores, etc). Use [[SQLContext.read]] to access this.
  *
  * @since 1.4.0
  */
@@ -94,6 +98,8 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
    * Specifies the input partitioning. If specified, the underlying data source does not need to
    * discover the data partitioning scheme, and thus can speed up very large inputs.
    *
+   * This is only applicable for Parquet at the moment.
+   *
    * @since 1.4.0
    */
   @scala.annotation.varargs
@@ -128,6 +134,87 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
     DataFrame(sqlContext, LogicalRelation(resolved.relation))
   }
 
+  /**
+   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+   * url named table and connection properties.
+   *
+   * @since 1.4.0
+   */
+  def jdbc(url: String, table: String, properties: Properties): DataFrame = {
+    jdbc(url, table, JDBCRelation.columnPartition(null), properties)
+  }
+
+  /**
+   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+   * url named table. Partitions of the table will be retrieved in parallel based on the parameters
+   * passed to this function.
+   *
+   * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
+   * your external database systems.
+   *
+   * @param url JDBC database url of the form `jdbc:subprotocol:subname`
+   * @param table Name of the table in the external database.
+   * @param columnName the name of a column of integral type that will be used for partitioning.
+   * @param lowerBound the minimum value of `columnName` used to decide partition stride
+   * @param upperBound the maximum value of `columnName` used to decide partition stride
+   * @param numPartitions the number of partitions.  the range `minValue`-`maxValue` will be split
+   *                      evenly into this many partitions
+   * @param connectionProperties JDBC database connection arguments, a list of arbitrary string
+   *                             tag/value. Normally at least a "user" and "password" property
+   *                             should be included.
+   *
+   * @since 1.4.0
+   */
+  def jdbc(
+      url: String,
+      table: String,
+      columnName: String,
+      lowerBound: Long,
+      upperBound: Long,
+      numPartitions: Int,
+      connectionProperties: Properties): DataFrame = {
+    val partitioning = JDBCPartitioningInfo(columnName, lowerBound, upperBound, numPartitions)
+    val parts = JDBCRelation.columnPartition(partitioning)
+    jdbc(url, table, parts, connectionProperties)
+  }
+
+  /**
+   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+   * url named table using connection properties. The `predicates` parameter gives a list
+   * expressions suitable for inclusion in WHERE clauses; each one defines one partition
+   * of the [[DataFrame]].
+   *
+   * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
+   * your external database systems.
+   *
+   * @param url JDBC database url of the form `jdbc:subprotocol:subname`
+   * @param table Name of the table in the external database.
+   * @param predicates Condition in the where clause for each partition.
+   * @param connectionProperties JDBC database connection arguments, a list of arbitrary string
+   *                             tag/value. Normally at least a "user" and "password" property
+   *                             should be included.
+   * @since 1.4.0
+   */
+  def jdbc(
+      url: String,
+      table: String,
+      predicates: Array[String],
+      connectionProperties: Properties): DataFrame = {
+    val parts: Array[Partition] = predicates.zipWithIndex.map { case (part, i) =>
+      JDBCPartition(part, i) : Partition
+    }
+    jdbc(url, table, parts, connectionProperties)
+  }
+
+  private def jdbc(
+      url: String,
+      table: String,
+      parts: Array[Partition],
+      connectionProperties: Properties): DataFrame = {
+    val relation = JDBCRelation(url, table, parts, connectionProperties)(sqlContext)
+    sqlContext.baseRelationToDataFrame(relation)
+  }
+
   /**
    * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]].
    *
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 9f42f0f1f4398..f2e721d4db271 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -17,14 +17,17 @@
 
 package org.apache.spark.sql
 
+import java.util.Properties
+
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.jdbc.{JDBCWriteDetails, JdbcUtils}
 import org.apache.spark.sql.sources.{ResolvedDataSource, CreateTableUsingAsSelect}
 
 
 /**
  * :: Experimental ::
  * Interface used to write a [[DataFrame]] to external storage systems (e.g. file systems,
- * key-value stores, etc).
+ * key-value stores, etc). Use [[DataFrame.write]] to access this.
  *
  * @since 1.4.0
  */
@@ -110,6 +113,8 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    * Partitions the output by the given columns on the file system. If specified, the output is
    * laid out on the file system similar to Hive's partitioning scheme.
    *
+   * This is only applicable for Parquet at the moment.
+   *
    * @since 1.4.0
    */
   @scala.annotation.varargs
@@ -161,6 +166,52 @@ final class DataFrameWriter private[sql](df: DataFrame) {
     df.sqlContext.executePlan(cmd).toRdd
   }
 
+  /**
+   * Saves the content of the [[DataFrame]] to a external database table via JDBC. In the case the
+   * table already exists in the external database, behavior of this function depends on the
+   * save mode, specified by the `mode` function (default to throwing an exception).
+   *
+   * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
+   * your external database systems.
+   *
+   * @param url JDBC database url of the form `jdbc:subprotocol:subname`
+   * @param table Name of the table in the external database.
+   * @param connectionProperties JDBC database connection arguments, a list of arbitrary string
+   *                             tag/value. Normally at least a "user" and "password" property
+   *                             should be included.
+   */
+  def jdbc(url: String, table: String, connectionProperties: Properties): Unit = {
+    val conn = JdbcUtils.createConnection(url, connectionProperties)
+
+    try {
+      var tableExists = JdbcUtils.tableExists(conn, table)
+
+      if (mode == SaveMode.Ignore && tableExists) {
+        return
+      }
+
+      if (mode == SaveMode.ErrorIfExists && tableExists) {
+        sys.error(s"Table $table already exists.")
+      }
+
+      if (mode == SaveMode.Overwrite && tableExists) {
+        JdbcUtils.dropTable(conn, table)
+        tableExists = false
+      }
+
+      // Create the table if the table didn't exist.
+      if (!tableExists) {
+        val schema = JDBCWriteDetails.schemaString(df, url)
+        val sql = s"CREATE TABLE $table ($schema)"
+        conn.prepareStatement(sql).executeUpdate()
+      }
+    } finally {
+      conn.close()
+    }
+
+    JDBCWriteDetails.saveTable(df, url, table, connectionProperties)
+  }
+
   /**
    * Saves the content of the [[DataFrame]] in JSON format at the specified path.
    * This is equivalent to:
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 34a50e522c4ca..ac1a800219423 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -28,6 +28,7 @@ import scala.util.control.NonFatal
 
 import com.google.common.reflect.TypeToken
 
+import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.rdd.RDD
@@ -40,11 +41,9 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
 import org.apache.spark.sql.catalyst.ParserDialect
 import org.apache.spark.sql.execution.{Filter, _}
-import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
-import org.apache.spark.{Partition, SparkContext}
 
 /**
  * The entry point for working with structured data (rows and columns) in Spark.  Allows the
@@ -531,67 +530,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
     createDataFrame(rdd.rdd, beanClass)
   }
 
-  /**
-   * :: DeveloperApi ::
-   * Creates a [[DataFrame]] from an [[RDD]] containing [[Row]]s by applying a schema to this RDD.
-   * It is important to make sure that the structure of every [[Row]] of the provided RDD matches
-   * the provided schema. Otherwise, there will be runtime exception.
-   * Example:
-   * {{{
-   *  import org.apache.spark.sql._
-   *  import org.apache.spark.sql.types._
-   *  val sqlContext = new org.apache.spark.sql.SQLContext(sc)
-   *
-   *  val schema =
-   *    StructType(
-   *      StructField("name", StringType, false) ::
-   *      StructField("age", IntegerType, true) :: Nil)
-   *
-   *  val people =
-   *    sc.textFile("examples/src/main/resources/people.txt").map(
-   *      _.split(",")).map(p => Row(p(0), p(1).trim.toInt))
-   *  val dataFrame = sqlContext. applySchema(people, schema)
-   *  dataFrame.printSchema
-   *  // root
-   *  // |-- name: string (nullable = false)
-   *  // |-- age: integer (nullable = true)
-   *
-   *  dataFrame.registerTempTable("people")
-   *  sqlContext.sql("select name from people").collect.foreach(println)
-   * }}}
-   */
-  @deprecated("use createDataFrame", "1.3.0")
-  def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = {
-    createDataFrame(rowRDD, schema)
-  }
-
-  @deprecated("use createDataFrame", "1.3.0")
-  def applySchema(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = {
-    createDataFrame(rowRDD, schema)
-  }
-
-  /**
-   * Applies a schema to an RDD of Java Beans.
-   *
-   * WARNING: Since there is no guaranteed ordering for fields in a Java Bean,
-   *          SELECT * queries will return the columns in an undefined order.
-   */
-  @deprecated("use createDataFrame", "1.3.0")
-  def applySchema(rdd: RDD[_], beanClass: Class[_]): DataFrame = {
-    createDataFrame(rdd, beanClass)
-  }
-
-  /**
-   * Applies a schema to an RDD of Java Beans.
-   *
-   * WARNING: Since there is no guaranteed ordering for fields in a Java Bean,
-   *          SELECT * queries will return the columns in an undefined order.
-   */
-  @deprecated("use createDataFrame", "1.3.0")
-  def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = {
-    createDataFrame(rdd, beanClass)
-  }
-
   /**
    * :: Experimental ::
    * Returns a [[DataFrameReader]] that can be used to read data in as a [[DataFrame]].
@@ -606,205 +544,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
   @Experimental
   def read: DataFrameReader = new DataFrameReader(this)
 
-  /**
-   * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty
-   * [[DataFrame]] if no paths are passed in.
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.parquet()", "1.4.0")
-  @scala.annotation.varargs
-  def parquetFile(paths: String*): DataFrame = {
-    if (paths.isEmpty) {
-      emptyDataFrame
-    } else if (conf.parquetUseDataSourceApi) {
-      read.parquet(paths : _*)
-    } else {
-      DataFrame(this, parquet.ParquetRelation(
-        paths.mkString(","), Some(sparkContext.hadoopConfiguration), this))
-    }
-  }
-
-  /**
-   * Loads a JSON file (one object per line), returning the result as a [[DataFrame]].
-   * It goes through the entire dataset once to determine the schema.
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonFile(path: String): DataFrame = {
-    read.json(path)
-  }
-
-  /**
-   * Loads a JSON file (one object per line) and applies the given schema,
-   * returning the result as a [[DataFrame]].
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonFile(path: String, schema: StructType): DataFrame = {
-    read.schema(schema).json(path)
-  }
-
-  /**
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonFile(path: String, samplingRatio: Double): DataFrame = {
-    read.option("samplingRatio", samplingRatio.toString).json(path)
-  }
-
-  /**
-   * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a
-   * [[DataFrame]].
-   * It goes through the entire dataset once to determine the schema.
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonRDD(json: RDD[String]): DataFrame = read.json(json)
-
-  /**
-   * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a
-   * [[DataFrame]].
-   * It goes through the entire dataset once to determine the schema.
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonRDD(json: JavaRDD[String]): DataFrame = read.json(json)
-
-  /**
-   * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema,
-   * returning the result as a [[DataFrame]].
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonRDD(json: RDD[String], schema: StructType): DataFrame = {
-    read.schema(schema).json(json)
-  }
-
-  /**
-   * Loads an JavaRDD<String> storing JSON objects (one object per record) and applies the given
-   * schema, returning the result as a [[DataFrame]].
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonRDD(json: JavaRDD[String], schema: StructType): DataFrame = {
-    read.schema(schema).json(json)
-  }
-
-  /**
-   * Loads an RDD[String] storing JSON objects (one object per record) inferring the
-   * schema, returning the result as a [[DataFrame]].
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = {
-    read.option("samplingRatio", samplingRatio.toString).json(json)
-  }
-
-  /**
-   * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the
-   * schema, returning the result as a [[DataFrame]].
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.json()", "1.4.0")
-  def jsonRDD(json: JavaRDD[String], samplingRatio: Double): DataFrame = {
-    read.option("samplingRatio", samplingRatio.toString).json(json)
-  }
-
-  /**
-   * Returns the dataset stored at path as a DataFrame,
-   * using the default data source configured by spark.sql.sources.default.
-   *
-   * @group genericdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.load(path)", "1.4.0")
-  def load(path: String): DataFrame = {
-    read.load(path)
-  }
-
-  /**
-   * Returns the dataset stored at path as a DataFrame, using the given data source.
-   *
-   * @group genericdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.format(source).load(path)", "1.4.0")
-  def load(path: String, source: String): DataFrame = {
-    read.format(source).load(path)
-  }
-
-  /**
-   * (Java-specific) Returns the dataset specified by the given data source and
-   * a set of options as a DataFrame.
-   *
-   * @group genericdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.format(source).options(options).load()", "1.4.0")
-  def load(source: String, options: java.util.Map[String, String]): DataFrame = {
-    read.options(options).format(source).load()
-  }
-
-  /**
-   * (Scala-specific) Returns the dataset specified by the given data source and
-   * a set of options as a DataFrame.
-   *
-   * @group genericdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.format(source).options(options).load()", "1.4.0")
-  def load(source: String, options: Map[String, String]): DataFrame = {
-    read.options(options).format(source).load()
-  }
-
-  /**
-   * (Java-specific) Returns the dataset specified by the given data source and
-   * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
-   *
-   * @group genericdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.format(source).schema(schema).options(options).load()", "1.4.0")
-  def load(
-      source: String,
-      schema: StructType,
-      options: java.util.Map[String, String]): DataFrame = {
-    read.format(source).schema(schema).options(options).load()
-  }
-
-  /**
-   * (Scala-specific) Returns the dataset specified by the given data source and
-   * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
-   * @group genericdata
-   * @since 1.3.0
-   */
-  @deprecated("Use read.format(source).schema(schema).options(options).load()", "1.4.0")
-  def load(
-      source: String,
-      schema: StructType,
-      options: Map[String, String]): DataFrame = {
-    read.format(source).schema(schema).options(options).load()
-  }
-
   /**
    * :: Experimental ::
    * Creates an external table from the given path and returns the corresponding DataFrame.
@@ -903,150 +642,24 @@ class SQLContext(@transient val sparkContext: SparkContext)
    *
    * @group ddl_ops
    * @since 1.3.0
-   */
-  @Experimental
-  def createExternalTable(
-      tableName: String,
-      source: String,
-      schema: StructType,
-      options: Map[String, String]): DataFrame = {
-    val cmd =
-      CreateTableUsing(
-        tableName,
-        userSpecifiedSchema = Some(schema),
-        source,
-        temporary = false,
-        options,
-        allowExisting = false,
-        managedIfNoPath = false)
-    executePlan(cmd).toRdd
-    table(tableName)
-  }
-
-  /**
-   * :: Experimental ::
-   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
-   * url named table.
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @Experimental
-  def jdbc(url: String, table: String): DataFrame = {
-    jdbc(url, table, JDBCRelation.columnPartition(null), new Properties())
-  }
-
-  /**
-   * :: Experimental ::
-   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
-   * url named table and connection properties.
-   *
-   * @group specificdata
-   * @since 1.4.0
-   */
-  @Experimental
-  def jdbc(url: String, table: String, properties: Properties): DataFrame = {
-    jdbc(url, table, JDBCRelation.columnPartition(null), properties)
-  }
-
-  /**
-   * :: Experimental ::
-   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
-   * url named table.  Partitions of the table will be retrieved in parallel based on the parameters
-   * passed to this function.
-   *
-   * @param columnName the name of a column of integral type that will be used for partitioning.
-   * @param lowerBound the minimum value of `columnName` used to decide partition stride
-   * @param upperBound the maximum value of `columnName` used to decide partition stride
-   * @param numPartitions the number of partitions.  the range `minValue`-`maxValue` will be split
-   *                      evenly into this many partitions
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @Experimental
-  def jdbc(
-      url: String,
-      table: String,
-      columnName: String,
-      lowerBound: Long,
-      upperBound: Long,
-      numPartitions: Int): DataFrame = {
-    jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, new Properties())
-  }
-
-  /**
-   * :: Experimental ::
-   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
-   * url named table.  Partitions of the table will be retrieved in parallel based on the parameters
-   * passed to this function.
-   *
-   * @param columnName the name of a column of integral type that will be used for partitioning.
-   * @param lowerBound the minimum value of `columnName` used to decide partition stride
-   * @param upperBound the maximum value of `columnName` used to decide partition stride
-   * @param numPartitions the number of partitions.  the range `minValue`-`maxValue` will be split
-   *                      evenly into this many partitions
-   * @param properties connection properties
-   * @group specificdata
-   * @since 1.4.0
-   */
-  @Experimental
-  def jdbc(
-      url: String,
-      table: String,
-      columnName: String,
-      lowerBound: Long,
-      upperBound: Long,
-      numPartitions: Int,
-      properties: Properties): DataFrame = {
-    val partitioning = JDBCPartitioningInfo(columnName, lowerBound, upperBound, numPartitions)
-    val parts = JDBCRelation.columnPartition(partitioning)
-    jdbc(url, table, parts, properties)
-  }
-
-  /**
-   * :: Experimental ::
-   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
-   * url named table. The theParts parameter gives a list expressions
-   * suitable for inclusion in WHERE clauses; each one defines one partition
-   * of the [[DataFrame]].
-   *
-   * @group specificdata
-   * @since 1.3.0
-   */
-  @Experimental
-  def jdbc(url: String, table: String, theParts: Array[String]): DataFrame = {
-    jdbc(url, table, theParts, new Properties())
-  }
-
-  /**
-   * :: Experimental ::
-   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
-   * url named table using connection properties. The theParts parameter gives a list expressions
-   * suitable for inclusion in WHERE clauses; each one defines one partition
-   * of the [[DataFrame]].
-   *
-   * @group specificdata
-   * @since 1.4.0
-   */
-  @Experimental
-  def jdbc(
-      url: String,
-      table: String,
-      theParts: Array[String],
-      properties: Properties): DataFrame = {
-    val parts: Array[Partition] = theParts.zipWithIndex.map { case (part, i) =>
-      JDBCPartition(part, i) : Partition
-    }
-    jdbc(url, table, parts, properties)
-  }
-
-  private def jdbc(
-      url: String,
-      table: String,
-      parts: Array[Partition],
-      properties: Properties): DataFrame = {
-    val relation = JDBCRelation(url, table, parts, properties)(this)
-    baseRelationToDataFrame(relation)
+   */
+  @Experimental
+  def createExternalTable(
+      tableName: String,
+      source: String,
+      schema: StructType,
+      options: Map[String, String]): DataFrame = {
+    val cmd =
+      CreateTableUsing(
+        tableName,
+        userSpecifiedSchema = Some(schema),
+        source,
+        temporary = false,
+        options,
+        allowExisting = false,
+        managedIfNoPath = false)
+    executePlan(cmd).toRdd
+    table(tableName)
   }
 
   /**
@@ -1372,6 +985,263 @@ class SQLContext(@transient val sparkContext: SparkContext)
     }
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////
+  // Deprecated methods
+  ////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////
+
+  @deprecated("use createDataFrame", "1.3.0")
+  def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = {
+    createDataFrame(rowRDD, schema)
+  }
+
+  @deprecated("use createDataFrame", "1.3.0")
+  def applySchema(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = {
+    createDataFrame(rowRDD, schema)
+  }
+
+  @deprecated("use createDataFrame", "1.3.0")
+  def applySchema(rdd: RDD[_], beanClass: Class[_]): DataFrame = {
+    createDataFrame(rdd, beanClass)
+  }
+
+  @deprecated("use createDataFrame", "1.3.0")
+  def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = {
+    createDataFrame(rdd, beanClass)
+  }
+
+  /**
+   * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty
+   * [[DataFrame]] if no paths are passed in.
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.parquet()", "1.4.0")
+  @scala.annotation.varargs
+  def parquetFile(paths: String*): DataFrame = {
+    if (paths.isEmpty) {
+      emptyDataFrame
+    } else if (conf.parquetUseDataSourceApi) {
+      read.parquet(paths : _*)
+    } else {
+      DataFrame(this, parquet.ParquetRelation(
+        paths.mkString(","), Some(sparkContext.hadoopConfiguration), this))
+    }
+  }
+
+  /**
+   * Loads a JSON file (one object per line), returning the result as a [[DataFrame]].
+   * It goes through the entire dataset once to determine the schema.
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonFile(path: String): DataFrame = {
+    read.json(path)
+  }
+
+  /**
+   * Loads a JSON file (one object per line) and applies the given schema,
+   * returning the result as a [[DataFrame]].
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonFile(path: String, schema: StructType): DataFrame = {
+    read.schema(schema).json(path)
+  }
+
+  /**
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonFile(path: String, samplingRatio: Double): DataFrame = {
+    read.option("samplingRatio", samplingRatio.toString).json(path)
+  }
+
+  /**
+   * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a
+   * [[DataFrame]].
+   * It goes through the entire dataset once to determine the schema.
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonRDD(json: RDD[String]): DataFrame = read.json(json)
+
+  /**
+   * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a
+   * [[DataFrame]].
+   * It goes through the entire dataset once to determine the schema.
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonRDD(json: JavaRDD[String]): DataFrame = read.json(json)
+
+  /**
+   * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema,
+   * returning the result as a [[DataFrame]].
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonRDD(json: RDD[String], schema: StructType): DataFrame = {
+    read.schema(schema).json(json)
+  }
+
+  /**
+   * Loads an JavaRDD<String> storing JSON objects (one object per record) and applies the given
+   * schema, returning the result as a [[DataFrame]].
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonRDD(json: JavaRDD[String], schema: StructType): DataFrame = {
+    read.schema(schema).json(json)
+  }
+
+  /**
+   * Loads an RDD[String] storing JSON objects (one object per record) inferring the
+   * schema, returning the result as a [[DataFrame]].
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = {
+    read.option("samplingRatio", samplingRatio.toString).json(json)
+  }
+
+  /**
+   * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the
+   * schema, returning the result as a [[DataFrame]].
+   *
+   * @group specificdata
+   */
+  @deprecated("Use read.json()", "1.4.0")
+  def jsonRDD(json: JavaRDD[String], samplingRatio: Double): DataFrame = {
+    read.option("samplingRatio", samplingRatio.toString).json(json)
+  }
+
+  /**
+   * Returns the dataset stored at path as a DataFrame,
+   * using the default data source configured by spark.sql.sources.default.
+   *
+   * @group genericdata
+   */
+  @deprecated("Use read.load(path)", "1.4.0")
+  def load(path: String): DataFrame = {
+    read.load(path)
+  }
+
+  /**
+   * Returns the dataset stored at path as a DataFrame, using the given data source.
+   *
+   * @group genericdata
+   */
+  @deprecated("Use read.format(source).load(path)", "1.4.0")
+  def load(path: String, source: String): DataFrame = {
+    read.format(source).load(path)
+  }
+
+  /**
+   * (Java-specific) Returns the dataset specified by the given data source and
+   * a set of options as a DataFrame.
+   *
+   * @group genericdata
+   */
+  @deprecated("Use read.format(source).options(options).load()", "1.4.0")
+  def load(source: String, options: java.util.Map[String, String]): DataFrame = {
+    read.options(options).format(source).load()
+  }
+
+  /**
+   * (Scala-specific) Returns the dataset specified by the given data source and
+   * a set of options as a DataFrame.
+   *
+   * @group genericdata
+   */
+  @deprecated("Use read.format(source).options(options).load()", "1.4.0")
+  def load(source: String, options: Map[String, String]): DataFrame = {
+    read.options(options).format(source).load()
+  }
+
+  /**
+   * (Java-specific) Returns the dataset specified by the given data source and
+   * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
+   *
+   * @group genericdata
+   */
+  @deprecated("Use read.format(source).schema(schema).options(options).load()", "1.4.0")
+  def load(source: String, schema: StructType, options: java.util.Map[String, String]): DataFrame =
+  {
+    read.format(source).schema(schema).options(options).load()
+  }
+
+  /**
+   * (Scala-specific) Returns the dataset specified by the given data source and
+   * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
+   *
+   * @group genericdata
+   */
+  @deprecated("Use read.format(source).schema(schema).options(options).load()", "1.4.0")
+  def load(source: String, schema: StructType, options: Map[String, String]): DataFrame = {
+    read.format(source).schema(schema).options(options).load()
+  }
+
+  /**
+   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+   * url named table.
+   *
+   * @group specificdata
+   */
+  @deprecated("use read.jdbc()", "1.4.0")
+  def jdbc(url: String, table: String): DataFrame = {
+    read.jdbc(url, table, new Properties)
+  }
+
+  /**
+   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+   * url named table.  Partitions of the table will be retrieved in parallel based on the parameters
+   * passed to this function.
+   *
+   * @param columnName the name of a column of integral type that will be used for partitioning.
+   * @param lowerBound the minimum value of `columnName` used to decide partition stride
+   * @param upperBound the maximum value of `columnName` used to decide partition stride
+   * @param numPartitions the number of partitions.  the range `minValue`-`maxValue` will be split
+   *                      evenly into this many partitions
+   * @group specificdata
+   */
+  @deprecated("use read.jdbc()", "1.4.0")
+  def jdbc(
+      url: String,
+      table: String,
+      columnName: String,
+      lowerBound: Long,
+      upperBound: Long,
+      numPartitions: Int): DataFrame = {
+    read.jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, new Properties)
+  }
+
+  /**
+   * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+   * url named table. The theParts parameter gives a list expressions
+   * suitable for inclusion in WHERE clauses; each one defines one partition
+   * of the [[DataFrame]].
+   *
+   * @group specificdata
+   */
+  @deprecated("use read.jdbc()", "1.4.0")
+  def jdbc(url: String, table: String, theParts: Array[String]): DataFrame = {
+    read.jdbc(url, table, theParts, new Properties)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////
+  // End of eeprecated methods
+  ////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////
 }
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 40483d3ec7701..95935ba874a72 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -29,7 +29,16 @@ import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.sources._
 
+/**
+ * Data corresponding to one partition of a JDBCRDD.
+ */
+private[sql] case class JDBCPartition(whereClause: String, idx: Int) extends Partition {
+  override def index: Int = idx
+}
+
+
 private[sql] object JDBCRDD extends Logging {
+
   /**
    * Maps a JDBC type to a Catalyst type.  This function is called only when
    * the DriverQuirks class corresponding to your database driver returns null.
@@ -168,6 +177,7 @@ private[sql] object JDBCRDD extends Logging {
       DriverManager.getConnection(url, properties)
     }
   }
+
   /**
    * Build and return JDBCRDD from the given information.
    *
@@ -193,18 +203,14 @@ private[sql] object JDBCRDD extends Logging {
       requiredColumns: Array[String],
       filters: Array[Filter],
       parts: Array[Partition]): RDD[Row] = {
-
-    val prunedSchema = pruneSchema(schema, requiredColumns)
-
-    return new
-        JDBCRDD(
-          sc,
-          getConnector(driver, url, properties),
-          prunedSchema,
-          fqTable,
-          requiredColumns,
-          filters,
-          parts)
+    new JDBCRDD(
+      sc,
+      getConnector(driver, url, properties),
+      pruneSchema(schema, requiredColumns),
+      fqTable,
+      requiredColumns,
+      filters,
+      parts)
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
index 93e82549f213b..09d6865457df6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
@@ -17,26 +17,16 @@
 
 package org.apache.spark.sql.jdbc
 
-import java.sql.DriverManager
 import java.util.Properties
 
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.Partition
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext}
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.Utils
-
-/**
- * Data corresponding to one partition of a JDBCRDD.
- */
-private[sql] case class JDBCPartition(whereClause: String, idx: Int) extends Partition {
-  override def index: Int = idx
-}
 
 /**
  * Instructions on how to partition the table among workers.
@@ -152,6 +142,8 @@ private[sql] case class JDBCRelation(
   }
   
   override def insert(data: DataFrame, overwrite: Boolean): Unit = {
-    data.insertIntoJDBC(url, table, overwrite, properties)
+    data.write
+      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
+      .jdbc(url, table, properties)
   }  
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcUtils.scala
new file mode 100644
index 0000000000000..cc918c237192b
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcUtils.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.jdbc
+
+import java.sql.{Connection, DriverManager}
+import java.util.Properties
+
+import scala.util.Try
+
+/**
+ * Util functions for JDBC tables.
+ */
+private[sql] object JdbcUtils {
+
+  /**
+   * Establishes a JDBC connection.
+   */
+  def createConnection(url: String, connectionProperties: Properties): Connection = {
+    DriverManager.getConnection(url, connectionProperties)
+  }
+
+  /**
+   * Returns true if the table already exists in the JDBC database.
+   */
+  def tableExists(conn: Connection, table: String): Boolean = {
+    // Somewhat hacky, but there isn't a good way to identify whether a table exists for all
+    // SQL database systems, considering "table" could also include the database name.
+    Try(conn.prepareStatement(s"SELECT 1 FROM $table LIMIT 1").executeQuery().next()).isSuccess
+  }
+
+  /**
+   * Drops a table from the JDBC database.
+   */
+  def dropTable(conn: Connection, table: String): Unit = {
+    conn.prepareStatement(s"DROP TABLE $table").executeUpdate()
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
index c099881a01226..a61790b8472c8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
@@ -163,8 +163,8 @@ package object jdbc {
         table: String,
         properties: Properties = new Properties()) {
       val quirks = DriverQuirks.get(url)
-      var nullTypes: Array[Int] = df.schema.fields.map(field => {
-        var nullType: Option[Int] = quirks.getJDBCType(field.dataType)._2
+      val nullTypes: Array[Int] = df.schema.fields.map { field =>
+        val nullType: Option[Int] = quirks.getJDBCType(field.dataType)._2
         if (nullType.isEmpty) {
           field.dataType match {
             case IntegerType => java.sql.Types.INTEGER
@@ -183,7 +183,7 @@ package object jdbc {
               s"Can't translate null value for field $field")
           }
         } else nullType.get
-      }).toArray
+      }
 
       val rddSchema = df.schema
       df.foreachPartition { iterator =>
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java
index c344a9b095c52..fcb8f5499cf84 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java
@@ -187,14 +187,14 @@ public void applySchemaToJSON() {
         null,
         "this is another simple string."));
 
-    DataFrame df1 = sqlContext.jsonRDD(jsonRDD);
+    DataFrame df1 = sqlContext.read().json(jsonRDD);
     StructType actualSchema1 = df1.schema();
     Assert.assertEquals(expectedSchema, actualSchema1);
     df1.registerTempTable("jsonTable1");
     List<Row> actual1 = sqlContext.sql("select * from jsonTable1").collectAsList();
     Assert.assertEquals(expectedResult, actual1);
 
-    DataFrame df2 = sqlContext.jsonRDD(jsonRDD, expectedSchema);
+    DataFrame df2 = sqlContext.read().schema(expectedSchema).json(jsonRDD);
     StructType actualSchema2 = df2.schema();
     Assert.assertEquals(expectedSchema, actualSchema2);
     df2.registerTempTable("jsonTable2");
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
index 6a0bcefe7aa88..2706e01bd28af 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
@@ -67,7 +67,7 @@ public void setUp() throws IOException {
       jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}");
     }
     JavaRDD<String> rdd = sc.parallelize(jsonObjects);
-    df = sqlContext.jsonRDD(rdd);
+    df = sqlContext.read().json(rdd);
     df.registerTempTable("jsonTable");
   }
 
@@ -75,10 +75,8 @@ public void setUp() throws IOException {
   public void saveAndLoad() {
     Map<String, String> options = new HashMap<String, String>();
     options.put("path", path.toString());
-    df.save("json", SaveMode.ErrorIfExists, options);
-
+    df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save();
     DataFrame loadedDF = sqlContext.read().format("json").options(options).load();
-
     checkAnswer(loadedDF, df.collectAsList());
   }
 
@@ -86,12 +84,12 @@ public void saveAndLoad() {
   public void saveAndLoadWithSchema() {
     Map<String, String> options = new HashMap<String, String>();
     options.put("path", path.toString());
-    df.save("json", SaveMode.ErrorIfExists, options);
+    df.write().format("json").mode(SaveMode.ErrorIfExists).options(options).save();
 
     List<StructField> fields = new ArrayList<StructField>();
     fields.add(DataTypes.createStructField("b", DataTypes.StringType, true));
     StructType schema = DataTypes.createStructType(fields);
-    DataFrame loadedDF = sqlContext.load("json", schema, options);
+    DataFrame loadedDF = sqlContext.read().format("json").schema(schema).options(options).load();
 
     checkAnswer(loadedDF, sqlContext.sql("SELECT b FROM jsonTable").collectAsList());
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 2abfe7f167f77..5a7b6f0aac6f7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -221,22 +221,25 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
   }
 
   test("Basic API") {
-    assert(TestSQLContext.jdbc(urlWithUserAndPass, "TEST.PEOPLE").collect().size === 3)
+    assert(TestSQLContext.read.jdbc(
+      urlWithUserAndPass, "TEST.PEOPLE", new Properties).collect().length === 3)
   }
 
   test("Partitioning via JDBCPartitioningInfo API") {
-    assert(TestSQLContext.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3)
-      .collect.size === 3)
+    assert(
+      TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3, new Properties)
+      .collect().length === 3)
   }
 
   test("Partitioning via list-of-where-clauses API") {
     val parts = Array[String]("THEID < 2", "THEID >= 2")
-    assert(TestSQLContext.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts).collect().size === 3)
+    assert(TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts, new Properties)
+      .collect().length === 3)
   }
 
   test("H2 integral types") {
     val rows = sql("SELECT * FROM inttypes WHERE A IS NOT NULL").collect()
-    assert(rows.size === 1)
+    assert(rows.length === 1)
     assert(rows(0).getInt(0) === 1)
     assert(rows(0).getBoolean(1) === false)
     assert(rows(0).getInt(2) === 3)
@@ -246,7 +249,7 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
 
   test("H2 null entries") {
     val rows = sql("SELECT * FROM inttypes WHERE A IS NULL").collect()
-    assert(rows.size === 1)
+    assert(rows.length === 1)
     assert(rows(0).isNullAt(0))
     assert(rows(0).isNullAt(1))
     assert(rows(0).isNullAt(2))
@@ -286,24 +289,28 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
   }
 
   test("test DATE types") {
-    val rows = TestSQLContext.jdbc(urlWithUserAndPass, "TEST.TIMETYPES").collect()
-    val cachedRows = TestSQLContext.jdbc(urlWithUserAndPass, "TEST.TIMETYPES").cache().collect()
+    val rows = TestSQLContext.read.jdbc(
+      urlWithUserAndPass, "TEST.TIMETYPES", new Properties).collect()
+    val cachedRows = TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties)
+      .cache().collect()
     assert(rows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01"))
     assert(rows(1).getAs[java.sql.Date](1) === null)
     assert(cachedRows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01"))
   }
 
   test("test DATE types in cache") {
-    val rows = TestSQLContext.jdbc(urlWithUserAndPass, "TEST.TIMETYPES").collect()
-    TestSQLContext
-      .jdbc(urlWithUserAndPass, "TEST.TIMETYPES").cache().registerTempTable("mycached_date")
+    val rows =
+      TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties).collect()
+    TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties)
+      .cache().registerTempTable("mycached_date")
     val cachedRows = sql("select * from mycached_date").collect()
     assert(rows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01"))
     assert(cachedRows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01"))
   }
 
   test("test types for null value") {
-    val rows = TestSQLContext.jdbc(urlWithUserAndPass, "TEST.NULLTYPES").collect()
+    val rows = TestSQLContext.read.jdbc(
+      urlWithUserAndPass, "TEST.NULLTYPES", new Properties).collect()
     assert((0 to 14).forall(i => rows(0).isNullAt(i)))
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
index 0800eded443de..2e4c12f9da80c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
@@ -22,7 +22,7 @@ import java.util.Properties
 
 import org.scalatest.{BeforeAndAfter, FunSuite}
 
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.{SaveMode, Row}
 import org.apache.spark.sql.test._
 import org.apache.spark.sql.types._
 
@@ -90,64 +90,66 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter {
   test("Basic CREATE") {
     val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2)
 
-    df.createJDBCTable(url, "TEST.BASICCREATETEST", false)
-    assert(2 == TestSQLContext.jdbc(url, "TEST.BASICCREATETEST").count)
-    assert(2 == TestSQLContext.jdbc(url, "TEST.BASICCREATETEST").collect()(0).length)
+    df.write.jdbc(url, "TEST.BASICCREATETEST", new Properties)
+    assert(2 == TestSQLContext.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).count)
+    assert(2 ==
+      TestSQLContext.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).collect()(0).length)
   }
 
   test("CREATE with overwrite") {
     val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3)
     val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2)
 
-    df.createJDBCTable(url1, "TEST.DROPTEST", false, properties)
-    assert(2 == TestSQLContext.jdbc(url1, "TEST.DROPTEST", properties).count)
-    assert(3 == TestSQLContext.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length)
+    df.write.jdbc(url1, "TEST.DROPTEST", properties)
+    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.DROPTEST", properties).count)
+    assert(3 == TestSQLContext.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length)
 
-    df2.createJDBCTable(url1, "TEST.DROPTEST", true, properties)
-    assert(1 == TestSQLContext.jdbc(url1, "TEST.DROPTEST", properties).count)
-    assert(2 == TestSQLContext.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length)
+    df2.write.mode(SaveMode.Overwrite).jdbc(url1, "TEST.DROPTEST", properties)
+    assert(1 == TestSQLContext.read.jdbc(url1, "TEST.DROPTEST", properties).count)
+    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length)
   }
 
   test("CREATE then INSERT to append") {
     val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2)
     val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2)
 
-    df.createJDBCTable(url, "TEST.APPENDTEST", false)
-    df2.insertIntoJDBC(url, "TEST.APPENDTEST", false)
-    assert(3 == TestSQLContext.jdbc(url, "TEST.APPENDTEST").count)
-    assert(2 == TestSQLContext.jdbc(url, "TEST.APPENDTEST").collect()(0).length)
+    df.write.jdbc(url, "TEST.APPENDTEST", new Properties)
+    df2.write.mode(SaveMode.Append).jdbc(url, "TEST.APPENDTEST", new Properties)
+    assert(3 == TestSQLContext.read.jdbc(url, "TEST.APPENDTEST", new Properties).count)
+    assert(2 ==
+      TestSQLContext.read.jdbc(url, "TEST.APPENDTEST", new Properties).collect()(0).length)
   }
 
   test("CREATE then INSERT to truncate") {
     val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2)
     val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2)
 
-    df.createJDBCTable(url1, "TEST.TRUNCATETEST", false, properties)
-    df2.insertIntoJDBC(url1, "TEST.TRUNCATETEST", true, properties)
-    assert(1 == TestSQLContext.jdbc(url1, "TEST.TRUNCATETEST", properties).count)
-    assert(2 == TestSQLContext.jdbc(url1, "TEST.TRUNCATETEST", properties).collect()(0).length)
+    df.write.jdbc(url1, "TEST.TRUNCATETEST", properties)
+    df2.write.mode(SaveMode.Overwrite).jdbc(url1, "TEST.TRUNCATETEST", properties)
+    assert(1 == TestSQLContext.read.jdbc(url1, "TEST.TRUNCATETEST", properties).count)
+    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.TRUNCATETEST", properties).collect()(0).length)
   }
 
   test("Incompatible INSERT to append") {
     val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2)
     val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3)
 
-    df.createJDBCTable(url, "TEST.INCOMPATIBLETEST", false)
+    df.write.jdbc(url, "TEST.INCOMPATIBLETEST", new Properties)
     intercept[org.apache.spark.SparkException] {
-      df2.insertIntoJDBC(url, "TEST.INCOMPATIBLETEST", true)
+      df2.write.mode(SaveMode.Append).jdbc(url, "TEST.INCOMPATIBLETEST", new Properties)
     }
   }
-  
+
   test("INSERT to JDBC Datasource") {
     TestSQLContext.sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE")
-    assert(2 == TestSQLContext.jdbc(url1, "TEST.PEOPLE1", properties).count)
-    assert(2 == TestSQLContext.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
+    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).count)
+    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
   }
-  
+
   test("INSERT to JDBC Datasource with overwrite") {
     TestSQLContext.sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE")
     TestSQLContext.sql("INSERT OVERWRITE TABLE PEOPLE1 SELECT * FROM PEOPLE")
-    assert(2 == TestSQLContext.jdbc(url1, "TEST.PEOPLE1", properties).count)
-    assert(2 == TestSQLContext.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
+    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).count)
+    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
   } 
 }
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java
index 53ddecf57958b..58fe96adab17e 100644
--- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java
+++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java
@@ -81,7 +81,7 @@ public void setUp() throws IOException {
       jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}");
     }
     JavaRDD<String> rdd = sc.parallelize(jsonObjects);
-    df = sqlContext.jsonRDD(rdd);
+    df = sqlContext.read().json(rdd);
     df.registerTempTable("jsonTable");
   }
 
@@ -96,7 +96,11 @@ public void tearDown() throws IOException {
   public void saveExternalTableAndQueryIt() {
     Map<String, String> options = new HashMap<String, String>();
     options.put("path", path.toString());
-    df.saveAsTable("javaSavedTable", "org.apache.spark.sql.json", SaveMode.Append, options);
+    df.write()
+      .format("org.apache.spark.sql.json")
+      .mode(SaveMode.Append)
+      .options(options)
+      .saveAsTable("javaSavedTable");
 
     checkAnswer(
       sqlContext.sql("SELECT * FROM javaSavedTable"),
@@ -115,7 +119,11 @@ public void saveExternalTableAndQueryIt() {
   public void saveExternalTableWithSchemaAndQueryIt() {
     Map<String, String> options = new HashMap<String, String>();
     options.put("path", path.toString());
-    df.saveAsTable("javaSavedTable", "org.apache.spark.sql.json", SaveMode.Append, options);
+    df.write()
+      .format("org.apache.spark.sql.json")
+      .mode(SaveMode.Append)
+      .options(options)
+      .saveAsTable("javaSavedTable");
 
     checkAnswer(
       sqlContext.sql("SELECT * FROM javaSavedTable"),
@@ -138,7 +146,11 @@ public void saveExternalTableWithSchemaAndQueryIt() {
   @Test
   public void saveTableAndQueryIt() {
     Map<String, String> options = new HashMap<String, String>();
-    df.saveAsTable("javaSavedTable", "org.apache.spark.sql.json", SaveMode.Append, options);
+    df.write()
+      .format("org.apache.spark.sql.json")
+      .mode(SaveMode.Append)
+      .options(options)
+      .saveAsTable("javaSavedTable");
 
     checkAnswer(
       sqlContext.sql("SELECT * FROM javaSavedTable"),
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
index fc6c3c35037b0..945596db80326 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
@@ -162,7 +162,7 @@ class CachedTableSuite extends QueryTest {
   test("REFRESH TABLE also needs to recache the data (data source tables)") {
     val tempPath: File = Utils.createTempDir()
     tempPath.delete()
-    table("src").save(tempPath.toString, "parquet", SaveMode.Overwrite)
+    table("src").write.mode(SaveMode.Overwrite).parquet(tempPath.toString)
     sql("DROP TABLE IF EXISTS refreshTable")
     createExternalTable("refreshTable", tempPath.toString, "parquet")
     checkAnswer(
@@ -172,7 +172,7 @@ class CachedTableSuite extends QueryTest {
     sql("CACHE TABLE refreshTable")
     assertCached(table("refreshTable"))
     // Append new data.
-    table("src").save(tempPath.toString, "parquet", SaveMode.Append)
+    table("src").write.mode(SaveMode.Append).parquet(tempPath.toString)
     // We are still using the old data.
     assertCached(table("refreshTable"))
     checkAnswer(
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 58b0b80c31e2e..30db976a3ae74 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -409,11 +409,11 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     val originalDefaultSource = conf.defaultDataSourceName
 
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    val df = jsonRDD(rdd)
+    val df = read.json(rdd)
 
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
     // Save the df as a managed table (by not specifiying the path).
-    df.saveAsTable("savedJsonTable")
+    df.write.saveAsTable("savedJsonTable")
 
     checkAnswer(
       sql("SELECT * FROM savedJsonTable where savedJsonTable.a < 5"),
@@ -443,11 +443,11 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     val originalDefaultSource = conf.defaultDataSourceName
 
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    val df = jsonRDD(rdd)
+    val df = read.json(rdd)
 
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
     // Save the df as a managed table (by not specifiying the path).
-    df.saveAsTable("savedJsonTable")
+    df.write.saveAsTable("savedJsonTable")
 
     checkAnswer(
       sql("SELECT * FROM savedJsonTable"),
@@ -455,17 +455,17 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
 
     // Right now, we cannot append to an existing JSON table.
     intercept[RuntimeException] {
-      df.saveAsTable("savedJsonTable", SaveMode.Append)
+      df.write.mode(SaveMode.Append).saveAsTable("savedJsonTable")
     }
 
     // We can overwrite it.
-    df.saveAsTable("savedJsonTable", SaveMode.Overwrite)
+    df.write.mode(SaveMode.Overwrite).saveAsTable("savedJsonTable")
     checkAnswer(
       sql("SELECT * FROM savedJsonTable"),
       df.collect())
 
     // When the save mode is Ignore, we will do nothing when the table already exists.
-    df.select("b").saveAsTable("savedJsonTable", SaveMode.Ignore)
+    df.select("b").write.mode(SaveMode.Ignore).saveAsTable("savedJsonTable")
     assert(df.schema === table("savedJsonTable").schema)
     checkAnswer(
       sql("SELECT * FROM savedJsonTable"),
@@ -479,11 +479,11 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
 
     // Create an external table by specifying the path.
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    df.saveAsTable(
-      "savedJsonTable",
-      "org.apache.spark.sql.json",
-      SaveMode.Append,
-      Map("path" -> tempPath.toString))
+    df.write
+      .format("org.apache.spark.sql.json")
+      .mode(SaveMode.Append)
+      .option("path", tempPath.toString)
+      .saveAsTable("savedJsonTable")
     checkAnswer(
       sql("SELECT * FROM savedJsonTable"),
       df.collect())
@@ -501,14 +501,13 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     val originalDefaultSource = conf.defaultDataSourceName
 
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    val df = jsonRDD(rdd)
+    val df = read.json(rdd)
 
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    df.saveAsTable(
-      "savedJsonTable",
-      "org.apache.spark.sql.json",
-      SaveMode.Append,
-      Map("path" -> tempPath.toString))
+    df.write.format("org.apache.spark.sql.json")
+      .mode(SaveMode.Append)
+      .option("path", tempPath.toString)
+      .saveAsTable("savedJsonTable")
 
     conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
     createExternalTable("createdJsonTable", tempPath.toString)
@@ -566,7 +565,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
       setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
 
       val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-      jsonRDD(rdd).registerTempTable("jt")
+      read.json(rdd).registerTempTable("jt")
       sql(
         """
           |create table test_parquet_ctas STORED AS parquET
@@ -601,7 +600,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
       StructType(
         StructField("a", ArrayType(IntegerType, containsNull = true), nullable = true) :: Nil)
     assert(df1.schema === expectedSchema1)
-    df1.saveAsTable("arrayInParquet", "parquet", SaveMode.Overwrite)
+    df1.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("arrayInParquet")
 
     val df2 =
       createDataFrame(Tuple1(Seq(2, 3)) :: Nil).toDF("a")
@@ -610,10 +609,10 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
         StructField("a", ArrayType(IntegerType, containsNull = false), nullable = true) :: Nil)
     assert(df2.schema === expectedSchema2)
     df2.insertInto("arrayInParquet", overwrite = false)
-    createDataFrame(Tuple1(Seq(4, 5)) :: Nil).toDF("a")
-      .saveAsTable("arrayInParquet", SaveMode.Append) // This one internally calls df2.insertInto.
-    createDataFrame(Tuple1(Seq(Int.box(6), null.asInstanceOf[Integer])) :: Nil).toDF("a")
-      .saveAsTable("arrayInParquet", "parquet", SaveMode.Append)
+    createDataFrame(Tuple1(Seq(4, 5)) :: Nil).toDF("a").write.mode(SaveMode.Append)
+      .saveAsTable("arrayInParquet") // This one internally calls df2.insertInto.
+    createDataFrame(Tuple1(Seq(Int.box(6), null.asInstanceOf[Integer])) :: Nil).toDF("a").write
+      .mode(SaveMode.Append).saveAsTable("arrayInParquet")
     refreshTable("arrayInParquet")
 
     checkAnswer(
@@ -634,7 +633,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
       StructType(
         StructField("a", mapType1, nullable = true) :: Nil)
     assert(df1.schema === expectedSchema1)
-    df1.saveAsTable("mapInParquet", "parquet", SaveMode.Overwrite)
+    df1.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("mapInParquet")
 
     val df2 =
       createDataFrame(Tuple1(Map(2 -> 3)) :: Nil).toDF("a")
@@ -644,10 +643,10 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
         StructField("a", mapType2, nullable = true) :: Nil)
     assert(df2.schema === expectedSchema2)
     df2.insertInto("mapInParquet", overwrite = false)
-    createDataFrame(Tuple1(Map(4 -> 5)) :: Nil).toDF("a")
-      .saveAsTable("mapInParquet", SaveMode.Append) // This one internally calls df2.insertInto.
-    createDataFrame(Tuple1(Map(6 -> null.asInstanceOf[Integer])) :: Nil).toDF("a")
-      .saveAsTable("mapInParquet", "parquet", SaveMode.Append)
+    createDataFrame(Tuple1(Map(4 -> 5)) :: Nil).toDF("a").write.mode(SaveMode.Append)
+      .saveAsTable("mapInParquet") // This one internally calls df2.insertInto.
+    createDataFrame(Tuple1(Map(6 -> null.asInstanceOf[Integer])) :: Nil).toDF("a").write
+      .format("parquet").mode(SaveMode.Append).saveAsTable("mapInParquet")
     refreshTable("mapInParquet")
 
     checkAnswer(
@@ -711,30 +710,30 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     def createDF(from: Int, to: Int): DataFrame =
       createDataFrame((from to to).map(i => Tuple2(i, s"str$i"))).toDF("c1", "c2")
 
-    createDF(0, 9).saveAsTable("insertParquet", "parquet")
+    createDF(0, 9).write.format("parquet").saveAsTable("insertParquet")
     checkAnswer(
       sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"),
       (6 to 9).map(i => Row(i, s"str$i")))
 
     intercept[AnalysisException] {
-      createDF(10, 19).saveAsTable("insertParquet", "parquet")
+      createDF(10, 19).write.format("parquet").saveAsTable("insertParquet")
     }
 
-    createDF(10, 19).saveAsTable("insertParquet", "parquet", SaveMode.Append)
+    createDF(10, 19).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet")
     checkAnswer(
       sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"),
       (6 to 19).map(i => Row(i, s"str$i")))
 
-    createDF(20, 29).saveAsTable("insertParquet", "parquet", SaveMode.Append)
+    createDF(20, 29).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet")
     checkAnswer(
       sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 25"),
       (6 to 24).map(i => Row(i, s"str$i")))
 
     intercept[AnalysisException] {
-      createDF(30, 39).saveAsTable("insertParquet")
+      createDF(30, 39).write.saveAsTable("insertParquet")
     }
 
-    createDF(30, 39).saveAsTable("insertParquet", SaveMode.Append)
+    createDF(30, 39).write.mode(SaveMode.Append).saveAsTable("insertParquet")
     checkAnswer(
       sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 35"),
       (6 to 34).map(i => Row(i, s"str$i")))
@@ -744,11 +743,11 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
       sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 45"),
       (6 to 44).map(i => Row(i, s"str$i")))
 
-    createDF(50, 59).saveAsTable("insertParquet", SaveMode.Overwrite)
+    createDF(50, 59).write.mode(SaveMode.Overwrite).saveAsTable("insertParquet")
     checkAnswer(
       sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 51 AND p.c1 < 55"),
       (52 to 54).map(i => Row(i, s"str$i")))
-    createDF(60, 69).saveAsTable("insertParquet", SaveMode.Ignore)
+    createDF(60, 69).write.mode(SaveMode.Ignore).saveAsTable("insertParquet")
     checkAnswer(
       sql("SELECT p.c1, c2 FROM insertParquet p"),
       (50 to 59).map(i => Row(i, s"str$i")))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala
index 8ad3627504229..3dfa6e72e1242 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.hive.execution
 
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.hive.test.TestHive.{sparkContext, jsonRDD, sql}
+import org.apache.spark.sql.hive.test.TestHive.{read, sparkContext, jsonRDD, sql}
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 
 case class Nested(a: Int, B: Int)
@@ -31,14 +31,14 @@ case class Data(a: Int, B: Int, n: Nested, nestedArray: Seq[Nested])
 class HiveResolutionSuite extends HiveComparisonTest {
 
   test("SPARK-3698: case insensitive test for nested data") {
-    jsonRDD(sparkContext.makeRDD(
+    read.json(sparkContext.makeRDD(
       """{"a": [{"a": {"a": 1}}]}""" :: Nil)).registerTempTable("nested")
     // This should be successfully analyzed
     sql("SELECT a[0].A.A from nested").queryExecution.analyzed
   }
 
   test("SPARK-5278: check ambiguous reference to fields") {
-    jsonRDD(sparkContext.makeRDD(
+    read.json(sparkContext.makeRDD(
       """{"a": [{"b": 1, "B": 2}]}""" :: Nil)).registerTempTable("nested")
 
     // there are 2 filed matching field name "b", we should report Ambiguous reference error
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index dfe73c62c42b9..ca2c4b4019c55 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -535,14 +535,14 @@ class SQLQuerySuite extends QueryTest {
 
   test("SPARK-4296 Grouping field with Hive UDF as sub expression") {
     val rdd = sparkContext.makeRDD( """{"a": "str", "b":"1", "c":"1970-01-01 00:00:00"}""" :: Nil)
-    jsonRDD(rdd).registerTempTable("data")
+    read.json(rdd).registerTempTable("data")
     checkAnswer(
       sql("SELECT concat(a, '-', b), year(c) FROM data GROUP BY concat(a, '-', b), year(c)"),
       Row("str-1", 1970))
 
     dropTempTable("data")
 
-    jsonRDD(rdd).registerTempTable("data")
+    read.json(rdd).registerTempTable("data")
     checkAnswer(sql("SELECT year(c) + 1 FROM data GROUP BY year(c) + 1"), Row(1971))
 
     dropTempTable("data")
@@ -550,7 +550,7 @@ class SQLQuerySuite extends QueryTest {
 
   test("resolve udtf with single alias") {
     val rdd = sparkContext.makeRDD((1 to 5).map(i => s"""{"a":[$i, ${i + 1}]}"""))
-    jsonRDD(rdd).registerTempTable("data")
+    read.json(rdd).registerTempTable("data")
     val df = sql("SELECT explode(a) AS val FROM data")
     val col = df("val")
   }
@@ -563,7 +563,7 @@ class SQLQuerySuite extends QueryTest {
     // PreInsertionCasts will actually start to work before ImplicitGenerate and then
     // generates an invalid query plan.
     val rdd = sparkContext.makeRDD((1 to 5).map(i => s"""{"a":[$i, ${i + 1}]}"""))
-    jsonRDD(rdd).registerTempTable("data")
+    read.json(rdd).registerTempTable("data")
     val originalConf = getConf("spark.sql.hive.convertCTAS", "false")
     setConf("spark.sql.hive.convertCTAS", "false")
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index a0075f1e44ca8..05d99983b6a63 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -150,9 +150,9 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
     }
 
     val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}"""))
-    jsonRDD(rdd1).registerTempTable("jt")
+    read.json(rdd1).registerTempTable("jt")
     val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":[$i, null]}"""))
-    jsonRDD(rdd2).registerTempTable("jt_array")
+    read.json(rdd2).registerTempTable("jt_array")
 
     setConf("spark.sql.hive.convertMetastoreParquet", "true")
   }
@@ -617,16 +617,16 @@ class ParquetSourceSuiteBase extends ParquetPartitioningTest {
     sql("drop table if exists spark_6016_fix")
 
     // Create a DataFrame with two partitions. So, the created table will have two parquet files.
-    val df1 = jsonRDD(sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i}"""), 2))
-    df1.saveAsTable("spark_6016_fix", "parquet", SaveMode.Overwrite)
+    val df1 = read.json(sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i}"""), 2))
+    df1.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("spark_6016_fix")
     checkAnswer(
       sql("select * from spark_6016_fix"),
       (1 to 10).map(i => Row(i))
     )
 
     // Create a DataFrame with four partitions. So, the created table will have four parquet files.
-    val df2 = jsonRDD(sparkContext.parallelize((1 to 10).map(i => s"""{"b":$i}"""), 4))
-    df2.saveAsTable("spark_6016_fix", "parquet", SaveMode.Overwrite)
+    val df2 = read.json(sparkContext.parallelize((1 to 10).map(i => s"""{"b":$i}"""), 4))
+    df2.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("spark_6016_fix")
     // For the bug of SPARK-6016, we are caching two outdated footers for df1. Then,
     // since the new table has four parquet files, we are trying to read new footers from two files
     // and then merge metadata in footers of these four (two outdated ones and two latest one),
@@ -663,7 +663,7 @@ class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {
         StructField("a", arrayType1, nullable = true) :: Nil)
     assert(df.schema === expectedSchema1)
 
-    df.saveAsTable("alwaysNullable", "parquet")
+    df.write.format("parquet").saveAsTable("alwaysNullable")
 
     val mapType2 = MapType(IntegerType, IntegerType, valueContainsNull = true)
     val arrayType2 = ArrayType(IntegerType, containsNull = true)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index f44b3c521e647..9d9b436cabe3c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -120,10 +120,7 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
   test("save()/load() - non-partitioned table - ErrorIfExists") {
     withTempDir { file =>
       intercept[RuntimeException] {
-        testDF.save(
-          path = file.getCanonicalPath,
-          source = dataSourceName,
-          mode = SaveMode.ErrorIfExists)
+        testDF.write.format(dataSourceName).mode(SaveMode.ErrorIfExists).save(file.getCanonicalPath)
       }
     }
   }
@@ -233,10 +230,8 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
 
   test("save()/load() - partitioned table - Ignore") {
     withTempDir { file =>
-      partitionedTestDF.save(
-        path = file.getCanonicalPath,
-        source = dataSourceName,
-        mode = SaveMode.Ignore)
+      partitionedTestDF.write
+        .format(dataSourceName).mode(SaveMode.Ignore).save(file.getCanonicalPath)
 
       val path = new Path(file.getCanonicalPath)
       val fs = path.getFileSystem(SparkHadoopUtil.get.conf)
@@ -249,11 +244,9 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
   }
 
   test("saveAsTable()/load() - non-partitioned table - Overwrite") {
-    testDF.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Overwrite,
-      Map("dataSchema" -> dataSchema.json))
+    testDF.write.format(dataSourceName).mode(SaveMode.Overwrite)
+      .option("dataSchema", dataSchema.json)
+      .saveAsTable("t")
 
     withTable("t") {
       checkAnswer(table("t"), testDF.collect())
@@ -261,15 +254,8 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
   }
 
   test("saveAsTable()/load() - non-partitioned table - Append") {
-    testDF.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Overwrite)
-
-    testDF.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Append)
+    testDF.write.format(dataSourceName).mode(SaveMode.Overwrite).saveAsTable("t")
+    testDF.write.format(dataSourceName).mode(SaveMode.Append).saveAsTable("t")
 
     withTable("t") {
       checkAnswer(table("t"), testDF.unionAll(testDF).orderBy("a").collect())
@@ -281,10 +267,7 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
 
     withTempTable("t") {
       intercept[AnalysisException] {
-        testDF.saveAsTable(
-          tableName = "t",
-          source = dataSourceName,
-          mode = SaveMode.ErrorIfExists)
+        testDF.write.format(dataSourceName).mode(SaveMode.ErrorIfExists).saveAsTable("t")
       }
     }
   }
@@ -293,21 +276,16 @@ class HadoopFsRelationTest extends QueryTest with ParquetTest {
     Seq.empty[(Int, String)].toDF().registerTempTable("t")
 
     withTempTable("t") {
-      testDF.saveAsTable(
-        tableName = "t",
-        source = dataSourceName,
-        mode = SaveMode.Ignore)
-
+      testDF.write.format(dataSourceName).mode(SaveMode.Ignore).saveAsTable("t")
       assert(table("t").collect().isEmpty)
     }
   }
 
   test("saveAsTable()/load() - partitioned table - simple queries") {
-    partitionedTestDF.saveAsTable(
-      tableName = "t",
-      source = dataSourceName,
-      mode = SaveMode.Overwrite,
-      Map("dataSchema" -> dataSchema.json))
+    partitionedTestDF.write.format(dataSourceName)
+      .mode(SaveMode.Overwrite)
+      .option("dataSchema", dataSchema.json)
+      .saveAsTable("t")
 
     withTable("t") {
       checkQueries(table("t"))
@@ -492,11 +470,9 @@ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
         StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))
 
       checkQueries(
-        load(
-          source = dataSourceName,
-          options = Map(
-            "path" -> file.getCanonicalPath,
-            "dataSchema" -> dataSchemaWithPartition.json)))
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchemaWithPartition.json)
+          .load(file.getCanonicalPath))
     }
   }
 }
@@ -518,18 +494,16 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
         sparkContext
           .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1))
           .toDF("a", "b", "p1")
-          .saveAsParquetFile(partitionDir.toString)
+          .write.parquet(partitionDir.toString)
       }
 
       val dataSchemaWithPartition =
         StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))
 
       checkQueries(
-        load(
-          source = dataSourceName,
-          options = Map(
-            "path" -> file.getCanonicalPath,
-            "dataSchema" -> dataSchemaWithPartition.json)))
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchemaWithPartition.json)
+          .load(file.getCanonicalPath))
     }
   }
 }

From ba4f8ca0d9ccc0a39a8a0105541d0cc1f4912d62 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sat, 16 May 2015 23:20:09 -0700
Subject: [PATCH 028/525] [MINOR] [SQL] Removes an unreachable case clause

This case clause is already covered by the one above, and generates a compilation warning.

Author: Cheng Lian <lian@databricks.com>

Closes #6214 from liancheng/remove-unreachable-code and squashes the following commits:

c38ca7c [Cheng Lian] Removes an unreachable case clause
---
 sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
index 1eacdde7413f1..ab33125b74c17 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
@@ -101,7 +101,6 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan =>
           }
         }
 
-      case logical.InsertIntoTable(LogicalRelation(_: InsertableRelation), _, _, _, _) => // OK
       case logical.InsertIntoTable(LogicalRelation(_: HadoopFsRelation), _, _, _, _) => // OK
       case logical.InsertIntoTable(l: LogicalRelation, _, _, _, _) =>
         // The relation in l is not an InsertableRelation.

From 1a7b9ce80bb5649796dda48d6a6d662a2809d0ef Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Sun, 17 May 2015 00:12:20 -0700
Subject: [PATCH 029/525] [MINOR] Add 1.3, 1.3.1 to master branch EC2 scripts

cc pwendell

P.S: I can't believe this was outdated all along ?

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6215 from shivaram/update-ec2-map and squashes the following commits:

ae3937a [Shivaram Venkataraman] Add 1.3, 1.3.1 to master branch EC2 scripts
---
 ec2/spark_ec2.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index ab4a96f232c13..be92d5f45aa77 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -48,7 +48,7 @@
     from urllib.request import urlopen, Request
     from urllib.error import HTTPError
 
-SPARK_EC2_VERSION = "1.2.1"
+SPARK_EC2_VERSION = "1.3.1"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
 
 VALID_SPARK_VERSIONS = set([
@@ -65,6 +65,8 @@
     "1.1.1",
     "1.2.0",
     "1.2.1",
+    "1.3.0",
+    "1.3.1",
 ])
 
 SPARK_TACHYON_MAP = {
@@ -75,6 +77,8 @@
     "1.1.1": "0.5.0",
     "1.2.0": "0.5.0",
     "1.2.1": "0.5.0",
+    "1.3.0": "0.5.0",
+    "1.3.1": "0.5.0",
 }
 
 DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION

From edf09ea1bd4bf7692e0085ad9c70cb1bfc8d06d8 Mon Sep 17 00:00:00 2001
From: scwf <wangfei1@huawei.com>
Date: Sun, 17 May 2015 15:17:11 +0800
Subject: [PATCH 030/525] [SQL] [MINOR] Skip unresolved expression for
 InConversion

Author: scwf <wangfei1@huawei.com>

Closes #6145 from scwf/InConversion and squashes the following commits:

5c8ac6b [scwf] minir fix for InConversion
---
 .../apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala  | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index fe0d3f29977c3..b45b17d856fac 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -296,6 +296,9 @@ trait HiveTypeCoercion {
    */
   object InConversion extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+      // Skip nodes who's children have not been resolved yet.
+      case e if !e.childrenResolved => e 
+      
       case i @ In(a, b) if b.exists(_.dataType != a.dataType) =>
         i.makeCopy(Array(a, b.map(Cast(_, a.dataType))))
     }

From 339905578790fa37fcad9684b859b443313a5aa2 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Sun, 17 May 2015 15:42:21 +0800
Subject: [PATCH 031/525] [SPARK-7447] [SQL] Don't re-merge Parquet schema when
 the relation is deserialized

JIRA: https://issues.apache.org/jira/browse/SPARK-7447

`MetadataCache` in `ParquetRelation2` is annotated as `transient`. When `ParquetRelation2` is deserialized, we ask `MetadataCache` to refresh and perform schema merging again. It is time-consuming especially for very many parquet files.

With the new `FSBasedParquetRelation`, although `MetadataCache` is not `transient` now, `MetadataCache.refresh()` still performs schema merging again when the relation is deserialized.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6012 from viirya/without_remerge_schema and squashes the following commits:

2663957 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema
6ac7d93 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema
b0fc09b [Liang-Chi Hsieh] Don't generate and merge parquetSchema multiple times.
---
 .../apache/spark/sql/parquet/newParquet.scala | 32 +++++++++++--------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 946062f6ea64e..bcbdb1ebd236a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -340,7 +340,7 @@ private[sql] class ParquetRelation2(
 
     // Schema of the actual Parquet files, without partition columns discovered from partition
     // directory paths.
-    var dataSchema: StructType = _
+    var dataSchema: StructType = null
 
     // Schema of the whole table, including partition columns.
     var schema: StructType = _
@@ -379,19 +379,23 @@ private[sql] class ParquetRelation2(
         f -> new Footer(f.getPath, parquetMetadata)
       }.seq.toMap
 
-      dataSchema = {
-        val dataSchema0 =
-          maybeDataSchema
-            .orElse(readSchema())
-            .orElse(maybeMetastoreSchema)
-            .getOrElse(sys.error("Failed to get the schema."))
-
-        // If this Parquet relation is converted from a Hive Metastore table, must reconcile case
-        // case insensitivity issue and possible schema mismatch (probably caused by schema
-        // evolution).
-        maybeMetastoreSchema
-          .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
-          .getOrElse(dataSchema0)
+      // If we already get the schema, don't need to re-compute it since the schema merging is
+      // time-consuming.
+      if (dataSchema == null) {
+        dataSchema = {
+          val dataSchema0 =
+            maybeDataSchema
+              .orElse(readSchema())
+              .orElse(maybeMetastoreSchema)
+              .getOrElse(sys.error("Failed to get the schema."))
+        
+          // If this Parquet relation is converted from a Hive Metastore table, must reconcile case
+          // case insensitivity issue and possible schema mismatch (probably caused by schema
+          // evolution).
+          maybeMetastoreSchema
+            .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
+            .getOrElse(dataSchema0)
+        }
       }
     }
 

From 50217667cc1239ed3b15f4d10907b727ed85d7fa Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Sun, 17 May 2015 17:03:11 +0100
Subject: [PATCH 032/525] =?UTF-8?q?[SPARK-7669]=20Builds=20against=20Hadoo?=
 =?UTF-8?q?p=202.6+=20get=20inconsistent=20curator=20depend=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds a new profile, `hadoop-2.6`, copying over the hadoop-2.4 properties, updating ZK to 3.4.6 and making the curator version a configurable option. That keeps the curator-recipes JAR in sync with that used in hadoop.

There's one more option to consider: making the full curator-client version explicit with its own dependency version. This will pin down the version from hadoop and hive imports

Author: Steve Loughran <stevel@hortonworks.com>

Closes #6191 from steveloughran/stevel/SPARK-7669-hadoop-2.6 and squashes the following commits:

e3e281a [Steve Loughran] SPARK-7669 declare the version of curator-client and curator-framework JARs
2901ea9 [Steve Loughran] SPARK-7669 Builds against Hadoop 2.6+ get inconsistent curator dependencies
---
 pom.xml | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/pom.xml b/pom.xml
index 1b45cdb67012a..6768a039d11e0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -130,6 +130,7 @@
     <hbase.artifact>hbase</hbase.artifact>
     <flume.version>1.4.0</flume.version>
     <zookeeper.version>3.4.5</zookeeper.version>
+    <curator.version>2.4.0</curator.version>
     <hive.group>org.spark-project.hive</hive.group>
     <!-- Version used in Maven Hive dependency -->
     <hive.version>0.13.1a</hive.version>
@@ -707,7 +708,7 @@
       <dependency>
         <groupId>org.apache.curator</groupId>
         <artifactId>curator-recipes</artifactId>
-        <version>2.4.0</version>
+        <version>${curator.version}</version>
         <scope>${hadoop.deps.scope}</scope>
         <exclusions>
           <exclusion>
@@ -716,6 +717,16 @@
           </exclusion>
         </exclusions>
       </dependency>
+      <dependency>
+        <groupId>org.apache.curator</groupId>
+        <artifactId>curator-client</artifactId>
+        <version>${curator.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.curator</groupId>
+        <artifactId>curator-framework</artifactId>
+        <version>${curator.version}</version>
+      </dependency>
       <dependency>
         <groupId>org.apache.hadoop</groupId>
         <artifactId>hadoop-client</artifactId>
@@ -1679,6 +1690,17 @@
       </properties>
     </profile>
 
+    <profile>
+      <id>hadoop-2.6</id>
+      <properties>
+        <hadoop.version>2.6.0</hadoop.version>
+        <jets3t.version>0.9.3</jets3t.version>
+        <commons.math3.version>3.1.1</commons.math3.version>
+        <zookeeper.version>3.4.6</zookeeper.version>
+        <curator.version>2.6.0</curator.version>
+      </properties>
+    </profile>
+
     <profile>
       <id>yarn</id>
       <modules>
@@ -1709,7 +1731,7 @@
         <dependency>
           <groupId>org.apache.curator</groupId>
           <artifactId>curator-recipes</artifactId>
-          <version>2.4.0</version>
+          <version>${curator.version}</version>
           <exclusions>
             <exclusion>
               <groupId>org.apache.zookeeper</groupId>

From f2cc6b5bccc3a70fd7d69183b1a068800831fe19 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sun, 17 May 2015 09:30:49 -0700
Subject: [PATCH 033/525] [SPARK-7660] Wrap SnappyOutputStream to work around
 snappy-java bug

This patch wraps `SnappyOutputStream` to ensure that `close()` is idempotent and to guard against write-after-`close()` bugs. This is a workaround for https://github.com/xerial/snappy-java/issues/107, a bug where a non-idempotent `close()` method can lead to stream corruption. We can remove this workaround if we upgrade to a snappy-java version that contains my fix for this bug, but in the meantime this patch offers a backportable Spark fix.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6176 from JoshRosen/SPARK-7660-wrap-snappy and squashes the following commits:

8b77aae [Josh Rosen] Wrap SnappyOutputStream to fix SPARK-7660
---
 .../apache/spark/io/CompressionCodec.scala    | 49 ++++++++++++++++++-
 .../unsafe/UnsafeShuffleWriterSuite.java      |  8 ---
 2 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
index 0756cdb2ed8e6..0d8ac1f80a9f4 100644
--- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
+++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.io
 
-import java.io.{InputStream, OutputStream}
+import java.io.{IOException, InputStream, OutputStream}
 
 import com.ning.compress.lzf.{LZFInputStream, LZFOutputStream}
 import net.jpountz.lz4.{LZ4BlockInputStream, LZ4BlockOutputStream}
@@ -154,8 +154,53 @@ class SnappyCompressionCodec(conf: SparkConf) extends CompressionCodec {
 
   override def compressedOutputStream(s: OutputStream): OutputStream = {
     val blockSize = conf.getSizeAsBytes("spark.io.compression.snappy.blockSize", "32k").toInt
-    new SnappyOutputStream(s, blockSize)
+    new SnappyOutputStreamWrapper(new SnappyOutputStream(s, blockSize))
   }
 
   override def compressedInputStream(s: InputStream): InputStream = new SnappyInputStream(s)
 }
+
+/**
+ * Wrapper over [[SnappyOutputStream]] which guards against write-after-close and double-close
+ * issues. See SPARK-7660 for more details. This wrapping can be removed if we upgrade to a version
+ * of snappy-java that contains the fix for https://github.com/xerial/snappy-java/issues/107.
+ */
+private final class SnappyOutputStreamWrapper(os: SnappyOutputStream) extends OutputStream {
+
+  private[this] var closed: Boolean = false
+
+  override def write(b: Int): Unit = {
+    if (closed) {
+      throw new IOException("Stream is closed")
+    }
+    os.write(b)
+  }
+
+  override def write(b: Array[Byte]): Unit = {
+    if (closed) {
+      throw new IOException("Stream is closed")
+    }
+    os.write(b)
+  }
+
+  override def write(b: Array[Byte], off: Int, len: Int): Unit = {
+    if (closed) {
+      throw new IOException("Stream is closed")
+    }
+    os.write(b, off, len)
+  }
+
+  override def flush(): Unit = {
+    if (closed) {
+      throw new IOException("Stream is closed")
+    }
+    os.flush()
+  }
+
+  override def close(): Unit = {
+    if (!closed) {
+      closed = true
+      os.close()
+    }
+  }
+}
diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 78e52643531e0..730d265c87f88 100644
--- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -35,7 +35,6 @@
 import org.mockito.MockitoAnnotations;
 import org.mockito.invocation.InvocationOnMock;
 import org.mockito.stubbing.Answer;
-import org.xerial.snappy.buffer.CachedBufferAllocator;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.lessThan;
@@ -97,13 +96,6 @@ public OutputStream apply(OutputStream stream) {
   @After
   public void tearDown() {
     Utils.deleteRecursively(tempDir);
-    // This call is a workaround for SPARK-7660, a snappy-java bug which is exposed by this test
-    // suite. Clearing the cached buffer allocator's pool of reusable buffers masks this bug,
-    // preventing a test failure in JavaAPISuite that would otherwise occur. The underlying bug
-    // needs to be fixed, but in the meantime this workaround avoids spurious Jenkins failures.
-    synchronized (CachedBufferAllocator.class) {
-      CachedBufferAllocator.queueTable.clear();
-    }
     final long leakedMemory = taskMemoryManager.cleanUpAllAllocatedMemory();
     if (leakedMemory != 0) {
       fail("Test leaked " + leakedMemory + " bytes of managed memory");

From 564562874f589c4c8bcabcd9d6eb9a6b0eada938 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sun, 17 May 2015 11:59:28 -0700
Subject: [PATCH 034/525] [SPARK-7686] [SQL] DescribeCommand is assigned wrong
 output attributes in SparkStrategies

In `SparkStrategies`, `RunnableDescribeCommand` is called with the output attributes of the table being described rather than the attributes for the `describe` command's output.  I discovered this issue because it caused type conversion errors in some UnsafeRow conversion code that I'm writing.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6217 from JoshRosen/SPARK-7686 and squashes the following commits:

953a344 [Josh Rosen] Fix SPARK-7686 with a simple change in SparkStrategies.
a4eec9f [Josh Rosen] Add failing regression test for SPARK-7686
---
 .../org/apache/spark/sql/execution/SparkStrategies.scala    | 4 ++--
 .../scala/org/apache/spark/sql/sources/DDLTestSuite.scala   | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index af0029cb84f9a..3f6a0345bc17d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -354,10 +354,10 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case c: CreateTableUsingAsSelect if !c.temporary =>
         sys.error("Tables created with SQLContext must be TEMPORARY. Use a HiveContext instead.")
 
-      case LogicalDescribeCommand(table, isExtended) =>
+      case describe @ LogicalDescribeCommand(table, isExtended) =>
         val resultPlan = self.sqlContext.executePlan(table).executedPlan
         ExecutedCommand(
-          RunnableDescribeCommand(resultPlan, resultPlan.output, isExtended)) :: Nil
+          RunnableDescribeCommand(resultPlan, describe.output, isExtended)) :: Nil
 
       case _ => Nil
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
index 6664e8d64c13a..f5106f67a08df 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
@@ -99,4 +99,10 @@ class DDLTestSuite extends DataSourceTest {
         Row("arrayType", "array<string>", ""),
         Row("structType", "struct<f1:string,f2:int>", "")
       ))
+
+  test("SPARK-7686 DescribeCommand should have correct physical plan output attributes") {
+    val attributes = sql("describe ddlPeople").queryExecution.executedPlan.output
+    assert(attributes.map(_.name) === Seq("col_name", "data_type", "comment"))
+    assert(attributes.map(_.dataType).toSet === Set(StringType))
+  }
 }

From 2ca60ace8f42cf0bd4569d86c86c37a8a2b6a37c Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Sun, 17 May 2015 12:43:15 -0700
Subject: [PATCH 035/525] [SPARK-7491] [SQL] Allow configuration of classloader
 isolation for hive

Author: Michael Armbrust <michael@databricks.com>

Closes #6167 from marmbrus/configureIsolation and squashes the following commits:

6147cbe [Michael Armbrust] filter other conf
22cc3bc7 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into configureIsolation
07476ee [Michael Armbrust] filter empty prefixes
dfdf19c [Michael Armbrust] [SPARK-6906][SQL] Allow configuration of classloader isolation for hive
---
 .../apache/spark/sql/hive/HiveContext.scala   | 33 +++++++++++++++++--
 .../hive/client/IsolatedClientLoader.scala    | 14 ++++----
 .../apache/spark/sql/hive/test/TestHive.scala |  9 ++++-
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 9d98c36e947a1..2733ebdb95bca 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -122,6 +122,29 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
   protected[hive] def hiveMetastoreJars: String =
     getConf(HIVE_METASTORE_JARS, "builtin")
 
+  /**
+   * A comma separated list of class prefixes that should be loaded using the classloader that
+   * is shared between Spark SQL and a specific version of Hive. An example of classes that should
+   * be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need
+   * to be shared are those that interact with classes that are already shared.  For example,
+   * custom appenders that are used by log4j.
+   */
+  protected[hive] def hiveMetastoreSharedPrefixes: Seq[String] =
+    getConf("spark.sql.hive.metastore.sharedPrefixes", jdbcPrefixes)
+      .split(",").filterNot(_ == "")
+
+  private def jdbcPrefixes = Seq(
+    "com.mysql.jdbc", "org.postgresql", "com.microsoft.sqlserver", "oracle.jdbc").mkString(",")
+
+  /**
+   * A comma separated list of class prefixes that should explicitly be reloaded for each version
+   * of Hive that Spark SQL is communicating with.  For example, Hive UDFs that are declared in a
+   * prefix that typically would be shared (i.e. org.apache.spark.*)
+   */
+  protected[hive] def hiveMetastoreBarrierPrefixes: Seq[String] =
+    getConf("spark.sql.hive.metastore.barrierPrefixes", "")
+      .split(",").filterNot(_ == "")
+
   @transient
   protected[sql] lazy val substitutor = new VariableSubstitution()
 
@@ -179,12 +202,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
         version = metaVersion,
         execJars = jars.toSeq,
         config = allConfig,
-        isolationOn = true)
+        isolationOn = true,
+        barrierPrefixes = hiveMetastoreBarrierPrefixes,
+        sharedPrefixes = hiveMetastoreSharedPrefixes)
     } else if (hiveMetastoreJars == "maven") {
       // TODO: Support for loading the jars from an already downloaded location.
       logInfo(
         s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using maven.")
-      IsolatedClientLoader.forVersion(hiveMetastoreVersion, allConfig )
+      IsolatedClientLoader.forVersion(hiveMetastoreVersion, allConfig)
     } else {
       // Convert to files and expand any directories.
       val jars =
@@ -210,7 +235,9 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
         version = metaVersion,
         execJars = jars.toSeq,
         config = allConfig,
-        isolationOn = true)
+        isolationOn = true,
+        barrierPrefixes = hiveMetastoreBarrierPrefixes,
+        sharedPrefixes = hiveMetastoreSharedPrefixes)
     }
     isolatedLoader.client
   }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index 7f94c93ba49c1..196a3d836cab2 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -56,8 +56,7 @@ private[hive] object IsolatedClientLoader {
         (if (version.hasBuiltinsJar) "hive-builtins" :: Nil else Nil))
         .map(a => s"org.apache.hive:$a:${version.fullVersion}") :+
         "com.google.guava:guava:14.0.1" :+
-        "org.apache.hadoop:hadoop-client:2.4.0" :+
-        "mysql:mysql-connector-java:5.1.12"
+        "org.apache.hadoop:hadoop-client:2.4.0"
 
     val classpath = quietly {
       SparkSubmitUtils.resolveMavenCoordinates(
@@ -106,7 +105,9 @@ private[hive] class IsolatedClientLoader(
     val config: Map[String, String] = Map.empty,
     val isolationOn: Boolean = true,
     val rootClassLoader: ClassLoader = ClassLoader.getSystemClassLoader.getParent.getParent,
-    val baseClassLoader: ClassLoader = Thread.currentThread().getContextClassLoader)
+    val baseClassLoader: ClassLoader = Thread.currentThread().getContextClassLoader,
+    val sharedPrefixes: Seq[String] = Seq.empty,
+    val barrierPrefixes: Seq[String] = Seq.empty)
   extends Logging {
 
   // Check to make sure that the root classloader does not know about Hive.
@@ -122,13 +123,14 @@ private[hive] class IsolatedClientLoader(
     name.startsWith("scala.") ||
     name.startsWith("com.google") ||
     name.startsWith("java.lang.") ||
-    name.startsWith("java.net")
+    name.startsWith("java.net") ||
+    sharedPrefixes.exists(name.startsWith)
 
   /** True if `name` refers to a spark class that must see specific version of Hive. */
   protected def isBarrierClass(name: String): Boolean =
-    name.startsWith("org.apache.spark.sql.hive.execution.PairSerDe") ||
     name.startsWith(classOf[ClientWrapper].getName) ||
-    name.startsWith(classOf[ReflectionMagic].getName)
+    name.startsWith(classOf[ReflectionMagic].getName) ||
+    barrierPrefixes.exists(name.startsWith)
 
   protected def classToPath(name: String): String =
     name.replaceAll("\\.", "/") + ".class"
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 1598d4bd47550..964828407481e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -48,7 +48,14 @@ import scala.collection.JavaConversions._
 // SPARK-3729: Test key required to check for initialization errors with config.
 object TestHive
   extends TestHiveContext(
-    new SparkContext("local[2]", "TestSQLContext", new SparkConf().set("spark.sql.test", "")))
+    new SparkContext(
+      "local[2]",
+      "TestSQLContext",
+      new SparkConf()
+        .set("spark.sql.test", "")
+        .set(
+          "spark.sql.hive.metastore.barrierPrefixes",
+          "org.apache.spark.sql.hive.execution.PairSerDe")))
 
 /**
  * A locally running test instance of Spark's Hive execution engine.

From ca4257aec658aaa87f4f097dd7534033d5f13ddc Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Sun, 17 May 2015 16:49:07 -0700
Subject: [PATCH 036/525] [SPARK-6514] [SPARK-5960] [SPARK-6656] [SPARK-7679]
 [STREAMING] [KINESIS] Updates to the Kinesis API

SPARK-6514 - Use correct region
SPARK-5960 - Allow AWS Credentials to be directly passed
SPARK-6656 - Specify kinesis application name explicitly
SPARK-7679 - Upgrade to latest KCL and AWS SDK.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6147 from tdas/kinesis-api-update and squashes the following commits:

f23ea77 [Tathagata Das] Updated versions and updated APIs
373b201 [Tathagata Das] Updated Kinesis API
---
 .../kinesis/KinesisCheckpointState.scala      |   2 +-
 .../streaming/kinesis/KinesisReceiver.scala   | 152 +++++-----
 .../kinesis/KinesisRecordProcessor.scala      |  32 ++-
 .../streaming/kinesis/KinesisUtils.scala      | 263 +++++++++++++++---
 .../kinesis/KinesisReceiverSuite.scala        |  15 +-
 pom.xml                                       |   4 +-
 6 files changed, 348 insertions(+), 120 deletions(-)

diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala
index 588e86a1887ec..1c9b0c218ae18 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala
@@ -48,7 +48,7 @@ private[kinesis] class KinesisCheckpointState(
   /**
    * Advance the checkpoint clock by the checkpoint interval.
    */
-  def advanceCheckpoint() = {
+  def advanceCheckpoint(): Unit = {
     checkpointClock.advance(checkpointInterval.milliseconds)
   }
 }
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
index a7fe4476cacb8..01608fbd3fd31 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
@@ -16,32 +16,31 @@
  */
 package org.apache.spark.streaming.kinesis
 
-import java.net.InetAddress
 import java.util.UUID
 
+import com.amazonaws.auth.{AWSCredentials, AWSCredentialsProvider, BasicAWSCredentials, DefaultAWSCredentialsProviderChain}
+import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorFactory}
+import com.amazonaws.services.kinesis.clientlibrary.lib.worker.{InitialPositionInStream, KinesisClientLibConfiguration, Worker}
+
 import org.apache.spark.Logging
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.Duration
 import org.apache.spark.streaming.receiver.Receiver
 import org.apache.spark.util.Utils
 
-import com.amazonaws.auth.AWSCredentialsProvider
-import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
-import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor
-import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorFactory
-import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
-import com.amazonaws.services.kinesis.clientlibrary.lib.worker.KinesisClientLibConfiguration
-import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker
+
+private[kinesis]
+case class SerializableAWSCredentials(accessKeyId: String, secretKey: String)
+  extends BasicAWSCredentials(accessKeyId, secretKey) with Serializable
 
 /**
  * Custom AWS Kinesis-specific implementation of Spark Streaming's Receiver.
  * This implementation relies on the Kinesis Client Library (KCL) Worker as described here:
  * https://github.com/awslabs/amazon-kinesis-client
- * This is a custom receiver used with StreamingContext.receiverStream(Receiver) 
- *   as described here:
- *     http://spark.apache.org/docs/latest/streaming-custom-receivers.html
- * Instances of this class will get shipped to the Spark Streaming Workers 
- *   to run within a Spark Executor.
+ * This is a custom receiver used with StreamingContext.receiverStream(Receiver) as described here:
+ *   http://spark.apache.org/docs/latest/streaming-custom-receivers.html
+ * Instances of this class will get shipped to the Spark Streaming Workers to run within a 
+ *   Spark Executor.
  *
  * @param appName  Kinesis application name. Kinesis Apps are mapped to Kinesis Streams
  *                 by the Kinesis Client Library.  If you change the App name or Stream name,
@@ -49,6 +48,8 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker
  *                 DynamoDB table with the same name this Kinesis application.
  * @param streamName   Kinesis stream name
  * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
+ * @param regionName  Region name used by the Kinesis Client Library for
+ *                    DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
  * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
  *                            See the Kinesis Spark Streaming documentation for more
  *                            details on the different types of checkpoints.
@@ -59,92 +60,103 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker
  *                                 (InitialPositionInStream.TRIM_HORIZON) or
  *                                 the tip of the stream (InitialPositionInStream.LATEST).
  * @param storageLevel Storage level to use for storing the received objects
- *
- * @return ReceiverInputDStream[Array[Byte]]   
+ * @param awsCredentialsOption Optional AWS credentials, used when user directly specifies
+ *                             the credentials
  */
 private[kinesis] class KinesisReceiver(
     appName: String,
     streamName: String,
     endpointUrl: String,
-    checkpointInterval: Duration,
+    regionName: String,
     initialPositionInStream: InitialPositionInStream,
-    storageLevel: StorageLevel)
-  extends Receiver[Array[Byte]](storageLevel) with Logging { receiver =>
-
-  /*
-   * The following vars are built in the onStart() method which executes in the Spark Worker after
-   *   this code is serialized and shipped remotely.
-   */
-
-  /*
-   *  workerId should be based on the ip address of the actual Spark Worker where this code runs
-   *   (not the Driver's ip address.)
-   */
-  var workerId: String = null
+    checkpointInterval: Duration,
+    storageLevel: StorageLevel,
+    awsCredentialsOption: Option[SerializableAWSCredentials]
+  ) extends Receiver[Array[Byte]](storageLevel) with Logging { receiver =>
 
   /*
-   * This impl uses the DefaultAWSCredentialsProviderChain and searches for credentials 
-   *   in the following order of precedence:
-   * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
-   * Java System Properties - aws.accessKeyId and aws.secretKey
-   * Credential profiles file at the default location (~/.aws/credentials) shared by all 
-   *   AWS SDKs and the AWS CLI
-   * Instance profile credentials delivered through the Amazon EC2 metadata service
+   * =================================================================================
+   * The following vars are initialize in the onStart() method which executes in the
+   * Spark worker after this Receiver is serialized and shipped to the worker.
+   * =================================================================================
    */
-  var credentialsProvider: AWSCredentialsProvider = null
-
-  /* KCL config instance. */
-  var kinesisClientLibConfiguration: KinesisClientLibConfiguration = null
 
-  /*
-   *  RecordProcessorFactory creates impls of IRecordProcessor.
-   *  IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the 
-   *    IRecordProcessor.processRecords() method.
-   *  We're using our custom KinesisRecordProcessor in this case.
+  /**
+   * workerId is used by the KCL should be based on the ip address of the actual Spark Worker where this code runs
+   * (not the driver's IP address.)
    */
-  var recordProcessorFactory: IRecordProcessorFactory = null
+  private var workerId: String = null
 
-  /*
-   * Create a Kinesis Worker.
-   * This is the core client abstraction from the Kinesis Client Library (KCL).
-   * We pass the RecordProcessorFactory from above as well as the KCL config instance.
-   * A Kinesis Worker can process 1..* shards from the given stream - each with its 
-   *   own RecordProcessor.
+  /**
+   * Worker is the core client abstraction from the Kinesis Client Library (KCL).
+   * A worker can process more than one shards from the given stream.
+   * Each shard is assigned its own IRecordProcessor and the worker run multiple such
+   * processors.
    */
-  var worker: Worker = null
+  private var worker: Worker = null
 
   /**
-   *  This is called when the KinesisReceiver starts and must be non-blocking.
-   *  The KCL creates and manages the receiving/processing thread pool through the Worker.run() 
-   *    method.
+   * This is called when the KinesisReceiver starts and must be non-blocking.
+   * The KCL creates and manages the receiving/processing thread pool through Worker.run().
    */
   override def onStart() {
     workerId = Utils.localHostName() + ":" + UUID.randomUUID()
-    credentialsProvider = new DefaultAWSCredentialsProviderChain()
-    kinesisClientLibConfiguration = new KinesisClientLibConfiguration(appName, streamName,
-      credentialsProvider, workerId).withKinesisEndpoint(endpointUrl)
-      .withInitialPositionInStream(initialPositionInStream).withTaskBackoffTimeMillis(500)
-    recordProcessorFactory = new IRecordProcessorFactory {
+
+    // KCL config instance
+    val awsCredProvider = resolveAWSCredentialsProvider()
+    val kinesisClientLibConfiguration =
+      new KinesisClientLibConfiguration(appName, streamName, awsCredProvider, workerId)
+      .withKinesisEndpoint(endpointUrl)
+      .withInitialPositionInStream(initialPositionInStream)
+      .withTaskBackoffTimeMillis(500)
+      .withRegionName(regionName)
+
+   /*
+    *  RecordProcessorFactory creates impls of IRecordProcessor.
+    *  IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the 
+    *  IRecordProcessor.processRecords() method.
+    *  We're using our custom KinesisRecordProcessor in this case.
+    */
+    val recordProcessorFactory = new IRecordProcessorFactory {
       override def createProcessor: IRecordProcessor = new KinesisRecordProcessor(receiver,
         workerId, new KinesisCheckpointState(checkpointInterval))
     }
+
     worker = new Worker(recordProcessorFactory, kinesisClientLibConfiguration)
     worker.run()
+
     logInfo(s"Started receiver with workerId $workerId")
   }
 
   /**
-   *  This is called when the KinesisReceiver stops.
-   *  The KCL worker.shutdown() method stops the receiving/processing threads.
-   *  The KCL will do its best to drain and checkpoint any in-flight records upon shutdown.
+   * This is called when the KinesisReceiver stops.
+   * The KCL worker.shutdown() method stops the receiving/processing threads.
+   * The KCL will do its best to drain and checkpoint any in-flight records upon shutdown.
    */
   override def onStop() {
-    worker.shutdown()
-    logInfo(s"Shut down receiver with workerId $workerId")
+    if (worker != null) {
+      worker.shutdown()
+      logInfo(s"Stopped receiver for workerId $workerId")
+      worker = null
+    }
     workerId = null
-    credentialsProvider = null
-    kinesisClientLibConfiguration = null
-    recordProcessorFactory = null
-    worker = null
+  }
+
+  /**
+   * If AWS credential is provided, return a AWSCredentialProvider returning that credential.
+   * Otherwise, return the DefaultAWSCredentialsProviderChain.
+   */
+  private def resolveAWSCredentialsProvider(): AWSCredentialsProvider = {
+    awsCredentialsOption match {
+      case Some(awsCredentials) =>
+        logInfo("Using provided AWS credentials")
+        new AWSCredentialsProvider {
+          override def getCredentials: AWSCredentials = awsCredentials
+          override def refresh(): Unit = { }
+        }
+      case None =>
+        logInfo("Using DefaultAWSCredentialsProviderChain")
+        new DefaultAWSCredentialsProviderChain()
+    }
   }
 }
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala
index af8cd875b4541..f65e743c4e2a3 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala
@@ -35,7 +35,10 @@ import com.amazonaws.services.kinesis.model.Record
 /**
  * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor.
  * This implementation operates on the Array[Byte] from the KinesisReceiver.
- * The Kinesis Worker creates an instance of this KinesisRecordProcessor upon startup.
+ * The Kinesis Worker creates an instance of this KinesisRecordProcessor for each 
+ *   shard in the Kinesis stream upon startup.  This is normally done in separate threads, 
+ *   but the KCLs within the KinesisReceivers will balance themselves out if you create 
+ *   multiple Receivers.
  *
  * @param receiver Kinesis receiver
  * @param workerId for logging purposes
@@ -47,8 +50,8 @@ private[kinesis] class KinesisRecordProcessor(
     workerId: String,
     checkpointState: KinesisCheckpointState) extends IRecordProcessor with Logging {
 
-  /* shardId to be populated during initialize() */
-  var shardId: String = _
+  // shardId to be populated during initialize()
+  private var shardId: String = _
 
   /**
    * The Kinesis Client Library calls this method during IRecordProcessor initialization.
@@ -56,8 +59,8 @@ private[kinesis] class KinesisRecordProcessor(
    * @param shardId assigned by the KCL to this particular RecordProcessor.
    */
   override def initialize(shardId: String) {
-    logInfo(s"Initialize:  Initializing workerId $workerId with shardId $shardId")
     this.shardId = shardId
+    logInfo(s"Initialized workerId $workerId with shardId $shardId")
   }
 
   /**
@@ -73,12 +76,17 @@ private[kinesis] class KinesisRecordProcessor(
     if (!receiver.isStopped()) {
       try {
         /*
-         * Note:  If we try to store the raw ByteBuffer from record.getData(), the Spark Streaming
-         * Receiver.store(ByteBuffer) attempts to deserialize the ByteBuffer using the
-         *   internally-configured Spark serializer (kryo, etc).
-         * This is not desirable, so we instead store a raw Array[Byte] and decouple
-         *   ourselves from Spark's internal serialization strategy.
-         */
+         * Notes:  
+         * 1) If we try to store the raw ByteBuffer from record.getData(), the Spark Streaming
+         *    Receiver.store(ByteBuffer) attempts to deserialize the ByteBuffer using the
+         *    internally-configured Spark serializer (kryo, etc).
+         * 2) This is not desirable, so we instead store a raw Array[Byte] and decouple
+         *    ourselves from Spark's internal serialization strategy.
+         * 3) For performance, the BlockGenerator is asynchronously queuing elements within its
+         *    memory before creating blocks.  This prevents the small block scenario, but requires
+         *    that you register callbacks to know when a block has been generated and stored 
+         *    (WAL is sufficient for storage) before can checkpoint back to the source.
+        */
         batch.foreach(record => receiver.store(record.getData().array()))
         
         logDebug(s"Stored:  Worker $workerId stored ${batch.size} records for shardId $shardId")
@@ -116,7 +124,7 @@ private[kinesis] class KinesisRecordProcessor(
           logError(s"Exception:  WorkerId $workerId encountered and exception while storing " +
               " or checkpointing a batch for workerId $workerId and shardId $shardId.", e)
 
-          /* Rethrow the exception to the Kinesis Worker that is managing this RecordProcessor.*/
+          /* Rethrow the exception to the Kinesis Worker that is managing this RecordProcessor. */
           throw e
         }
       }
@@ -190,7 +198,7 @@ private[kinesis] object KinesisRecordProcessor extends Logging {
                logError(s"Retryable Exception:  Random backOffMillis=${backOffMillis}", e)
                retryRandom(expression, numRetriesLeft - 1, maxBackOffMillis)
              }
-        /* Throw:  Shutdown has been requested by the Kinesis Client Library.*/
+        /* Throw:  Shutdown has been requested by the Kinesis Client Library. */
         case _: ShutdownException => {
           logError(s"ShutdownException:  Caught shutdown exception, skipping checkpoint.", e)
           throw e
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
index 96f4399accd3a..b114bcff92d0f 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
@@ -16,29 +16,75 @@
  */
 package org.apache.spark.streaming.kinesis
 
-import org.apache.spark.annotation.Experimental
+import com.amazonaws.regions.RegionUtils
+import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
+
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.streaming.Duration
-import org.apache.spark.streaming.StreamingContext
-import org.apache.spark.streaming.api.java.JavaReceiverInputDStream
-import org.apache.spark.streaming.api.java.JavaStreamingContext
+import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext}
 import org.apache.spark.streaming.dstream.ReceiverInputDStream
-
-import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
+import org.apache.spark.streaming.{Duration, StreamingContext}
 
 
-/**
- * Helper class to create Amazon Kinesis Input Stream
- * :: Experimental ::
- */
-@Experimental
 object KinesisUtils {
   /**
-   * Create an InputDStream that pulls messages from a Kinesis stream.
-   * :: Experimental ::
-   * @param ssc    StreamingContext object
+   * Create an input stream that pulls messages from a Kinesis stream.
+   * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis.
+   *
+   * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain
+   * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain
+   * gets the AWS credentials.
+   *
+   * @param ssc StreamingContext object
+   * @param kinesisAppName  Kinesis application name used by the Kinesis Client Library
+   *                        (KCL) to update DynamoDB
+   * @param streamName   Kinesis stream name
+   * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
+   * @param regionName   Name of region used by the Kinesis Client Library (KCL) to update
+   *                     DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
+   * @param initialPositionInStream  In the absence of Kinesis checkpoint info, this is the
+   *                                 worker's initial starting position in the stream.
+   *                                 The values are either the beginning of the stream
+   *                                 per Kinesis' limit of 24 hours
+   *                                 (InitialPositionInStream.TRIM_HORIZON) or
+   *                                 the tip of the stream (InitialPositionInStream.LATEST).
+   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
+   *                            See the Kinesis Spark Streaming documentation for more
+   *                            details on the different types of checkpoints.
+   * @param storageLevel Storage level to use for storing the received objects.
+   *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
+   */
+  def createStream(
+      ssc: StreamingContext,
+      kinesisAppName:  String,
+      streamName: String,
+      endpointUrl: String,
+      regionName: String,
+      initialPositionInStream: InitialPositionInStream,
+      checkpointInterval: Duration,
+      storageLevel: StorageLevel
+    ): ReceiverInputDStream[Array[Byte]] = {
+    ssc.receiverStream(
+      new KinesisReceiver(kinesisAppName, streamName, endpointUrl, validateRegion(regionName),
+        initialPositionInStream, checkpointInterval, storageLevel, None))
+  }
+
+  /**
+   * Create an input stream that pulls messages from a Kinesis stream.
+   * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis.
+   *
+   * Note:
+   *  The given AWS credentials will get saved in DStream checkpoints if checkpointing
+   *  is enabled. Make sure that your checkpoint directory is secure.
+   *
+   * @param ssc StreamingContext object
+   * @param kinesisAppName  Kinesis application name used by the Kinesis Client Library
+   *                        (KCL) to update DynamoDB
    * @param streamName   Kinesis stream name
    * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
+   * @param regionName   Name of region used by the Kinesis Client Library (KCL) to update
+   *                     DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
+   * @param awsAccessKeyId  AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain)
+   * @param awsSecretKey  AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain)
    * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
    *                            See the Kinesis Spark Streaming documentation for more
    *                            details on the different types of checkpoints.
@@ -48,28 +94,84 @@ object KinesisUtils {
    *                                 per Kinesis' limit of 24 hours
    *                                 (InitialPositionInStream.TRIM_HORIZON) or
    *                                 the tip of the stream (InitialPositionInStream.LATEST).
-   * @param storageLevel Storage level to use for storing the received objects
+   * @param storageLevel Storage level to use for storing the received objects.
+   *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
+   */
+  def createStream(
+      ssc: StreamingContext,
+      kinesisAppName:  String,
+      streamName: String,
+      endpointUrl: String,
+      regionName: String,
+      initialPositionInStream: InitialPositionInStream,
+      checkpointInterval: Duration,
+      storageLevel: StorageLevel,
+      awsAccessKeyId: String,
+      awsSecretKey: String
+    ): ReceiverInputDStream[Array[Byte]] = {
+    ssc.receiverStream(
+      new KinesisReceiver(kinesisAppName, streamName, endpointUrl, validateRegion(regionName),
+        initialPositionInStream, checkpointInterval, storageLevel,
+        Some(SerializableAWSCredentials(awsAccessKeyId, awsSecretKey))))
+  }
+
+  /**
+   * Create an input stream that pulls messages from a Kinesis stream.
+   * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis.
    *
-   * @return ReceiverInputDStream[Array[Byte]]
+   * Note:
+   * - The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain
+   *   on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain
+   *   gets AWS credentials.
+   * - The region of the `endpointUrl` will be used for DynamoDB and CloudWatch.
+   * - The Kinesis application name used by the Kinesis Client Library (KCL) will be the app name in
+   *   [[org.apache.spark.SparkConf]].
+   *
+   * @param ssc Java StreamingContext object
+   * @param streamName   Kinesis stream name
+   * @param endpointUrl  Endpoint url of Kinesis service
+   *                     (e.g., https://kinesis.us-east-1.amazonaws.com)
+   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
+   *                            See the Kinesis Spark Streaming documentation for more
+   *                            details on the different types of checkpoints.
+   * @param initialPositionInStream  In the absence of Kinesis checkpoint info, this is the
+   *                                 worker's initial starting position in the stream.
+   *                                 The values are either the beginning of the stream
+   *                                 per Kinesis' limit of 24 hours
+   *                                 (InitialPositionInStream.TRIM_HORIZON) or
+   *                                 the tip of the stream (InitialPositionInStream.LATEST).
+   * @param storageLevel Storage level to use for storing the received objects
+   *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
    */
-  @Experimental
+  @deprecated("use other forms of createStream", "1.4.0")
   def createStream(
       ssc: StreamingContext,
       streamName: String,
       endpointUrl: String,
       checkpointInterval: Duration,
       initialPositionInStream: InitialPositionInStream,
-      storageLevel: StorageLevel): ReceiverInputDStream[Array[Byte]] = {
-    ssc.receiverStream(new KinesisReceiver(ssc.sc.appName, streamName, endpointUrl,
-        checkpointInterval, initialPositionInStream, storageLevel))
+      storageLevel: StorageLevel
+    ): ReceiverInputDStream[Array[Byte]] = {
+    ssc.receiverStream(
+      new KinesisReceiver(ssc.sc.appName, streamName, endpointUrl, getRegionByEndpoint(endpointUrl),
+        initialPositionInStream, checkpointInterval, storageLevel, None))
   }
 
   /**
-   * Create a Java-friendly InputDStream that pulls messages from a Kinesis stream.
-   * :: Experimental ::
+   * Create an input stream that pulls messages from a Kinesis stream.
+   * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis.
+   *
+   * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain
+   * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain
+   * gets the AWS credentials.
+   *
    * @param jssc Java StreamingContext object
+   * @param kinesisAppName  Kinesis application name used by the Kinesis Client Library
+   *                        (KCL) to update DynamoDB
    * @param streamName   Kinesis stream name
    * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
+   * @param regionName   Name of region used by the Kinesis Client Library (KCL) to update
+   *                     DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
    * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
    *                            See the Kinesis Spark Streaming documentation for more
    *                            details on the different types of checkpoints.
@@ -79,19 +181,116 @@ object KinesisUtils {
    *                                 per Kinesis' limit of 24 hours
    *                                 (InitialPositionInStream.TRIM_HORIZON) or
    *                                 the tip of the stream (InitialPositionInStream.LATEST).
-   * @param storageLevel Storage level to use for storing the received objects
+   * @param storageLevel Storage level to use for storing the received objects.
+   *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
+   */
+  def createStream(
+      jssc: JavaStreamingContext,
+      kinesisAppName: String,
+      streamName: String,
+      endpointUrl: String,
+      regionName: String,
+      initialPositionInStream: InitialPositionInStream,
+      checkpointInterval: Duration,
+      storageLevel: StorageLevel
+    ): JavaReceiverInputDStream[Array[Byte]] = {
+    createStream(jssc.ssc, kinesisAppName, streamName, endpointUrl, regionName,
+      initialPositionInStream, checkpointInterval, storageLevel)
+  }
+
+  /**
+   * Create an input stream that pulls messages from a Kinesis stream.
+   * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis.
    *
-   * @return JavaReceiverInputDStream[Array[Byte]]
+   * Note:
+   *  The given AWS credentials will get saved in DStream checkpoints if checkpointing
+   *  is enabled. Make sure that your checkpoint directory is secure.
+   *
+   * @param jssc Java StreamingContext object
+   * @param kinesisAppName  Kinesis application name used by the Kinesis Client Library
+   *                        (KCL) to update DynamoDB
+   * @param streamName   Kinesis stream name
+   * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
+   * @param regionName   Name of region used by the Kinesis Client Library (KCL) to update
+   *                     DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
+   * @param awsAccessKeyId  AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain)
+   * @param awsSecretKey  AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain)
+   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
+   *                            See the Kinesis Spark Streaming documentation for more
+   *                            details on the different types of checkpoints.
+   * @param initialPositionInStream  In the absence of Kinesis checkpoint info, this is the
+   *                                 worker's initial starting position in the stream.
+   *                                 The values are either the beginning of the stream
+   *                                 per Kinesis' limit of 24 hours
+   *                                 (InitialPositionInStream.TRIM_HORIZON) or
+   *                                 the tip of the stream (InitialPositionInStream.LATEST).
+   * @param storageLevel Storage level to use for storing the received objects.
+   *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
    */
-  @Experimental
   def createStream(
-      jssc: JavaStreamingContext, 
-      streamName: String, 
-      endpointUrl: String, 
+      jssc: JavaStreamingContext,
+      kinesisAppName: String,
+      streamName: String,
+      endpointUrl: String,
+      regionName: String,
+      initialPositionInStream: InitialPositionInStream,
+      checkpointInterval: Duration,
+      storageLevel: StorageLevel,
+      awsAccessKeyId: String,
+      awsSecretKey: String
+    ): JavaReceiverInputDStream[Array[Byte]] = {
+    createStream(jssc.ssc, kinesisAppName, streamName, endpointUrl, regionName,
+        initialPositionInStream, checkpointInterval, storageLevel, awsAccessKeyId, awsSecretKey)
+  }
+
+  /**
+   * Create an input stream that pulls messages from a Kinesis stream.
+   * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis.
+   *
+   * Note:
+   * - The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain
+   *   on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain
+   *   gets AWS credentials.
+   * - The region of the `endpointUrl` will be used for DynamoDB and CloudWatch.
+   * - The Kinesis application name used by the Kinesis Client Library (KCL) will be the app name in
+   *   [[org.apache.spark.SparkConf]].
+   *
+   * @param jssc Java StreamingContext object
+   * @param streamName   Kinesis stream name
+   * @param endpointUrl  Endpoint url of Kinesis service
+   *                     (e.g., https://kinesis.us-east-1.amazonaws.com)
+   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
+   *                            See the Kinesis Spark Streaming documentation for more
+   *                            details on the different types of checkpoints.
+   * @param initialPositionInStream  In the absence of Kinesis checkpoint info, this is the
+   *                                 worker's initial starting position in the stream.
+   *                                 The values are either the beginning of the stream
+   *                                 per Kinesis' limit of 24 hours
+   *                                 (InitialPositionInStream.TRIM_HORIZON) or
+   *                                 the tip of the stream (InitialPositionInStream.LATEST).
+   * @param storageLevel Storage level to use for storing the received objects
+   *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
+   */
+  @deprecated("use other forms of createStream", "1.4.0")
+  def createStream(
+      jssc: JavaStreamingContext,
+      streamName: String,
+      endpointUrl: String,
       checkpointInterval: Duration,
       initialPositionInStream: InitialPositionInStream,
-      storageLevel: StorageLevel): JavaReceiverInputDStream[Array[Byte]] = {
-    jssc.receiverStream(new KinesisReceiver(jssc.ssc.sc.appName, streamName,
-        endpointUrl, checkpointInterval, initialPositionInStream, storageLevel))
+      storageLevel: StorageLevel
+    ): JavaReceiverInputDStream[Array[Byte]] = {
+    createStream(
+      jssc.ssc, streamName, endpointUrl, checkpointInterval, initialPositionInStream, storageLevel)
+  }
+
+  private def getRegionByEndpoint(endpointUrl: String): String = {
+    RegionUtils.getRegionByEndpoint(endpointUrl).getName()
+  }
+
+  private def validateRegion(regionName: String): String = {
+    Option(RegionUtils.getRegion(regionName)).map { _.getName }.getOrElse {
+      throw new IllegalArgumentException(s"Region name '$regionName' is not valid")
+    }
   }
 }
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
index 255fe65819608..7c17ee9dceddd 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
@@ -40,6 +40,7 @@ import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorC
 import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
 import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason
 import com.amazonaws.services.kinesis.model.Record
+import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
 
 /**
  * Suite of Kinesis streaming receiver tests focusing mostly on the KinesisRecordProcessor
@@ -81,12 +82,20 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft
       checkpointStateMock, currentClockMock)
   }
 
-  test("kinesis utils api") {
+  test("KinesisUtils API") {
     val ssc = new StreamingContext(master, framework, batchDuration)
     // Tests the API, does not actually test data receiving
-    val kinesisStream = KinesisUtils.createStream(ssc, "mySparkStream",
+    val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream",
       "https://kinesis.us-west-2.amazonaws.com", Seconds(2),
-      InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2);
+      InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2)
+    val kinesisStream2 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream",
+      "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
+      InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2)
+    val kinesisStream3 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream",
+      "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
+      InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2,
+      "awsAccessKey", "awsSecretKey")
+
     ssc.stop()
   }
 
diff --git a/pom.xml b/pom.xml
index 6768a039d11e0..6f525b6ac81a3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -148,8 +148,8 @@
     <avro.version>1.7.7</avro.version>
     <avro.mapred.classifier>hadoop2</avro.mapred.classifier>
     <jets3t.version>0.7.1</jets3t.version>
-    <aws.java.sdk.version>1.8.3</aws.java.sdk.version>
-    <aws.kinesis.client.version>1.1.0</aws.kinesis.client.version>
+    <aws.java.sdk.version>1.9.16</aws.java.sdk.version>
+    <aws.kinesis.client.version>1.2.1</aws.kinesis.client.version>
     <commons.httpclient.version>4.3.2</commons.httpclient.version>
     <commons.math3.version>3.4.1</commons.math3.version>
     <test_classpath_file>${project.build.directory}/spark-test-classpath.txt</test_classpath_file>

From 2f22424e9f6624097b292cb70e00787b69d80718 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sun, 17 May 2015 16:51:57 -0700
Subject: [PATCH 037/525] [SQL] [MINOR] use catalyst type converter in ScalaUdf

It's a follow-up of https://github.com/apache/spark/pull/5154, we can speed up scala udf evaluation by create type converter in advance.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6182 from cloud-fan/tmp and squashes the following commits:

241cfe9 [Wenchen Fan] use converter in ScalaUdf
---
 .../org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
index 9a77ca624ebe2..d22eb10ad399f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
@@ -956,7 +956,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
   }
 
   // scalastyle:on
-
-  override def eval(input: Row): Any = CatalystTypeConverters.convertToCatalyst(f(input), dataType)
+  val converter = CatalystTypeConverters.createToCatalystConverter(dataType)
+  override def eval(input: Row): Any = converter(f(input))
 
 }

From ff71d34e00b64d70f671f9bf3e63aec39cd525e5 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Sun, 17 May 2015 20:37:19 -0700
Subject: [PATCH 038/525] [SPARK-7693][Core] Remove "import
 scala.concurrent.ExecutionContext.Implicits.global"

Learnt a lesson from SPARK-7655: Spark should avoid to use `scala.concurrent.ExecutionContext.Implicits.global` because the user may submit blocking actions to `scala.concurrent.ExecutionContext.Implicits.global` and exhaust all threads in it. This could crash Spark. So Spark should always use its own thread pools for safety.

This PR removes all usages of `scala.concurrent.ExecutionContext.Implicits.global` and uses proper thread pools to replace them.

Author: zsxwing <zsxwing@gmail.com>

Closes #6223 from zsxwing/SPARK-7693 and squashes the following commits:

a33ff06 [zsxwing] Decrease the max thread number from 1024 to 128
cf4b3fc [zsxwing] Remove "import scala.concurrent.ExecutionContext.Implicits.global"
---
 .../CoarseGrainedExecutorBackend.scala        |  9 +++---
 .../apache/spark/rdd/AsyncRDDActions.scala    | 13 +++++++--
 .../apache/spark/storage/BlockManager.scala   | 17 ++++++++---
 .../spark/storage/BlockManagerMaster.scala    | 29 ++++++++++++-------
 .../execution/joins/BroadcastHashJoin.scala   |  2 +-
 .../receiver/ReceiverSupervisor.scala         | 14 ++++++---
 6 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
index ed159dec4f998..f3a26f54a81fb 100644
--- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
+++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
@@ -33,7 +33,7 @@ import org.apache.spark.deploy.worker.WorkerWatcher
 import org.apache.spark.scheduler.TaskDescription
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
 import org.apache.spark.serializer.SerializerInstance
-import org.apache.spark.util.{SignalLogger, Utils}
+import org.apache.spark.util.{ThreadUtils, SignalLogger, Utils}
 
 private[spark] class CoarseGrainedExecutorBackend(
     override val rpcEnv: RpcEnv,
@@ -55,18 +55,19 @@ private[spark] class CoarseGrainedExecutorBackend(
   private[this] val ser: SerializerInstance = env.closureSerializer.newInstance()
 
   override def onStart() {
-    import scala.concurrent.ExecutionContext.Implicits.global
     logInfo("Connecting to driver: " + driverUrl)
     rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { ref =>
+      // This is a very fast action so we can use "ThreadUtils.sameThread"
       driver = Some(ref)
       ref.ask[RegisteredExecutor.type](
         RegisterExecutor(executorId, self, hostPort, cores, extractLogUrls))
-    } onComplete {
+    }(ThreadUtils.sameThread).onComplete {
+      // This is a very fast action so we can use "ThreadUtils.sameThread"
       case Success(msg) => Utils.tryLogNonFatalError {
         Option(self).foreach(_.send(msg)) // msg must be RegisteredExecutor
       }
       case Failure(e) => logError(s"Cannot register with driver: $driverUrl", e)
-    }
+    }(ThreadUtils.sameThread)
   }
 
   def extractLogUrls: Map[String, String] = {
diff --git a/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala b/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala
index ec185340c3a2d..bbf1b83af0795 100644
--- a/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala
@@ -19,8 +19,10 @@ package org.apache.spark.rdd
 
 import java.util.concurrent.atomic.AtomicLong
 
+import org.apache.spark.util.ThreadUtils
+
 import scala.collection.mutable.ArrayBuffer
-import scala.concurrent.ExecutionContext.Implicits.global
+import scala.concurrent.ExecutionContext
 import scala.reflect.ClassTag
 
 import org.apache.spark.{ComplexFutureAction, FutureAction, Logging}
@@ -66,6 +68,8 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
     val f = new ComplexFutureAction[Seq[T]]
 
     f.run {
+      // This is a blocking action so we should use "AsyncRDDActions.futureExecutionContext" which
+      // is a cached thread pool.
       val results = new ArrayBuffer[T](num)
       val totalParts = self.partitions.length
       var partsScanned = 0
@@ -101,7 +105,7 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
         partsScanned += numPartsToTry
       }
       results.toSeq
-    }
+    }(AsyncRDDActions.futureExecutionContext)
 
     f
   }
@@ -123,3 +127,8 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
       (index, data) => Unit, Unit)
   }
 }
+
+private object AsyncRDDActions {
+  val futureExecutionContext = ExecutionContext.fromExecutorService(
+    ThreadUtils.newDaemonCachedThreadPool("AsyncRDDActions-future", 128))
+}
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
index cc794e5c90ffa..16d67cbfca80b 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -21,8 +21,7 @@ import java.io.{BufferedOutputStream, ByteArrayOutputStream, File, InputStream,
 import java.nio.{ByteBuffer, MappedByteBuffer}
 
 import scala.collection.mutable.{ArrayBuffer, HashMap}
-import scala.concurrent.{Await, Future}
-import scala.concurrent.ExecutionContext.Implicits.global
+import scala.concurrent.{ExecutionContext, Await, Future}
 import scala.concurrent.duration._
 import scala.util.Random
 
@@ -77,6 +76,9 @@ private[spark] class BlockManager(
 
   private val blockInfo = new TimeStampedHashMap[BlockId, BlockInfo]
 
+  private val futureExecutionContext = ExecutionContext.fromExecutorService(
+    ThreadUtils.newDaemonCachedThreadPool("block-manager-future", 128))
+
   // Actual storage of where blocks are kept
   private var externalBlockStoreInitialized = false
   private[spark] val memoryStore = new MemoryStore(this, maxMemory)
@@ -266,11 +268,13 @@ private[spark] class BlockManager(
     asyncReregisterLock.synchronized {
       if (asyncReregisterTask == null) {
         asyncReregisterTask = Future[Unit] {
+          // This is a blocking action and should run in futureExecutionContext which is a cached
+          // thread pool
           reregister()
           asyncReregisterLock.synchronized {
             asyncReregisterTask = null
           }
-        }
+        }(futureExecutionContext)
       }
     }
   }
@@ -744,7 +748,11 @@ private[spark] class BlockManager(
       case b: ByteBufferValues if putLevel.replication > 1 =>
         // Duplicate doesn't copy the bytes, but just creates a wrapper
         val bufferView = b.buffer.duplicate()
-        Future { replicate(blockId, bufferView, putLevel) }
+        Future {
+          // This is a blocking action and should run in futureExecutionContext which is a cached
+          // thread pool
+          replicate(blockId, bufferView, putLevel)
+        }(futureExecutionContext)
       case _ => null
     }
 
@@ -1218,6 +1226,7 @@ private[spark] class BlockManager(
     }
     metadataCleaner.cancel()
     broadcastCleaner.cancel()
+    futureExecutionContext.shutdownNow()
     logInfo("BlockManager stopped")
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala
index a85e1c7632973..abcad9438bf28 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.storage
 
+import scala.collection.Iterable
+import scala.collection.generic.CanBuildFrom
 import scala.concurrent.{Await, Future}
-import scala.concurrent.ExecutionContext.Implicits.global
 
 import org.apache.spark.rpc.RpcEndpointRef
 import org.apache.spark.{Logging, SparkConf, SparkException}
 import org.apache.spark.storage.BlockManagerMessages._
-import org.apache.spark.util.RpcUtils
+import org.apache.spark.util.{ThreadUtils, RpcUtils}
 
 private[spark]
 class BlockManagerMaster(
@@ -102,8 +103,8 @@ class BlockManagerMaster(
     val future = driverEndpoint.askWithRetry[Future[Seq[Int]]](RemoveRdd(rddId))
     future.onFailure {
       case e: Exception =>
-        logWarning(s"Failed to remove RDD $rddId - ${e.getMessage}}")
-    }
+        logWarning(s"Failed to remove RDD $rddId - ${e.getMessage}}", e)
+    }(ThreadUtils.sameThread)
     if (blocking) {
       Await.result(future, timeout)
     }
@@ -114,8 +115,8 @@ class BlockManagerMaster(
     val future = driverEndpoint.askWithRetry[Future[Seq[Boolean]]](RemoveShuffle(shuffleId))
     future.onFailure {
       case e: Exception =>
-        logWarning(s"Failed to remove shuffle $shuffleId - ${e.getMessage}}")
-    }
+        logWarning(s"Failed to remove shuffle $shuffleId - ${e.getMessage}}", e)
+    }(ThreadUtils.sameThread)
     if (blocking) {
       Await.result(future, timeout)
     }
@@ -128,8 +129,8 @@ class BlockManagerMaster(
     future.onFailure {
       case e: Exception =>
         logWarning(s"Failed to remove broadcast $broadcastId" +
-          s" with removeFromMaster = $removeFromMaster - ${e.getMessage}}")
-    }
+          s" with removeFromMaster = $removeFromMaster - ${e.getMessage}}", e)
+    }(ThreadUtils.sameThread)
     if (blocking) {
       Await.result(future, timeout)
     }
@@ -169,11 +170,17 @@ class BlockManagerMaster(
     val response = driverEndpoint.
       askWithRetry[Map[BlockManagerId, Future[Option[BlockStatus]]]](msg)
     val (blockManagerIds, futures) = response.unzip
-    val result = Await.result(Future.sequence(futures), timeout)
-    if (result == null) {
+    implicit val sameThread = ThreadUtils.sameThread
+    val cbf =
+      implicitly[
+        CanBuildFrom[Iterable[Future[Option[BlockStatus]]],
+        Option[BlockStatus],
+        Iterable[Option[BlockStatus]]]]
+    val blockStatus = Await.result(
+      Future.sequence[Option[BlockStatus], Iterable](futures)(cbf, ThreadUtils.sameThread), timeout)
+    if (blockStatus == null) {
       throw new SparkException("BlockManager returned null for BlockStatus query: " + blockId)
     }
-    val blockStatus = result.asInstanceOf[Iterable[Option[BlockStatus]]]
     blockManagerIds.zip(blockStatus).flatMap { case (blockManagerId, status) =>
       status.map { s => (blockManagerId, s) }
     }.toMap
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
index fe43fc4125c8e..b8b12be8756f9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
@@ -78,5 +78,5 @@ case class BroadcastHashJoin(
 object BroadcastHashJoin {
 
   private val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService(
-    ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 1024))
+    ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 128))
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
index 4943f29395d12..33be067ebdaf2 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
@@ -18,14 +18,14 @@
 package org.apache.spark.streaming.receiver
 
 import java.nio.ByteBuffer
+import java.util.concurrent.CountDownLatch
 
 import scala.collection.mutable.ArrayBuffer
+import scala.concurrent._
 
 import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.storage.StreamBlockId
-import java.util.concurrent.CountDownLatch
-import scala.concurrent._
-import ExecutionContext.Implicits.global
+import org.apache.spark.util.ThreadUtils
 
 /**
  * Abstract class that is responsible for supervising a Receiver in the worker.
@@ -46,6 +46,9 @@ private[streaming] abstract class ReceiverSupervisor(
   // Attach the executor to the receiver
   receiver.attachExecutor(this)
 
+  private val futureExecutionContext = ExecutionContext.fromExecutorService(
+    ThreadUtils.newDaemonCachedThreadPool("receiver-supervisor-future", 128))
+
   /** Receiver id */
   protected val streamId = receiver.streamId
 
@@ -111,6 +114,7 @@ private[streaming] abstract class ReceiverSupervisor(
     stoppingError = error.orNull
     stopReceiver(message, error)
     onStop(message, error)
+    futureExecutionContext.shutdownNow()
     stopLatch.countDown()
   }
 
@@ -150,6 +154,8 @@ private[streaming] abstract class ReceiverSupervisor(
   /** Restart receiver with delay */
   def restartReceiver(message: String, error: Option[Throwable], delay: Int) {
     Future {
+      // This is a blocking action so we should use "futureExecutionContext" which is a cached
+      // thread pool.
       logWarning("Restarting receiver with delay " + delay + " ms: " + message,
         error.getOrElse(null))
       stopReceiver("Restarting receiver with delay " + delay + "ms: " + message, error)
@@ -158,7 +164,7 @@ private[streaming] abstract class ReceiverSupervisor(
       logInfo("Starting receiver again")
       startReceiver()
       logInfo("Receiver started again")
-    }
+    }(futureExecutionContext)
   }
 
   /** Check if receiver has been marked for stopping */

From 775e6f9909d4495cbc11c377508b43482d782742 Mon Sep 17 00:00:00 2001
From: Shuo Xiang <shuoxiangpub@gmail.com>
Date: Sun, 17 May 2015 21:16:52 -0700
Subject: [PATCH 039/525] [SPARK-7694] [MLLIB] Use getOrElse for getting the
 threshold of LR model

The `toString` method of `LogisticRegressionModel` calls `get` method on an Option (threshold) without a safeguard. In spark-shell, the following code `val model = algorithm.run(data).clearThreshold()` in lbfgs code will fail as `toString `method will be called right after `clearThreshold()` to show the results in the REPL.

Author: Shuo Xiang <shuoxiangpub@gmail.com>

Closes #6224 from coderxiang/getorelse and squashes the following commits:

d5f53c9 [Shuo Xiang] use getOrElse for getting the threshold of LR model
5f109b4 [Shuo Xiang] Merge remote-tracking branch 'upstream/master'
c5c5bfe [Shuo Xiang] Merge remote-tracking branch 'upstream/master'
98804c9 [Shuo Xiang] fix bug in topBykey and update test
---
 .../apache/spark/mllib/classification/LogisticRegression.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index bd2e9079ce1ae..2df4d21e8cd55 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -163,7 +163,7 @@ class LogisticRegressionModel (
   override protected def formatVersion: String = "1.0"
 
   override def toString: String = {
-    s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.get}"
+    s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}"
   }
 }
 

From e32c0f69f38ad729e25c2d5f90eb73b4453f8279 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Mon, 18 May 2015 01:10:55 -0700
Subject: [PATCH 040/525] [SPARK-7299][SQL] Set precision and scale for Decimal
 according to JDBC metadata instead of returned BigDecimal

JIRA: https://issues.apache.org/jira/browse/SPARK-7299

When connecting with oracle db through jdbc, the precision and scale of `BigDecimal` object returned by `ResultSet.getBigDecimal` is not correctly matched to the table schema reported by `ResultSetMetaData.getPrecision` and `ResultSetMetaData.getScale`.

So in case you insert a value like `19999` into a column with `NUMBER(12, 2)` type, you get through a `BigDecimal` object with scale as 0. But the dataframe schema has correct type as `DecimalType(12, 2)`. Thus, after you save the dataframe into parquet file and then retrieve it, you will get wrong result `199.99`.

Because it is reported to be problematic on jdbc connection with oracle db. It might be difficult to add test case for it. But according to the user's test on JIRA, it solves this problem.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #5833 from viirya/jdbc_decimal_precision and squashes the following commits:

69bc2b5 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into jdbc_decimal_precision
928f864 [Liang-Chi Hsieh] Add comments.
5f9da94 [Liang-Chi Hsieh] Set up Decimal's precision and scale according to table schema instead of returned BigDecimal.
---
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 95935ba874a72..4189dfcf956c0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -300,7 +300,7 @@ private[sql] class JDBCRDD(
   abstract class JDBCConversion
   case object BooleanConversion extends JDBCConversion
   case object DateConversion extends JDBCConversion
-  case object DecimalConversion extends JDBCConversion
+  case class  DecimalConversion(precisionInfo: Option[(Int, Int)]) extends JDBCConversion
   case object DoubleConversion extends JDBCConversion
   case object FloatConversion extends JDBCConversion
   case object IntegerConversion extends JDBCConversion
@@ -317,8 +317,8 @@ private[sql] class JDBCRDD(
     schema.fields.map(sf => sf.dataType match {
       case BooleanType           => BooleanConversion
       case DateType              => DateConversion
-      case DecimalType.Unlimited => DecimalConversion
-      case DecimalType.Fixed(d)  => DecimalConversion
+      case DecimalType.Unlimited => DecimalConversion(None)
+      case DecimalType.Fixed(d)  => DecimalConversion(Some(d))
       case DoubleType            => DoubleConversion
       case FloatType             => FloatConversion
       case IntegerType           => IntegerConversion
@@ -375,7 +375,22 @@ private[sql] class JDBCRDD(
               } else {
                 mutableRow.update(i, null)
               }
-            case DecimalConversion    =>
+            // When connecting with Oracle DB through JDBC, the precision and scale of BigDecimal
+            // object returned by ResultSet.getBigDecimal is not correctly matched to the table
+            // schema reported by ResultSetMetaData.getPrecision and ResultSetMetaData.getScale.
+            // If inserting values like 19999 into a column with NUMBER(12, 2) type, you get through
+            // a BigDecimal object with scale as 0. But the dataframe schema has correct type as
+            // DecimalType(12, 2). Thus, after saving the dataframe into parquet file and then
+            // retrieve it, you will get wrong result 199.99.
+            // So it is needed to set precision and scale for Decimal based on JDBC metadata.
+            case DecimalConversion(Some((p, s))) =>
+              val decimalVal = rs.getBigDecimal(pos)
+              if (decimalVal == null) {
+                mutableRow.update(i, null)
+              } else {
+                mutableRow.update(i, Decimal(decimalVal, p, s))
+              }
+            case DecimalConversion(None) =>
               val decimalVal = rs.getBigDecimal(pos)
               if (decimalVal == null) {
                 mutableRow.update(i, null)

From 1ecfac6e387b0934bfb5a9bbb4ad74b81ec210a4 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Mon, 18 May 2015 08:35:14 -0700
Subject: [PATCH 041/525] [SPARK-6657] [PYSPARK] Fix doc warnings

Fixed the following warnings in `make clean html` under `python/docs`:

~~~
/Users/meng/src/spark/python/pyspark/mllib/evaluation.py:docstring of pyspark.mllib.evaluation.RankingMetrics.ndcgAt:3: ERROR: Unexpected indentation.
/Users/meng/src/spark/python/pyspark/mllib/evaluation.py:docstring of pyspark.mllib.evaluation.RankingMetrics.ndcgAt:4: WARNING: Block quote ends without a blank line; unexpected unindent.
/Users/meng/src/spark/python/pyspark/mllib/fpm.py:docstring of pyspark.mllib.fpm.FPGrowth.train:3: ERROR: Unexpected indentation.
/Users/meng/src/spark/python/pyspark/mllib/fpm.py:docstring of pyspark.mllib.fpm.FPGrowth.train:4: WARNING: Block quote ends without a blank line; unexpected unindent.
/Users/meng/src/spark/python/pyspark/sql/__init__.py:docstring of pyspark.sql.DataFrame.replace:16: WARNING: Field list ends without a blank line; unexpected unindent.
/Users/meng/src/spark/python/pyspark/streaming/kafka.py:docstring of pyspark.streaming.kafka.KafkaUtils.createRDD:8: ERROR: Unexpected indentation.
/Users/meng/src/spark/python/pyspark/streaming/kafka.py:docstring of pyspark.streaming.kafka.KafkaUtils.createRDD:9: WARNING: Block quote ends without a blank line; unexpected unindent.
~~~

davies

Author: Xiangrui Meng <meng@databricks.com>

Closes #6221 from mengxr/SPARK-6657 and squashes the following commits:

e3f83fe [Xiangrui Meng] fix sql and streaming doc warnings
2b4371e [Xiangrui Meng] fix mllib python doc warnings
---
 python/pyspark/mllib/evaluation.py |  5 ++---
 python/pyspark/mllib/fpm.py        | 12 ++++++------
 python/pyspark/sql/dataframe.py    |  1 +
 python/pyspark/streaming/kafka.py  |  3 ++-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index 4c777f2180dc9..a5e5ddc8fe506 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -334,11 +334,10 @@ def ndcgAt(self, k):
         """
         Compute the average NDCG value of all the queries, truncated at ranking position k.
         The discounted cumulative gain at position k is computed as:
-            sum,,i=1,,^k^ (2^{relevance of ''i''th item}^ - 1) / log(i + 1),
+        sum,,i=1,,^k^ (2^{relevance of ''i''th item}^ - 1) / log(i + 1),
         and the NDCG is obtained by dividing the DCG value on the ground truth set.
         In the current implementation, the relevance value is binary.
-
-        If a query has an empty ground truth set, zero will be used as ndcg together with
+        If a query has an empty ground truth set, zero will be used as NDCG together with
         a log warning.
         """
         return self.call("ndcgAt", int(k))
diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py
index d8df02bdbaba9..bdc4a132b1b18 100644
--- a/python/pyspark/mllib/fpm.py
+++ b/python/pyspark/mllib/fpm.py
@@ -61,12 +61,12 @@ class FPGrowth(object):
     def train(cls, data, minSupport=0.3, numPartitions=-1):
         """
         Computes an FP-Growth model that contains frequent itemsets.
-        :param data:            The input data set, each element
-                                contains a transaction.
-        :param minSupport:      The minimal support level
-                                (default: `0.3`).
-        :param numPartitions:   The number of partitions used by parallel
-                                FP-growth (default: same as input data).
+
+        :param data: The input data set, each element contains a
+            transaction.
+        :param minSupport: The minimal support level (default: `0.3`).
+        :param numPartitions: The number of partitions used by
+            parallel FP-growth (default: same as input data).
         """
         model = callMLlibFunc("trainFPGrowthModel", data, float(minSupport), int(numPartitions))
         return FPGrowthModel(model)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 96d927b9ba35c..e4a191a9ef07f 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -943,6 +943,7 @@ def replace(self, to_replace, value, subset=None):
             Columns specified in subset that do not have matching data type are ignored.
             For example, if `value` is a string, and subset contains a non-string column,
             then the non-string column is simply ignored.
+
         >>> df4.replace(10, 20).show()
         +----+------+-----+
         | age|height| name|
diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py
index e278b29003f69..10a859a532e28 100644
--- a/python/pyspark/streaming/kafka.py
+++ b/python/pyspark/streaming/kafka.py
@@ -132,11 +132,12 @@ def createRDD(sc, kafkaParams, offsetRanges, leaders={},
         .. note:: Experimental
 
         Create a RDD from Kafka using offset ranges for each topic and partition.
+
         :param sc:  SparkContext object
         :param kafkaParams: Additional params for Kafka
         :param offsetRanges:  list of offsetRange to specify topic:partition:[start, end) to consume
         :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges.  May be an empty
-                        map, in which case leaders will be looked up on the driver.
+            map, in which case leaders will be looked up on the driver.
         :param keyDecoder:  A function used to decode key (default is utf8_decoder)
         :param valueDecoder:  A function used to decode value (default is utf8_decoder)
         :return: A RDD object

From 814b3dabdf01abc7a2f25aa32284caccadeb7798 Mon Sep 17 00:00:00 2001
From: Vincenzo Selvaggio <vselvaggio@hotmail.it>
Date: Mon, 18 May 2015 08:46:33 -0700
Subject: [PATCH 042/525] [SPARK-7272] [MLLIB] User guide for PMML model export

https://issues.apache.org/jira/browse/SPARK-7272

Author: Vincenzo Selvaggio <vselvaggio@hotmail.it>

Closes #6219 from selvinsource/mllib_pmml_model_export_SPARK-7272 and squashes the following commits:

c866fb8 [Vincenzo Selvaggio] Update mllib-pmml-model-export.md
1beda98 [Vincenzo Selvaggio] [SPARK-7272] Initial user guide for pmml export
d670662 [Vincenzo Selvaggio] Update mllib-pmml-model-export.md
2731375 [Vincenzo Selvaggio] Update mllib-pmml-model-export.md
680dc33 [Vincenzo Selvaggio] Update mllib-pmml-model-export.md
2e298b5 [Vincenzo Selvaggio] Update mllib-pmml-model-export.md
a932f51 [Vincenzo Selvaggio] Create mllib-pmml-model-export.md
---
 docs/mllib-guide.md             |  1 +
 docs/mllib-pmml-model-export.md | 86 +++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 docs/mllib-pmml-model-export.md

diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index f8e879496c135..de7d66fb2dedf 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -39,6 +39,7 @@ filtering, dimensionality reduction, as well as underlying optimization primitiv
 * [Optimization (developer)](mllib-optimization.html)
   * stochastic gradient descent
   * limited-memory BFGS (L-BFGS)
+* [PMML model export](mllib-pmml-model-export.html)
 
 MLlib is under active development.
 The APIs marked `Experimental`/`DeveloperApi` may change in future releases, 
diff --git a/docs/mllib-pmml-model-export.md b/docs/mllib-pmml-model-export.md
new file mode 100644
index 0000000000000..42ea2ca81f80d
--- /dev/null
+++ b/docs/mllib-pmml-model-export.md
@@ -0,0 +1,86 @@
+---
+layout: global
+title: PMML model export - MLlib
+displayTitle: <a href="mllib-guide.html">MLlib</a> - PMML model export
+---
+
+* Table of contents
+{:toc}
+
+## MLlib supported models
+
+MLlib supports model export to Predictive Model Markup Language ([PMML](http://en.wikipedia.org/wiki/Predictive_Model_Markup_Language)).
+
+The table below outlines the MLlib models that can be exported to PMML and their equivalent PMML model.
+
+<table class="table">
+  <thead>
+    <tr><th>MLlib model</th><th>PMML model</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>KMeansModel</td><td>ClusteringModel</td>
+    </tr>    
+    <tr>
+      <td>LinearRegressionModel</td><td>RegressionModel (functionName="regression")</td>
+    </tr>
+    <tr>
+      <td>RidgeRegressionModel</td><td>RegressionModel (functionName="regression")</td>
+    </tr>
+    <tr>
+      <td>LassoModel</td><td>RegressionModel (functionName="regression")</td>
+    </tr>
+    <tr>
+      <td>SVMModel</td><td>RegressionModel (functionName="classification" normalizationMethod="none")</td>
+    </tr>
+    <tr>
+      <td>Binary LogisticRegressionModel</td><td>RegressionModel (functionName="classification" normalizationMethod="logit")</td>
+    </tr>
+  </tbody>
+</table>
+
+## Examples
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+To export a supported `model` (see table above) to PMML, simply call `model.toPMML`.
+
+Here a complete example of building a KMeansModel and print it out in PMML format:
+{% highlight scala %}
+import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.linalg.Vectors
+
+// Load and parse the data
+val data = sc.textFile("data/mllib/kmeans_data.txt")
+val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()
+
+// Cluster the data into two classes using KMeans
+val numClusters = 2
+val numIterations = 20
+val clusters = KMeans.train(parsedData, numClusters, numIterations)
+
+// Export to PMML
+println("PMML Model:\n" + clusters.toPMML)
+{% endhighlight %}
+
+As well as exporting the PMML model to a String (`model.toPMML` as in the example above), you can export the PMML model to other formats:
+
+{% highlight scala %}
+// Export the model to a String in PMML format
+clusters.toPMML
+
+// Export the model to a local file in PMML format
+clusters.toPMML("/tmp/kmeans.xml")
+
+// Export the model to a directory on a distributed file system in PMML format
+clusters.toPMML(sc,"/tmp/kmeans")
+
+// Export the model to the OutputStream in PMML format
+clusters.toPMML(System.out)
+{% endhighlight %}
+
+For unsupported models, either you will not find a `.toPMML` method or an `IllegalArgumentException` will be thrown.
+
+</div>
+
+</div>

From 563bfcc1ab1b1c79b1845230c8c600db85a08fe3 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Mon, 18 May 2015 10:59:35 -0700
Subject: [PATCH 043/525] [SPARK-7627] [SPARK-7472] DAG visualization: style
 skipped stages

This patch fixes two things:

**SPARK-7627.** Cached RDDs no longer light up on the job page. This is a simple fix.
**SPARK-7472.** Display skipped stages differently from normal stages.

The latter is a major UX issue. Because we link the job viz to the stage viz even for skipped stages, the user may inadvertently click into the stage page of a skipped stage, which is empty.

-------------------
<img src="https://cloud.githubusercontent.com/assets/2133137/7675241/de1a3da6-fcea-11e4-8101-88055cef78c5.png" width="300px" />

Author: Andrew Or <andrew@databricks.com>

Closes #6171 from andrewor14/dag-viz-skipped and squashes the following commits:

f261797 [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-skipped
0eda358 [Andrew Or] Tweak skipped stage border color
c604150 [Andrew Or] Tweak grayscale colors
7010676 [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-skipped
762b541 [Andrew Or] Use special prefix for stage clusters to avoid collisions
51c95b9 [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-skipped
b928cd4 [Andrew Or] Fix potential leak + write tests for it
7c4c364 [Andrew Or] Show skipped stages differently
7cc34ce [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-skipped
c121fa2 [Andrew Or] Fix cache color
---
 .../apache/spark/ui/static/spark-dag-viz.css  |  71 +++---
 .../apache/spark/ui/static/spark-dag-viz.js   |  50 ++--
 .../scala/org/apache/spark/ui/UIUtils.scala   |   6 +-
 .../spark/ui/scope/RDDOperationGraph.scala    |  10 +-
 .../ui/scope/RDDOperationGraphListener.scala  |  96 ++++++--
 .../RDDOperationGraphListenerSuite.scala      | 227 ++++++++++++++----
 6 files changed, 352 insertions(+), 108 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css
index eedefb44b96fc..3b4ae2ed354b8 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css
+++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css
@@ -15,32 +15,21 @@
  * limitations under the License.
  */
 
-#dag-viz-graph svg path {
-  stroke: #444;
-  stroke-width: 1.5px;
-}
-
-#dag-viz-graph svg g.cluster rect {
-  stroke-width: 1px;
-}
-
-#dag-viz-graph svg g.node circle {
-  fill: #444;
+#dag-viz-graph a, #dag-viz-graph a:hover {
+  text-decoration: none;
 }
 
-#dag-viz-graph svg g.node rect {
-  fill: #C3EBFF;
-  stroke: #3EC0FF;
-  stroke-width: 1px;
+#dag-viz-graph .label {
+  font-weight: normal;
+  text-shadow: none;
 }
 
-#dag-viz-graph svg g.node.cached circle {
-  fill: #444;
+#dag-viz-graph svg path {
+  stroke: #444;
+  stroke-width: 1.5px;
 }
 
-#dag-viz-graph svg g.node.cached rect {
-  fill: #B3F5C5;
-  stroke: #56F578;
+#dag-viz-graph svg g.cluster rect {
   stroke-width: 1px;
 }
 
@@ -61,12 +50,23 @@
   stroke-width: 1px;
 }
 
-#dag-viz-graph svg.job g.cluster[class*="stage"] rect {
+#dag-viz-graph svg.job g.cluster.skipped rect {
+  fill: #D6D6D6;
+  stroke: #B7B7B7;
+  stroke-width: 1px;
+}
+
+#dag-viz-graph svg.job g.cluster.stage rect {
   fill: #FFFFFF;
   stroke: #FF99AC;
   stroke-width: 1px;
 }
 
+#dag-viz-graph svg.job g.cluster.stage.skipped rect {
+  stroke: #ADADAD;
+  stroke-width: 1px;
+}
+
 #dag-viz-graph svg.job g#cross-stage-edges path {
   fill: none;
 }
@@ -75,6 +75,20 @@
   fill: #333;
 }
 
+#dag-viz-graph svg.job g.cluster.skipped text {
+  fill: #666;
+}
+
+#dag-viz-graph svg.job g.node circle {
+  fill: #444;
+}
+
+#dag-viz-graph svg.job g.node.cached circle {
+  fill: #A3F545;
+  stroke: #52C366;
+  stroke-width: 2px;
+}
+
 /* Stage page specific styles */
 
 #dag-viz-graph svg.stage g.cluster rect {
@@ -83,7 +97,7 @@
   stroke-width: 1px;
 }
 
-#dag-viz-graph svg.stage g.cluster[class*="stage"] rect {
+#dag-viz-graph svg.stage g.cluster.stage rect {
   fill: #FFFFFF;
   stroke: #FFA6B6;
   stroke-width: 1px;
@@ -97,11 +111,14 @@
   fill: #333;
 }
 
-#dag-viz-graph a, #dag-viz-graph a:hover {
-  text-decoration: none;
+#dag-viz-graph svg.stage g.node rect {
+  fill: #C3EBFF;
+  stroke: #3EC0FF;
+  stroke-width: 1px;
 }
 
-#dag-viz-graph .label {
-  font-weight: normal;
-  text-shadow: none;
+#dag-viz-graph svg.stage g.node.cached rect {
+  fill: #B3F5C5;
+  stroke: #52C366;
+  stroke-width: 2px;
 }
diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
index ee48fd29a6432..aaeba5b1027c9 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
@@ -57,9 +57,7 @@ var VizConstants = {
   stageSep: 40,
   graphPrefix: "graph_",
   nodePrefix: "node_",
-  stagePrefix: "stage_",
-  clusterPrefix: "cluster_",
-  stageClusterPrefix: "cluster_stage_"
+  clusterPrefix: "cluster_"
 };
 
 var JobPageVizConstants = {
@@ -133,9 +131,7 @@ function renderDagViz(forJob) {
   }
 
   // Render
-  var svg = graphContainer()
-    .append("svg")
-    .attr("class", jobOrStage);
+  var svg = graphContainer().append("svg").attr("class", jobOrStage);
   if (forJob) {
     renderDagVizForJob(svg);
   } else {
@@ -185,23 +181,32 @@ function renderDagVizForJob(svgContainer) {
     var dot = metadata.select(".dot-file").text();
     var stageId = metadata.attr("stage-id");
     var containerId = VizConstants.graphPrefix + stageId;
-    // Link each graph to the corresponding stage page (TODO: handle stage attempts)
-    var stageLink = $("#stage-" + stageId.replace(VizConstants.stagePrefix, "") + "-0")
-      .find("a")
-      .attr("href") + "&expandDagViz=true";
-    var container = svgContainer
-      .append("a")
-      .attr("xlink:href", stageLink)
-      .append("g")
-      .attr("id", containerId);
+    var isSkipped = metadata.attr("skipped") == "true";
+    var container;
+    if (isSkipped) {
+      container = svgContainer
+        .append("g")
+        .attr("id", containerId)
+        .attr("skipped", "true");
+    } else {
+      // Link each graph to the corresponding stage page (TODO: handle stage attempts)
+      // Use the link from the stage table so it also works for the history server
+      var attemptId = 0
+      var stageLink = d3.select("#stage-" + stageId + "-" + attemptId)
+        .select("a")
+        .attr("href") + "&expandDagViz=true";
+      container = svgContainer
+        .append("a")
+        .attr("xlink:href", stageLink)
+        .append("g")
+        .attr("id", containerId);
+    }
 
     // Now we need to shift the container for this stage so it doesn't overlap with
     // existing ones, taking into account the position and width of the last stage's
     // container. We do not need to do this for the first stage of this job.
     if (i > 0) {
-      var existingStages = svgContainer
-        .selectAll("g.cluster")
-        .filter("[class*=\"" + VizConstants.stageClusterPrefix + "\"]");
+      var existingStages = svgContainer.selectAll("g.cluster.stage")
       if (!existingStages.empty()) {
         var lastStage = d3.select(existingStages[0].pop());
         var lastStageWidth = toFloat(lastStage.select("rect").attr("width"));
@@ -214,6 +219,12 @@ function renderDagVizForJob(svgContainer) {
     // Actually render the stage
     renderDot(dot, container, true);
 
+    // Mark elements as skipped if appropriate. Unfortunately we need to mark all
+    // elements instead of the parent container because of CSS override rules.
+    if (isSkipped) {
+      container.selectAll("g").classed("skipped", true);
+    }
+
     // Round corners on rectangles
     container
       .selectAll("rect")
@@ -243,6 +254,9 @@ function renderDot(dot, container, forJob) {
   var renderer = new dagreD3.render();
   preprocessGraphLayout(g, forJob);
   renderer(container, g);
+
+  // Find the stage cluster and mark it for styling and post-processing
+  container.selectAll("g.cluster[name*=\"Stage\"]").classed("stage", true);
 }
 
 /* -------------------- *
diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
index ad16becde85dd..6194c50ec8c7c 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -352,10 +352,12 @@ private[spark] object UIUtils extends Logging {
         </a>
       </span>
       <div id="dag-viz-graph"></div>
-      <div id="dag-viz-metadata">
+      <div id="dag-viz-metadata" style="display:none">
         {
           graphs.map { g =>
-            <div class="stage-metadata" stage-id={g.rootCluster.id} style="display:none">
+            val stageId = g.rootCluster.id.replaceAll(RDDOperationGraph.STAGE_CLUSTER_PREFIX, "")
+            val skipped = g.rootCluster.name.contains("skipped").toString
+            <div class="stage-metadata" stage-id={stageId} skipped={skipped}>
               <div class="dot-file">{RDDOperationGraph.makeDotFile(g)}</div>
               { g.incomingEdges.map { e => <div class="incoming-edge">{e.fromId},{e.toId}</div> } }
               { g.outgoingEdges.map { e => <div class="outgoing-edge">{e.fromId},{e.toId}</div> } }
diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
index 25d5c6ff7e9cd..33a7303be711c 100644
--- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
+++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
@@ -52,10 +52,13 @@ private[ui] case class RDDOperationEdge(fromId: Int, toId: Int)
  * This represents any grouping of RDDs, including operation scopes (e.g. textFile, flatMap),
  * stages, jobs, or any higher level construct. A cluster may be nested inside of other clusters.
  */
-private[ui] class RDDOperationCluster(val id: String, val name: String) {
+private[ui] class RDDOperationCluster(val id: String, private var _name: String) {
   private val _childNodes = new ListBuffer[RDDOperationNode]
   private val _childClusters = new ListBuffer[RDDOperationCluster]
 
+  def name: String = _name
+  def setName(n: String): Unit = { _name = n }
+
   def childNodes: Seq[RDDOperationNode] = _childNodes.iterator.toSeq
   def childClusters: Seq[RDDOperationCluster] = _childClusters.iterator.toSeq
   def attachChildNode(childNode: RDDOperationNode): Unit = { _childNodes += childNode }
@@ -71,6 +74,8 @@ private[ui] class RDDOperationCluster(val id: String, val name: String) {
 
 private[ui] object RDDOperationGraph extends Logging {
 
+  val STAGE_CLUSTER_PREFIX = "stage_"
+
   /**
    * Construct a RDDOperationGraph for a given stage.
    *
@@ -88,7 +93,8 @@ private[ui] object RDDOperationGraph extends Logging {
     val clusters = new mutable.HashMap[String, RDDOperationCluster] // indexed by cluster ID
 
     // Root cluster is the stage cluster
-    val stageClusterId = s"stage_${stage.stageId}"
+    // Use a special prefix here to differentiate this cluster from other operation clusters
+    val stageClusterId = STAGE_CLUSTER_PREFIX + stage.stageId
     val stageClusterName = s"Stage ${stage.stageId}" +
       { if (stage.attemptId == 0) "" else s" (attempt ${stage.attemptId})" }
     val rootCluster = new RDDOperationCluster(stageClusterId, stageClusterName)
diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraphListener.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraphListener.scala
index aa9c25cb5c8c6..89119cd3579ef 100644
--- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraphListener.scala
+++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraphListener.scala
@@ -27,8 +27,15 @@ import org.apache.spark.ui.SparkUI
  * A SparkListener that constructs a DAG of RDD operations.
  */
 private[ui] class RDDOperationGraphListener(conf: SparkConf) extends SparkListener {
+
+  // Note: the fate of jobs and stages are tied. This means when we clean up a job,
+  // we always clean up all of its stages. Similarly, when we clean up a stage, we
+  // always clean up its job (and, transitively, other stages in the same job).
   private[ui] val jobIdToStageIds = new mutable.HashMap[Int, Seq[Int]]
+  private[ui] val jobIdToSkippedStageIds = new mutable.HashMap[Int, Seq[Int]]
+  private[ui] val stageIdToJobId = new mutable.HashMap[Int, Int]
   private[ui] val stageIdToGraph = new mutable.HashMap[Int, RDDOperationGraph]
+  private[ui] val completedStageIds = new mutable.HashSet[Int]
 
   // Keep track of the order in which these are inserted so we can remove old ones
   private[ui] val jobIds = new mutable.ArrayBuffer[Int]
@@ -40,16 +47,23 @@ private[ui] class RDDOperationGraphListener(conf: SparkConf) extends SparkListen
   private val retainedStages =
     conf.getInt("spark.ui.retainedStages", SparkUI.DEFAULT_RETAINED_STAGES)
 
-  /** Return the graph metadata for the given stage, or None if no such information exists. */
+  /**
+   * Return the graph metadata for all stages in the given job.
+   * An empty list is returned if one or more of its stages has been cleaned up.
+   */
   def getOperationGraphForJob(jobId: Int): Seq[RDDOperationGraph] = synchronized {
-    val _stageIds = jobIdToStageIds.get(jobId).getOrElse { Seq.empty }
-    val graphs = _stageIds.flatMap { sid => stageIdToGraph.get(sid) }
-    // If the metadata for some stages have been removed, do not bother rendering this job
-    if (_stageIds.size != graphs.size) {
-      Seq.empty
-    } else {
-      graphs
+    val skippedStageIds = jobIdToSkippedStageIds.get(jobId).getOrElse(Seq.empty)
+    val graphs = jobIdToStageIds.get(jobId)
+      .getOrElse(Seq.empty)
+      .flatMap { sid => stageIdToGraph.get(sid) }
+    // Mark any skipped stages as such
+    graphs.foreach { g =>
+      val stageId = g.rootCluster.id.replaceAll(RDDOperationGraph.STAGE_CLUSTER_PREFIX, "").toInt
+      if (skippedStageIds.contains(stageId) && !g.rootCluster.name.contains("skipped")) {
+        g.rootCluster.setName(g.rootCluster.name + " (skipped)")
+      }
     }
+    graphs
   }
 
   /** Return the graph metadata for the given stage, or None if no such information exists. */
@@ -66,22 +80,68 @@ private[ui] class RDDOperationGraphListener(conf: SparkConf) extends SparkListen
     jobIdToStageIds(jobId) = jobStart.stageInfos.map(_.stageId).sorted
 
     stageInfos.foreach { stageInfo =>
-      stageIds += stageInfo.stageId
-      stageIdToGraph(stageInfo.stageId) = RDDOperationGraph.makeOperationGraph(stageInfo)
-      // Remove state for old stages
-      if (stageIds.size >= retainedStages) {
-        val toRemove = math.max(retainedStages / 10, 1)
-        stageIds.take(toRemove).foreach { id => stageIdToGraph.remove(id) }
-        stageIds.trimStart(toRemove)
-      }
+      val stageId = stageInfo.stageId
+      stageIds += stageId
+      stageIdToJobId(stageId) = jobId
+      stageIdToGraph(stageId) = RDDOperationGraph.makeOperationGraph(stageInfo)
+      trimStagesIfNecessary()
+    }
+
+    trimJobsIfNecessary()
+  }
+
+  /** Keep track of stages that have completed. */
+  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized {
+    val stageId = stageCompleted.stageInfo.stageId
+    if (stageIdToJobId.contains(stageId)) {
+      // Note: Only do this if the stage has not already been cleaned up
+      // Otherwise, we may never clean this stage from `completedStageIds`
+      completedStageIds += stageCompleted.stageInfo.stageId
+    }
+  }
+
+  /** On job end, find all stages in this job that are skipped and mark them as such. */
+  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = synchronized {
+    val jobId = jobEnd.jobId
+    jobIdToStageIds.get(jobId).foreach { stageIds =>
+      val skippedStageIds = stageIds.filter { sid => !completedStageIds.contains(sid) }
+      // Note: Only do this if the job has not already been cleaned up
+      // Otherwise, we may never clean this job from `jobIdToSkippedStageIds`
+      jobIdToSkippedStageIds(jobId) = skippedStageIds
     }
+  }
+
+  /** Clean metadata for old stages if we have exceeded the number to retain. */
+  private def trimStagesIfNecessary(): Unit = {
+    if (stageIds.size >= retainedStages) {
+      val toRemove = math.max(retainedStages / 10, 1)
+      stageIds.take(toRemove).foreach { id => cleanStage(id) }
+      stageIds.trimStart(toRemove)
+    }
+  }
 
-    // Remove state for old jobs
+  /** Clean metadata for old jobs if we have exceeded the number to retain. */
+  private def trimJobsIfNecessary(): Unit = {
     if (jobIds.size >= retainedJobs) {
       val toRemove = math.max(retainedJobs / 10, 1)
-      jobIds.take(toRemove).foreach { id => jobIdToStageIds.remove(id) }
+      jobIds.take(toRemove).foreach { id => cleanJob(id) }
       jobIds.trimStart(toRemove)
     }
   }
 
+  /** Clean metadata for the given stage, its job, and all other stages that belong to the job. */
+  private[ui] def cleanStage(stageId: Int): Unit = {
+    completedStageIds.remove(stageId)
+    stageIdToGraph.remove(stageId)
+    stageIdToJobId.remove(stageId).foreach { jobId => cleanJob(jobId) }
+  }
+
+  /** Clean metadata for the given job and all stages that belong to it. */
+  private[ui] def cleanJob(jobId: Int): Unit = {
+    jobIdToSkippedStageIds.remove(jobId)
+    jobIdToStageIds.remove(jobId).foreach { stageIds =>
+      stageIds.foreach { stageId => cleanStage(stageId) }
+    }
+  }
+
 }
diff --git a/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphListenerSuite.scala
index c659fc1e8b9a9..c1126f3af52e6 100644
--- a/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphListenerSuite.scala
@@ -20,67 +20,212 @@ package org.apache.spark.ui.scope
 import org.scalatest.FunSuite
 
 import org.apache.spark.SparkConf
-import org.apache.spark.scheduler.{SparkListenerJobStart, SparkListenerStageSubmitted, StageInfo}
+import org.apache.spark.scheduler._
+import org.apache.spark.scheduler.SparkListenerStageSubmitted
+import org.apache.spark.scheduler.SparkListenerStageCompleted
+import org.apache.spark.scheduler.SparkListenerJobStart
 
+/**
+ * Tests that this listener populates and cleans up its data structures properly.
+ */
 class RDDOperationGraphListenerSuite extends FunSuite {
   private var jobIdCounter = 0
   private var stageIdCounter = 0
+  private val maxRetainedJobs = 10
+  private val maxRetainedStages = 10
+  private val conf = new SparkConf()
+    .set("spark.ui.retainedJobs", maxRetainedJobs.toString)
+    .set("spark.ui.retainedStages", maxRetainedStages.toString)
 
-  /** Run a job with the specified number of stages. */
-  private def runOneJob(numStages: Int, listener: RDDOperationGraphListener): Unit = {
-    assert(numStages > 0, "I will not run a job with 0 stages for you.")
-    val stageInfos = (0 until numStages).map { _ =>
-      val stageInfo = new StageInfo(stageIdCounter, 0, "s", 0, Seq.empty, Seq.empty, "d")
-      stageIdCounter += 1
-      stageInfo
-    }
-    listener.onJobStart(new SparkListenerJobStart(jobIdCounter, 0, stageInfos))
-    jobIdCounter += 1
-  }
-
-  test("listener cleans up metadata") {
-
-    val conf = new SparkConf()
-      .set("spark.ui.retainedStages", "10")
-      .set("spark.ui.retainedJobs", "10")
-
+  test("run normal jobs") {
+    val startingJobId = jobIdCounter
+    val startingStageId = stageIdCounter
     val listener = new RDDOperationGraphListener(conf)
     assert(listener.jobIdToStageIds.isEmpty)
+    assert(listener.jobIdToSkippedStageIds.isEmpty)
+    assert(listener.stageIdToJobId.isEmpty)
     assert(listener.stageIdToGraph.isEmpty)
+    assert(listener.completedStageIds.isEmpty)
     assert(listener.jobIds.isEmpty)
     assert(listener.stageIds.isEmpty)
 
     // Run a few jobs, but not enough for clean up yet
-    runOneJob(1, listener)
-    runOneJob(2, listener)
-    runOneJob(3, listener)
+    (1 to 3).foreach { numStages => startJob(numStages, listener) } // start 3 jobs and 6 stages
+    (0 to 5).foreach { i => endStage(startingStageId + i, listener) } // finish all 6 stages
+    (0 to 2).foreach { i => endJob(startingJobId + i, listener) } // finish all 3 jobs
+
     assert(listener.jobIdToStageIds.size === 3)
+    assert(listener.jobIdToStageIds(startingJobId).size === 1)
+    assert(listener.jobIdToStageIds(startingJobId + 1).size === 2)
+    assert(listener.jobIdToStageIds(startingJobId + 2).size === 3)
+    assert(listener.jobIdToSkippedStageIds.size === 3)
+    assert(listener.jobIdToSkippedStageIds.values.forall(_.isEmpty)) // no skipped stages
+    assert(listener.stageIdToJobId.size === 6)
+    assert(listener.stageIdToJobId(startingStageId) === startingJobId)
+    assert(listener.stageIdToJobId(startingStageId + 1) === startingJobId + 1)
+    assert(listener.stageIdToJobId(startingStageId + 2) === startingJobId + 1)
+    assert(listener.stageIdToJobId(startingStageId + 3) === startingJobId + 2)
+    assert(listener.stageIdToJobId(startingStageId + 4) === startingJobId + 2)
+    assert(listener.stageIdToJobId(startingStageId + 5) === startingJobId + 2)
     assert(listener.stageIdToGraph.size === 6)
+    assert(listener.completedStageIds.size === 6)
     assert(listener.jobIds.size === 3)
     assert(listener.stageIds.size === 6)
+  }
+
+  test("run jobs with skipped stages") {
+    val startingJobId = jobIdCounter
+    val startingStageId = stageIdCounter
+    val listener = new RDDOperationGraphListener(conf)
+
+    // Run a few jobs, but not enough for clean up yet
+    // Leave some stages unfinished so that they are marked as skipped
+    (1 to 3).foreach { numStages => startJob(numStages, listener) } // start 3 jobs and 6 stages
+    (4 to 5).foreach { i => endStage(startingStageId + i, listener) } // finish only last 2 stages
+    (0 to 2).foreach { i => endJob(startingJobId + i, listener) } // finish all 3 jobs
+
+    assert(listener.jobIdToSkippedStageIds.size === 3)
+    assert(listener.jobIdToSkippedStageIds(startingJobId).size === 1)
+    assert(listener.jobIdToSkippedStageIds(startingJobId + 1).size === 2)
+    assert(listener.jobIdToSkippedStageIds(startingJobId + 2).size === 1) // 2 stages not skipped
+    assert(listener.completedStageIds.size === 2)
+
+    // The rest should be the same as before
+    assert(listener.jobIdToStageIds.size === 3)
+    assert(listener.jobIdToStageIds(startingJobId).size === 1)
+    assert(listener.jobIdToStageIds(startingJobId + 1).size === 2)
+    assert(listener.jobIdToStageIds(startingJobId + 2).size === 3)
+    assert(listener.stageIdToJobId.size === 6)
+    assert(listener.stageIdToJobId(startingStageId) === startingJobId)
+    assert(listener.stageIdToJobId(startingStageId + 1) === startingJobId + 1)
+    assert(listener.stageIdToJobId(startingStageId + 2) === startingJobId + 1)
+    assert(listener.stageIdToJobId(startingStageId + 3) === startingJobId + 2)
+    assert(listener.stageIdToJobId(startingStageId + 4) === startingJobId + 2)
+    assert(listener.stageIdToJobId(startingStageId + 5) === startingJobId + 2)
+    assert(listener.stageIdToGraph.size === 6)
+    assert(listener.jobIds.size === 3)
+    assert(listener.stageIds.size === 6)
+  }
+
+  test("clean up metadata") {
+    val startingJobId = jobIdCounter
+    val startingStageId = stageIdCounter
+    val listener = new RDDOperationGraphListener(conf)
 
-    // Run a few more, but this time the stages should be cleaned up, but not the jobs
-    runOneJob(5, listener)
-    runOneJob(100, listener)
-    assert(listener.jobIdToStageIds.size === 5)
-    assert(listener.stageIdToGraph.size === 9)
-    assert(listener.jobIds.size === 5)
-    assert(listener.stageIds.size === 9)
-
-    // Run a few more, but this time both jobs and stages should be cleaned up
-    (1 to 100).foreach { _ =>
-      runOneJob(1, listener)
+    // Run many jobs and stages to trigger clean up
+    (1 to 10000).foreach { i =>
+      // Note: this must be less than `maxRetainedStages`
+      val numStages = i % (maxRetainedStages - 2) + 1
+      val startingStageIdForJob = stageIdCounter
+      val jobId = startJob(numStages, listener)
+      // End some, but not all, stages that belong to this job
+      // This is to ensure that we have both completed and skipped stages
+      (startingStageIdForJob until stageIdCounter)
+        .filter { i => i % 2 == 0 }
+        .foreach { i => endStage(i, listener) }
+      // End all jobs
+      endJob(jobId, listener)
     }
-    assert(listener.jobIdToStageIds.size === 9)
-    assert(listener.stageIdToGraph.size === 9)
-    assert(listener.jobIds.size === 9)
-    assert(listener.stageIds.size === 9)
+
+    // Ensure we never exceed the max retained thresholds
+    assert(listener.jobIdToStageIds.size <= maxRetainedJobs)
+    assert(listener.jobIdToSkippedStageIds.size <= maxRetainedJobs)
+    assert(listener.stageIdToJobId.size <= maxRetainedStages)
+    assert(listener.stageIdToGraph.size <= maxRetainedStages)
+    assert(listener.completedStageIds.size <= maxRetainedStages)
+    assert(listener.jobIds.size <= maxRetainedJobs)
+    assert(listener.stageIds.size <= maxRetainedStages)
+
+    // Also ensure we're actually populating these data structures
+    // Otherwise the previous group of asserts will be meaningless
+    assert(listener.jobIdToStageIds.nonEmpty)
+    assert(listener.jobIdToSkippedStageIds.nonEmpty)
+    assert(listener.stageIdToJobId.nonEmpty)
+    assert(listener.stageIdToGraph.nonEmpty)
+    assert(listener.completedStageIds.nonEmpty)
+    assert(listener.jobIds.nonEmpty)
+    assert(listener.stageIds.nonEmpty)
 
     // Ensure we clean up old jobs and stages, not arbitrary ones
-    assert(!listener.jobIdToStageIds.contains(0))
-    assert(!listener.stageIdToGraph.contains(0))
-    assert(!listener.stageIds.contains(0))
-    assert(!listener.jobIds.contains(0))
+    assert(!listener.jobIdToStageIds.contains(startingJobId))
+    assert(!listener.jobIdToSkippedStageIds.contains(startingJobId))
+    assert(!listener.stageIdToJobId.contains(startingStageId))
+    assert(!listener.stageIdToGraph.contains(startingStageId))
+    assert(!listener.completedStageIds.contains(startingStageId))
+    assert(!listener.stageIds.contains(startingStageId))
+    assert(!listener.jobIds.contains(startingJobId))
+  }
+
+  test("fate sharing between jobs and stages") {
+    val startingJobId = jobIdCounter
+    val startingStageId = stageIdCounter
+    val listener = new RDDOperationGraphListener(conf)
+
+    // Run 3 jobs and 8 stages, finishing all 3 jobs but only 2 stages
+    startJob(5, listener)
+    startJob(1, listener)
+    startJob(2, listener)
+    (0 until 8).foreach { i => startStage(i + startingStageId, listener) }
+    endStage(startingStageId + 3, listener)
+    endStage(startingStageId + 4, listener)
+    (0 until 3).foreach { i => endJob(i + startingJobId, listener) }
+
+    // First, assert the old stuff
+    assert(listener.jobIdToStageIds.size === 3)
+    assert(listener.jobIdToSkippedStageIds.size === 3)
+    assert(listener.stageIdToJobId.size === 8)
+    assert(listener.stageIdToGraph.size === 8)
+    assert(listener.completedStageIds.size === 2)
+
+    // Cleaning the third job should clean all of its stages
+    listener.cleanJob(startingJobId + 2)
+    assert(listener.jobIdToStageIds.size === 2)
+    assert(listener.jobIdToSkippedStageIds.size === 2)
+    assert(listener.stageIdToJobId.size === 6)
+    assert(listener.stageIdToGraph.size === 6)
+    assert(listener.completedStageIds.size === 2)
+
+    // Cleaning one of the stages in the first job should clean that job and all of its stages
+    // Note that we still keep around the last stage because it belongs to a different job
+    listener.cleanStage(startingStageId)
+    assert(listener.jobIdToStageIds.size === 1)
+    assert(listener.jobIdToSkippedStageIds.size === 1)
+    assert(listener.stageIdToJobId.size === 1)
+    assert(listener.stageIdToGraph.size === 1)
+    assert(listener.completedStageIds.size === 0)
+  }
+
+  /** Start a job with the specified number of stages. */
+  private def startJob(numStages: Int, listener: RDDOperationGraphListener): Int = {
+    assert(numStages > 0, "I will not run a job with 0 stages for you.")
+    val stageInfos = (0 until numStages).map { _ =>
+      val stageInfo = new StageInfo(stageIdCounter, 0, "s", 0, Seq.empty, Seq.empty, "d")
+      stageIdCounter += 1
+      stageInfo
+    }
+    val jobId = jobIdCounter
+    listener.onJobStart(new SparkListenerJobStart(jobId, 0, stageInfos))
+    // Also start all stages that belong to this job
+    stageInfos.map(_.stageId).foreach { sid => startStage(sid, listener) }
+    jobIdCounter += 1
+    jobId
+  }
+
+  /** Start the stage specified by the given ID. */
+  private def startStage(stageId: Int, listener: RDDOperationGraphListener): Unit = {
+    val stageInfo = new StageInfo(stageId, 0, "s", 0, Seq.empty, Seq.empty, "d")
+    listener.onStageSubmitted(new SparkListenerStageSubmitted(stageInfo))
+  }
+
+  /** Finish the stage specified by the given ID. */
+  private def endStage(stageId: Int, listener: RDDOperationGraphListener): Unit = {
+    val stageInfo = new StageInfo(stageId, 0, "s", 0, Seq.empty, Seq.empty, "d")
+    listener.onStageCompleted(new SparkListenerStageCompleted(stageInfo))
+  }
+
+  /** Finish the job specified by the given ID. */
+  private def endJob(jobId: Int, listener: RDDOperationGraphListener): Unit = {
+    listener.onJobEnd(new SparkListenerJobEnd(jobId, 0, JobSucceeded))
   }
 
 }

From e1ac2a955be64b8df197195e3b225271cfa8201f Mon Sep 17 00:00:00 2001
From: Rene Treffer <treffer@measite.de>
Date: Mon, 18 May 2015 11:55:36 -0700
Subject: [PATCH 044/525] [SPARK-6888] [SQL] Make the jdbc driver handling
 user-definable

Replace the DriverQuirks with JdbcDialect(s) (and MySQLDialect/PostgresDialect)
and allow developers to change the dialects on the fly (for new JDBCRRDs only).

Some types (like an unsigned 64bit number) can be trivially mapped to java.
The status quo is that the RRD will fail to load.
This patch makes it possible to overwrite the type mapping to read e.g.
64Bit numbers as strings and handle them afterwards in software.

JDBCSuite has an example that maps all types to String, which should always
work (at the cost of extra code afterwards).

As a side effect it should now be possible to develop simple dialects
out-of-tree and even with spark-shell.

Author: Rene Treffer <treffer@measite.de>

Closes #5555 from rtreffer/jdbc-dialects and squashes the following commits:

3cbafd7 [Rene Treffer] [SPARK-6888] ignore classes belonging to changed API in MIMA report
fe7e2e8 [Rene Treffer] [SPARK-6888] Make the jdbc driver handling user-definable
---
 project/MimaExcludes.scala                    |   8 +
 .../apache/spark/sql/jdbc/DriverQuirks.scala  |  99 --------
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   |  11 +-
 .../apache/spark/sql/jdbc/JdbcDialects.scala  | 211 ++++++++++++++++++
 .../org/apache/spark/sql/jdbc/jdbc.scala      |  43 ++--
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala |  49 ++++
 6 files changed, 295 insertions(+), 126 deletions(-)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/jdbc/DriverQuirks.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 487062a31f77f..513bbaf98d804 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -137,6 +137,14 @@ object MimaExcludes {
             // implementing this interface in Java. Note that ShuffleWriter is private[spark].
             ProblemFilters.exclude[IncompatibleTemplateDefProblem](
               "org.apache.spark.shuffle.ShuffleWriter")
+          ) ++ Seq(
+            // SPARK-6888 make jdbc driver handling user definable
+            // This patch renames some classes to API friendly names.
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.DriverQuirks$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.DriverQuirks"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.PostgresQuirks"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.NoQuirks"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.MySQLQuirks")
           )
 
         case v if v.startsWith("1.3") =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DriverQuirks.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DriverQuirks.scala
deleted file mode 100644
index 0feabc4282f4a..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DriverQuirks.scala
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.jdbc
-
-import org.apache.spark.sql.types._
-
-import java.sql.Types
-
-
-/**
- * Encapsulates workarounds for the extensions, quirks, and bugs in various
- * databases.  Lots of databases define types that aren't explicitly supported
- * by the JDBC spec.  Some JDBC drivers also report inaccurate
- * information---for instance, BIT(n>1) being reported as a BIT type is quite
- * common, even though BIT in JDBC is meant for single-bit values.  Also, there
- * does not appear to be a standard name for an unbounded string or binary
- * type; we use BLOB and CLOB by default but override with database-specific
- * alternatives when these are absent or do not behave correctly.
- *
- * Currently, the only thing DriverQuirks does is handle type mapping.
- * `getCatalystType` is used when reading from a JDBC table and `getJDBCType`
- * is used when writing to a JDBC table.  If `getCatalystType` returns `null`,
- * the default type handling is used for the given JDBC type.  Similarly,
- * if `getJDBCType` returns `(null, None)`, the default type handling is used
- * for the given Catalyst type.
- */
-private[sql] abstract class DriverQuirks {
-  def getCatalystType(sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): DataType
-  def getJDBCType(dt: DataType): (String, Option[Int])
-}
-
-private[sql] object DriverQuirks {
-  /**
-   * Fetch the DriverQuirks class corresponding to a given database url.
-   */
-  def get(url: String): DriverQuirks = {
-    if (url.startsWith("jdbc:mysql")) {
-      new MySQLQuirks()
-    } else if (url.startsWith("jdbc:postgresql")) {
-      new PostgresQuirks()
-    } else {
-      new NoQuirks()
-    }
-  }
-}
-
-private[sql] class NoQuirks extends DriverQuirks {
-  def getCatalystType(sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): DataType =
-    null
-  def getJDBCType(dt: DataType): (String, Option[Int]) = (null, None)
-}
-
-private[sql] class PostgresQuirks extends DriverQuirks {
-  def getCatalystType(sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): DataType = {
-    if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
-      BinaryType
-    } else if (sqlType == Types.OTHER && typeName.equals("cidr")) {
-      StringType
-    } else if (sqlType == Types.OTHER && typeName.equals("inet")) {
-      StringType
-    } else null
-  }
-
-  def getJDBCType(dt: DataType): (String, Option[Int]) = dt match {
-    case StringType => ("TEXT", Some(java.sql.Types.CHAR))
-    case BinaryType => ("BYTEA", Some(java.sql.Types.BINARY))
-    case BooleanType => ("BOOLEAN", Some(java.sql.Types.BOOLEAN))
-    case _ => (null, None)
-  }
-}
-
-private[sql] class MySQLQuirks extends DriverQuirks {
-  def getCatalystType(sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): DataType = {
-    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
-      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
-      // byte arrays instead of longs.
-      md.putLong("binarylong", 1)
-      LongType
-    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
-      BooleanType
-    } else null
-  }
-  def getJDBCType(dt: DataType): (String, Option[Int]) = (null, None)
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 4189dfcf956c0..f7b19096eaacb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -41,7 +41,7 @@ private[sql] object JDBCRDD extends Logging {
 
   /**
    * Maps a JDBC type to a Catalyst type.  This function is called only when
-   * the DriverQuirks class corresponding to your database driver returns null.
+   * the JdbcDialect class corresponding to your database driver returns null.
    *
    * @param sqlType - A field of java.sql.Types
    * @return The Catalyst type corresponding to sqlType.
@@ -51,7 +51,7 @@ private[sql] object JDBCRDD extends Logging {
       case java.sql.Types.ARRAY         => null
       case java.sql.Types.BIGINT        => LongType
       case java.sql.Types.BINARY        => BinaryType
-      case java.sql.Types.BIT           => BooleanType // Per JDBC; Quirks handles quirky drivers.
+      case java.sql.Types.BIT           => BooleanType // @see JdbcDialect for quirks
       case java.sql.Types.BLOB          => BinaryType
       case java.sql.Types.BOOLEAN       => BooleanType
       case java.sql.Types.CHAR          => StringType
@@ -108,7 +108,7 @@ private[sql] object JDBCRDD extends Logging {
    * @throws SQLException if the table contains an unsupported type.
    */
   def resolveTable(url: String, table: String, properties: Properties): StructType = {
-    val quirks = DriverQuirks.get(url)
+    val dialect = JdbcDialects.get(url)
     val conn: Connection = DriverManager.getConnection(url, properties)
     try {
       val rs = conn.prepareStatement(s"SELECT * FROM $table WHERE 1=0").executeQuery()
@@ -125,8 +125,9 @@ private[sql] object JDBCRDD extends Logging {
           val fieldScale = rsmd.getScale(i + 1)
           val nullable = rsmd.isNullable(i + 1) != ResultSetMetaData.columnNoNulls
           val metadata = new MetadataBuilder().putString("name", columnName)
-          var columnType = quirks.getCatalystType(dataType, typeName, fieldSize, metadata)
-          if (columnType == null) columnType = getCatalystType(dataType, fieldSize, fieldScale)
+          val columnType =
+            dialect.getCatalystType(dataType, typeName, fieldSize, metadata).getOrElse(
+              getCatalystType(dataType, fieldSize, fieldScale))
           fields(i) = StructField(columnName, columnType, nullable, metadata.build())
           i = i + 1
         }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
new file mode 100644
index 0000000000000..6a169e106b968
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.jdbc
+
+import org.apache.spark.sql.types._
+import org.apache.spark.annotation.DeveloperApi
+
+import java.sql.Types
+
+/**
+ * :: DeveloperApi ::
+ * A database type definition coupled with the jdbc type needed to send null
+ * values to the database.
+ * @param databaseTypeDefinition The database type definition
+ * @param jdbcNullType The jdbc type (as defined in java.sql.Types) used to
+ *                     send a null value to the database.
+ */
+@DeveloperApi
+case class JdbcType(databaseTypeDefinition : String, jdbcNullType : Int)
+
+/**
+ * :: DeveloperApi ::
+ * Encapsulates everything (extensions, workarounds, quirks) to handle the
+ * SQL dialect of a certain database or jdbc driver.
+ * Lots of databases define types that aren't explicitly supported
+ * by the JDBC spec.  Some JDBC drivers also report inaccurate
+ * information---for instance, BIT(n>1) being reported as a BIT type is quite
+ * common, even though BIT in JDBC is meant for single-bit values.  Also, there
+ * does not appear to be a standard name for an unbounded string or binary
+ * type; we use BLOB and CLOB by default but override with database-specific
+ * alternatives when these are absent or do not behave correctly.
+ *
+ * Currently, the only thing done by the dialect is type mapping.
+ * `getCatalystType` is used when reading from a JDBC table and `getJDBCType`
+ * is used when writing to a JDBC table.  If `getCatalystType` returns `null`,
+ * the default type handling is used for the given JDBC type.  Similarly,
+ * if `getJDBCType` returns `(null, None)`, the default type handling is used
+ * for the given Catalyst type.
+ */
+@DeveloperApi
+abstract class JdbcDialect {
+  /**
+   * Check if this dialect instance can handle a certain jdbc url.
+   * @param url the jdbc url.
+   * @return True if the dialect can be applied on the given jdbc url.
+   * @throws NullPointerException if the url is null.
+   */
+  def canHandle(url : String): Boolean
+
+  /**
+   * Get the custom datatype mapping for the given jdbc meta information.
+   * @param sqlType The sql type (see java.sql.Types)
+   * @param typeName The sql type name (e.g. "BIGINT UNSIGNED")
+   * @param size The size of the type.
+   * @param md Result metadata associated with this type.
+   * @return The actual DataType (subclasses of [[org.apache.spark.sql.types.DataType]])
+   *         or null if the default type mapping should be used.
+   */
+  def getCatalystType(
+    sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = None
+
+  /**
+   * Retrieve the jdbc / sql type for a given datatype.
+   * @param dt The datatype (e.g. [[org.apache.spark.sql.types.StringType]])
+   * @return The new JdbcType if there is an override for this DataType
+   */
+  def getJDBCType(dt: DataType): Option[JdbcType] = None
+}
+
+/**
+ * :: DeveloperApi ::
+ * Registry of dialects that apply to every new jdbc [[org.apache.spark.sql.DataFrame]].
+ *
+ * If multiple matching dialects are registered then all matching ones will be
+ * tried in reverse order. A user-added dialect will thus be applied first,
+ * overwriting the defaults.
+ *
+ * Note that all new dialects are applied to new jdbc DataFrames only. Make
+ * sure to register your dialects first.
+ */
+@DeveloperApi
+object JdbcDialects {
+
+  private var dialects = List[JdbcDialect]()
+
+  /**
+   * Register a dialect for use on all new matching jdbc [[org.apache.spark.sql.DataFrame]].
+   * Readding an existing dialect will cause a move-to-front.
+   * @param dialect The new dialect.
+   */
+  def registerDialect(dialect: JdbcDialect) : Unit = {
+    dialects = dialect :: dialects.filterNot(_ == dialect)
+  }
+
+  /**
+   * Unregister a dialect. Does nothing if the dialect is not registered.
+   * @param dialect The jdbc dialect.
+   */
+  def unregisterDialect(dialect : JdbcDialect) : Unit = {
+    dialects = dialects.filterNot(_ == dialect)
+  }
+
+  registerDialect(MySQLDialect)
+  registerDialect(PostgresDialect)
+
+  /**
+   * Fetch the JdbcDialect class corresponding to a given database url.
+   */
+  private[sql] def get(url: String): JdbcDialect = {
+    val matchingDialects = dialects.filter(_.canHandle(url))
+    matchingDialects.length match {
+      case 0 => NoopDialect
+      case 1 => matchingDialects.head
+      case _ => new AggregatedDialect(matchingDialects)
+    }
+  }
+}
+
+/**
+ * :: DeveloperApi ::
+ * AggregatedDialect can unify multiple dialects into one virtual Dialect.
+ * Dialects are tried in order, and the first dialect that does not return a
+ * neutral element will will.
+ * @param dialects List of dialects.
+ */
+@DeveloperApi
+class AggregatedDialect(dialects: List[JdbcDialect]) extends JdbcDialect {
+
+  require(!dialects.isEmpty)
+
+  def canHandle(url : String): Boolean =
+    dialects.map(_.canHandle(url)).reduce(_ && _)
+
+  override def getCatalystType(
+      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
+    dialects.map(_.getCatalystType(sqlType, typeName, size, md)).flatten.headOption
+
+  override def getJDBCType(dt: DataType): Option[JdbcType] =
+    dialects.map(_.getJDBCType(dt)).flatten.headOption
+
+}
+
+/**
+ * :: DeveloperApi ::
+ * NOOP dialect object, always returning the neutral element.
+ */
+@DeveloperApi
+case object NoopDialect extends JdbcDialect {
+  def canHandle(url : String): Boolean = true
+}
+
+/**
+ * :: DeveloperApi ::
+ * Default postgres dialect, mapping bit/cidr/inet on read and string/binary/boolean on write.
+ */
+@DeveloperApi
+case object PostgresDialect extends JdbcDialect {
+  def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")
+  override def getCatalystType(
+      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
+    if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
+      Some(BinaryType)
+    } else if (sqlType == Types.OTHER && typeName.equals("cidr")) {
+      Some(StringType)
+    } else if (sqlType == Types.OTHER && typeName.equals("inet")) {
+      Some(StringType)
+    } else None
+  }
+
+  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
+    case StringType => Some(JdbcType("TEXT", java.sql.Types.CHAR))
+    case BinaryType => Some(JdbcType("BYTEA", java.sql.Types.BINARY))
+    case BooleanType => Some(JdbcType("BOOLEAN", java.sql.Types.BOOLEAN))
+    case _ => None
+  }
+}
+
+/**
+ * :: DeveloperApi ::
+ * Default mysql dialect to read bit/bitsets correctly.
+ */
+@DeveloperApi
+case object MySQLDialect extends JdbcDialect {
+  def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")
+  override def getCatalystType(
+      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
+    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
+      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
+      // byte arrays instead of longs.
+      md.putLong("binarylong", 1)
+      Some(LongType)
+    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
+      Some(BooleanType)
+    } else None
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
index a61790b8472c8..f21dd29aca37f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
@@ -129,25 +129,26 @@ package object jdbc {
      */
     def schemaString(df: DataFrame, url: String): String = {
       val sb = new StringBuilder()
-      val quirks = DriverQuirks.get(url)
+      val dialect = JdbcDialects.get(url)
       df.schema.fields foreach { field => {
         val name = field.name
-        var typ: String = quirks.getJDBCType(field.dataType)._1
-        if (typ == null) typ = field.dataType match {
-          case IntegerType => "INTEGER"
-          case LongType => "BIGINT"
-          case DoubleType => "DOUBLE PRECISION"
-          case FloatType => "REAL"
-          case ShortType => "INTEGER"
-          case ByteType => "BYTE"
-          case BooleanType => "BIT(1)"
-          case StringType => "TEXT"
-          case BinaryType => "BLOB"
-          case TimestampType => "TIMESTAMP"
-          case DateType => "DATE"
-          case DecimalType.Unlimited => "DECIMAL(40,20)"
-          case _ => throw new IllegalArgumentException(s"Don't know how to save $field to JDBC")
-        }
+        val typ: String =
+          dialect.getJDBCType(field.dataType).map(_.databaseTypeDefinition).getOrElse(
+          field.dataType match {
+            case IntegerType => "INTEGER"
+            case LongType => "BIGINT"
+            case DoubleType => "DOUBLE PRECISION"
+            case FloatType => "REAL"
+            case ShortType => "INTEGER"
+            case ByteType => "BYTE"
+            case BooleanType => "BIT(1)"
+            case StringType => "TEXT"
+            case BinaryType => "BLOB"
+            case TimestampType => "TIMESTAMP"
+            case DateType => "DATE"
+            case DecimalType.Unlimited => "DECIMAL(40,20)"
+            case _ => throw new IllegalArgumentException(s"Don't know how to save $field to JDBC")
+          })
         val nullable = if (field.nullable) "" else "NOT NULL"
         sb.append(s", $name $typ $nullable")
       }}
@@ -162,10 +163,9 @@ package object jdbc {
         url: String,
         table: String,
         properties: Properties = new Properties()) {
-      val quirks = DriverQuirks.get(url)
+      val dialect = JdbcDialects.get(url)
       val nullTypes: Array[Int] = df.schema.fields.map { field =>
-        val nullType: Option[Int] = quirks.getJDBCType(field.dataType)._2
-        if (nullType.isEmpty) {
+        dialect.getJDBCType(field.dataType).map(_.jdbcNullType).getOrElse(
           field.dataType match {
             case IntegerType => java.sql.Types.INTEGER
             case LongType => java.sql.Types.BIGINT
@@ -181,8 +181,7 @@ package object jdbc {
             case DecimalType.Unlimited => java.sql.Types.DECIMAL
             case _ => throw new IllegalArgumentException(
               s"Can't translate null value for field $field")
-          }
-        } else nullType.get
+          })
       }
 
       val rddSchema = df.schema
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 5a7b6f0aac6f7..a8dddfb9b6858 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -35,6 +35,13 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
 
   val testBytes = Array[Byte](99.toByte, 134.toByte, 135.toByte, 200.toByte, 205.toByte)
 
+  val testH2Dialect = new JdbcDialect {
+    def canHandle(url: String) : Boolean = url.startsWith("jdbc:h2")
+    override def getCatalystType(
+        sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
+      Some(StringType)
+  }
+
   before {
     Class.forName("org.h2.Driver")
     // Extra properties that will be specified for our database. We need these to test
@@ -353,4 +360,46 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
         """.stripMargin.replaceAll("\n", " "))
     }
   }
+
+  test("Remap types via JdbcDialects") {
+    JdbcDialects.registerDialect(testH2Dialect)
+    val df = TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", new Properties)
+    assert(df.schema.filter(
+      _.dataType != org.apache.spark.sql.types.StringType
+    ).isEmpty)
+    val rows = df.collect()
+    assert(rows(0).get(0).isInstanceOf[String])
+    assert(rows(0).get(1).isInstanceOf[String])
+    JdbcDialects.unregisterDialect(testH2Dialect)
+  }
+
+  test("Default jdbc dialect registration") {
+    assert(JdbcDialects.get("jdbc:mysql://127.0.0.1/db") == MySQLDialect)
+    assert(JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") == PostgresDialect)
+    assert(JdbcDialects.get("test.invalid") == NoopDialect)
+  }
+
+  test("Dialect unregister") {
+    JdbcDialects.registerDialect(testH2Dialect)
+    JdbcDialects.unregisterDialect(testH2Dialect)
+    assert(JdbcDialects.get(urlWithUserAndPass) == NoopDialect)
+  }
+
+  test("Aggregated dialects") {
+    val agg = new AggregatedDialect(List(new JdbcDialect {
+      def canHandle(url: String) : Boolean = url.startsWith("jdbc:h2:")
+      override def getCatalystType(
+          sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
+        if (sqlType % 2 == 0) {
+          Some(LongType)
+        } else {
+          None
+        }
+    }, testH2Dialect))
+    assert(agg.canHandle("jdbc:h2:xxx"))
+    assert(!agg.canHandle("jdbc:h2"))
+    assert(agg.getCatalystType(0,"",1,null) == Some(LongType))
+    assert(agg.getCatalystType(1,"",1,null) == Some(StringType))
+  }
+
 }

From 010a1c278037130a69dcc79427d2b0380a2c82d8 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Mon, 18 May 2015 11:59:44 -0700
Subject: [PATCH 045/525] [SPARK-7570] [SQL] Ignores _temporary during
 partition discovery

<!-- Reviewable:start -->
[<img src="https://reviewable.io/review_button.png" height=40 alt="Review on Reviewable"/>](https://reviewable.io/reviews/apache/spark/6091)
<!-- Reviewable:end -->

Author: Cheng Lian <lian@databricks.com>

Closes #6091 from liancheng/spark-7570 and squashes the following commits:

8ff07e8 [Cheng Lian] Ignores _temporary during partition discovery
---
 .../spark/sql/sources/PartitioningUtils.scala | 15 ++++++---
 .../ParquetPartitionDiscoverySuite.scala      | 31 ++++++++++---------
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
index d1f0cdab55f66..8f8138d6ebebc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
@@ -23,8 +23,7 @@ import java.math.{BigDecimal => JBigDecimal}
 import scala.collection.mutable.ArrayBuffer
 import scala.util.Try
 
-import com.google.common.cache.{CacheBuilder, Cache}
-import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
@@ -69,7 +68,7 @@ private[sql] object PartitioningUtils {
   private[sql] def parsePartitions(
       paths: Seq[Path],
       defaultPartitionName: String): PartitionSpec = {
-    val partitionValues = resolvePartitions(paths.map(parsePartition(_, defaultPartitionName)))
+    val partitionValues = resolvePartitions(paths.flatMap(parsePartition(_, defaultPartitionName)))
     val fields = {
       val (PartitionValues(columnNames, literals)) = partitionValues.head
       columnNames.zip(literals).map { case (name, Literal(_, dataType)) =>
@@ -103,13 +102,19 @@ private[sql] object PartitioningUtils {
    */
   private[sql] def parsePartition(
       path: Path,
-      defaultPartitionName: String): PartitionValues = {
+      defaultPartitionName: String): Option[PartitionValues] = {
     val columns = ArrayBuffer.empty[(String, Literal)]
     // Old Hadoop versions don't have `Path.isRoot`
     var finished = path.getParent == null
     var chopped = path
 
     while (!finished) {
+      // Sometimes (e.g., when speculative task is enabled), temporary directories may be left
+      // uncleaned.  Here we simply ignore them.
+      if (chopped.getName == "_temporary") {
+        return None
+      }
+
       val maybeColumn = parsePartitionColumn(chopped.getName, defaultPartitionName)
       maybeColumn.foreach(columns += _)
       chopped = chopped.getParent
@@ -117,7 +122,7 @@ private[sql] object PartitioningUtils {
     }
 
     val (columnNames, values) = columns.reverse.unzip
-    PartitionValues(columnNames, values)
+    Some(PartitionValues(columnNames, values))
   }
 
   private def parsePartitionColumn(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 8079c460713da..1927114b8d58f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -54,44 +54,47 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
   }
 
   test("parse partition") {
-    def check(path: String, expected: PartitionValues): Unit = {
+    def check(path: String, expected: Option[PartitionValues]): Unit = {
       assert(expected === parsePartition(new Path(path), defaultPartitionName))
     }
 
     def checkThrows[T <: Throwable: Manifest](path: String, expected: String): Unit = {
       val message = intercept[T] {
-        parsePartition(new Path(path), defaultPartitionName)
+        parsePartition(new Path(path), defaultPartitionName).get
       }.getMessage
 
       assert(message.contains(expected))
     }
 
-    check(
-      "file:///",
+    check("file:///", Some {
       PartitionValues(
         ArrayBuffer.empty[String],
-        ArrayBuffer.empty[Literal]))
+        ArrayBuffer.empty[Literal])
+    })
 
-    check(
-      "file://path/a=10",
+    check("file://path/a=10", Some {
       PartitionValues(
         ArrayBuffer("a"),
-        ArrayBuffer(Literal.create(10, IntegerType))))
+        ArrayBuffer(Literal.create(10, IntegerType)))
+    })
 
-    check(
-      "file://path/a=10/b=hello/c=1.5",
+    check("file://path/a=10/b=hello/c=1.5", Some {
       PartitionValues(
         ArrayBuffer("a", "b", "c"),
         ArrayBuffer(
           Literal.create(10, IntegerType),
           Literal.create("hello", StringType),
-          Literal.create(1.5, FloatType))))
+          Literal.create(1.5, FloatType)))
+    })
 
-    check(
-      "file://path/a=10/b_hello/c=1.5",
+    check("file://path/a=10/b_hello/c=1.5", Some {
       PartitionValues(
         ArrayBuffer("c"),
-        ArrayBuffer(Literal.create(1.5, FloatType))))
+        ArrayBuffer(Literal.create(1.5, FloatType)))
+    })
+
+    check("file://path/a=10/_temporary/c=1.5", None)
+    check("file://path/a=10/c=1.5/_temporary", None)
 
     checkThrows[AssertionError]("file://path/=10", "Empty partition column name")
     checkThrows[AssertionError]("file://path/a=", "Empty partition column value")

From 56ede88485cfca90974425fcb603b257be47229b Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 18 May 2015 12:01:30 -0700
Subject: [PATCH 046/525] [SQL] [MINOR] [THIS] use private for internal field
 in ScalaUdf

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6235 from cloud-fan/tmp and squashes the following commits:

8f16367 [Wenchen Fan] use private[this]
---
 .../apache/spark/sql/catalyst/expressions/ScalaUdf.scala  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
index d22eb10ad399f..fe2873e0be34d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
@@ -55,9 +55,9 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
     }.foreach(println)
 
   */
-  
-  val f = children.size match {
-    case 0 => 
+
+  private[this] val f = children.size match {
+    case 0 =>
       val func = function.asInstanceOf[() => Any]
       (input: Row) => {
         func()
@@ -956,7 +956,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
   }
 
   // scalastyle:on
-  val converter = CatalystTypeConverters.createToCatalystConverter(dataType)
+  private[this] val converter = CatalystTypeConverters.createToCatalystConverter(dataType)
   override def eval(input: Row): Any = converter(f(input))
 
 }

From 9c7e802a5a2b8cd3eb77642f84c54a8e976fc996 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Mon, 18 May 2015 12:02:18 -0700
Subject: [PATCH 047/525] [SPARK-7380] [MLLIB] pipeline stages should be
 copyable in Python

This PR makes pipeline stages in Python copyable and hence simplifies some implementations. It also includes the following changes:

1. Rename `paramMap` and `defaultParamMap` to `_paramMap` and `_defaultParamMap`, respectively.
2. Accept a list of param maps in `fit`.
3. Use parent uid and name to identify param.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>
Author: Joseph K. Bradley <joseph@databricks.com>

Closes #6088 from mengxr/SPARK-7380 and squashes the following commits:

413c463 [Xiangrui Meng] remove unnecessary doc
4159f35 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7380
611c719 [Xiangrui Meng] fix python style
68862b8 [Xiangrui Meng] update _java_obj initialization
927ad19 [Xiangrui Meng] fix ml/tests.py
0138fc3 [Xiangrui Meng] update feature transformers and fix a bug in RegexTokenizer
9ca44fb [Xiangrui Meng] simplify Java wrappers and add tests
c7d84ef [Xiangrui Meng] update ml/tests.py to test copy params
7e0d27f [Xiangrui Meng] merge master
46840fb [Xiangrui Meng] update wrappers
b6db1ed [Xiangrui Meng] update all self.paramMap to self._paramMap
46cb6ed [Xiangrui Meng] merge master
a163413 [Xiangrui Meng] fix style
1042e80 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7380
9630eae [Xiangrui Meng] fix Identifiable._randomUID
13bd70a [Xiangrui Meng] update ml/tests.py
64a536c [Xiangrui Meng] use _fit/_transform/_evaluate to simplify the impl
02abf13 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into copyable-python
66ce18c [Joseph K. Bradley] some cleanups before sending to Xiangrui
7431272 [Joseph K. Bradley] Rebased with master
---
 .../apache/spark/ml/feature/Tokenizer.scala   |   2 +-
 .../org/apache/spark/ml/param/params.scala    |   7 +-
 .../apache/spark/ml/util/Identifiable.scala   |   6 +-
 python/pyspark/ml/classification.py           |  35 ++---
 python/pyspark/ml/evaluation.py               |   6 +-
 python/pyspark/ml/feature.py                  |  91 +++++++------
 python/pyspark/ml/param/__init__.py           | 118 ++++++++++++-----
 .../ml/param/_shared_params_code_gen.py       |   2 +-
 python/pyspark/ml/param/shared.py             |  42 +++---
 python/pyspark/ml/pipeline.py                 | 109 +++++++++++----
 python/pyspark/ml/recommendation.py           |  25 ++--
 python/pyspark/ml/regression.py               |  30 +++--
 python/pyspark/ml/tests.py                    | 105 ++++++++++-----
 python/pyspark/ml/tuning.py                   |  43 ++++--
 python/pyspark/ml/util.py                     |  13 +-
 python/pyspark/ml/wrapper.py                  | 125 +++++++++++-------
 16 files changed, 498 insertions(+), 261 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 36d9e17eca41b..3f7f4f96fc422 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -61,7 +61,7 @@ class RegexTokenizer(override val uid: String)
    * Default: 1, to avoid returning empty strings
    * @group param
    */
-  val minTokenLength: IntParam = new IntParam(this, "minLength", "minimum token length (>= 0)",
+  val minTokenLength: IntParam = new IntParam(this, "minTokenLength", "minimum token length (>= 0)",
     ParamValidators.gtEq(0))
 
   /** @group setParam */
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 247e08be1bb15..c33b66d31cd4f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -483,16 +483,15 @@ trait Params extends Identifiable with Serializable {
   def copy(extra: ParamMap): Params = {
     val that = this.getClass.getConstructor(classOf[String]).newInstance(uid)
     copyValues(that, extra)
-    that
   }
 
   /**
    * Extracts the embedded default param values and user-supplied values, and then merges them with
    * extra values from input into a flat param map, where the latter value is used if there exist
-   * conflicts, i.e., with ordering: default param values < user-supplied values < extraParamMap.
+   * conflicts, i.e., with ordering: default param values < user-supplied values < extra.
    */
-  final def extractParamMap(extraParamMap: ParamMap): ParamMap = {
-    defaultParamMap ++ paramMap ++ extraParamMap
+  final def extractParamMap(extra: ParamMap): ParamMap = {
+    defaultParamMap ++ paramMap ++ extra
   }
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
index 146697680092c..ddd34a54503a6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
@@ -23,15 +23,17 @@ import java.util.UUID
 /**
  * Trait for an object with an immutable unique ID that identifies itself and its derivatives.
  */
-trait Identifiable {
+private[spark] trait Identifiable {
 
   /**
    * An immutable unique ID for the object and its derivatives.
    */
   val uid: String
+
+  override def toString: String = uid
 }
 
-object Identifiable {
+private[spark] object Identifiable {
 
   /**
    * Returns a random UID that concatenates the given prefix, "_", and 12 random hex chars.
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 1411d3fd9c56e..4e645519c47c7 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -55,7 +55,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
         ...
     TypeError: Method setParams forces keyword arguments.
     """
-    _java_class = "org.apache.spark.ml.classification.LogisticRegression"
+
     # a placeholder to make it appear in the generated doc
     elasticNetParam = \
         Param(Params._dummy(), "elasticNetParam",
@@ -75,6 +75,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  threshold=0.5, probabilityCol="probability")
         """
         super(LogisticRegression, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.classification.LogisticRegression", self.uid)
         #: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty
         #  is an L2 penalty. For alpha = 1, it is an L1 penalty.
         self.elasticNetParam = \
@@ -111,7 +113,7 @@ def setElasticNetParam(self, value):
         """
         Sets the value of :py:attr:`elasticNetParam`.
         """
-        self.paramMap[self.elasticNetParam] = value
+        self._paramMap[self.elasticNetParam] = value
         return self
 
     def getElasticNetParam(self):
@@ -124,7 +126,7 @@ def setFitIntercept(self, value):
         """
         Sets the value of :py:attr:`fitIntercept`.
         """
-        self.paramMap[self.fitIntercept] = value
+        self._paramMap[self.fitIntercept] = value
         return self
 
     def getFitIntercept(self):
@@ -137,7 +139,7 @@ def setThreshold(self, value):
         """
         Sets the value of :py:attr:`threshold`.
         """
-        self.paramMap[self.threshold] = value
+        self._paramMap[self.threshold] = value
         return self
 
     def getThreshold(self):
@@ -208,7 +210,6 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     1.0
     """
 
-    _java_class = "org.apache.spark.ml.classification.DecisionTreeClassifier"
     # a placeholder to make it appear in the generated doc
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation (case-insensitive). " +
@@ -224,6 +225,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini")
         """
         super(DecisionTreeClassifier, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.classification.DecisionTreeClassifier", self.uid)
         #: param for Criterion used for information gain calculation (case-insensitive).
         self.impurity = \
             Param(self, "impurity",
@@ -256,7 +259,7 @@ def setImpurity(self, value):
         """
         Sets the value of :py:attr:`impurity`.
         """
-        self.paramMap[self.impurity] = value
+        self._paramMap[self.impurity] = value
         return self
 
     def getImpurity(self):
@@ -299,7 +302,6 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     1.0
     """
 
-    _java_class = "org.apache.spark.ml.classification.RandomForestClassifier"
     # a placeholder to make it appear in the generated doc
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation (case-insensitive). " +
@@ -325,6 +327,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  numTrees=20, featureSubsetStrategy="auto", seed=42)
         """
         super(RandomForestClassifier, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.classification.RandomForestClassifier", self.uid)
         #: param for Criterion used for information gain calculation (case-insensitive).
         self.impurity = \
             Param(self, "impurity",
@@ -370,7 +374,7 @@ def setImpurity(self, value):
         """
         Sets the value of :py:attr:`impurity`.
         """
-        self.paramMap[self.impurity] = value
+        self._paramMap[self.impurity] = value
         return self
 
     def getImpurity(self):
@@ -383,7 +387,7 @@ def setSubsamplingRate(self, value):
         """
         Sets the value of :py:attr:`subsamplingRate`.
         """
-        self.paramMap[self.subsamplingRate] = value
+        self._paramMap[self.subsamplingRate] = value
         return self
 
     def getSubsamplingRate(self):
@@ -396,7 +400,7 @@ def setNumTrees(self, value):
         """
         Sets the value of :py:attr:`numTrees`.
         """
-        self.paramMap[self.numTrees] = value
+        self._paramMap[self.numTrees] = value
         return self
 
     def getNumTrees(self):
@@ -409,7 +413,7 @@ def setFeatureSubsetStrategy(self, value):
         """
         Sets the value of :py:attr:`featureSubsetStrategy`.
         """
-        self.paramMap[self.featureSubsetStrategy] = value
+        self._paramMap[self.featureSubsetStrategy] = value
         return self
 
     def getFeatureSubsetStrategy(self):
@@ -452,7 +456,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
     1.0
     """
 
-    _java_class = "org.apache.spark.ml.classification.GBTClassifier"
     # a placeholder to make it appear in the generated doc
     lossType = Param(Params._dummy(), "lossType",
                      "Loss function which GBT tries to minimize (case-insensitive). " +
@@ -476,6 +479,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  lossType="logistic", maxIter=20, stepSize=0.1)
         """
         super(GBTClassifier, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.classification.GBTClassifier", self.uid)
         #: param for Loss function which GBT tries to minimize (case-insensitive).
         self.lossType = Param(self, "lossType",
                               "Loss function which GBT tries to minimize (case-insensitive). " +
@@ -517,7 +522,7 @@ def setLossType(self, value):
         """
         Sets the value of :py:attr:`lossType`.
         """
-        self.paramMap[self.lossType] = value
+        self._paramMap[self.lossType] = value
         return self
 
     def getLossType(self):
@@ -530,7 +535,7 @@ def setSubsamplingRate(self, value):
         """
         Sets the value of :py:attr:`subsamplingRate`.
         """
-        self.paramMap[self.subsamplingRate] = value
+        self._paramMap[self.subsamplingRate] = value
         return self
 
     def getSubsamplingRate(self):
@@ -543,7 +548,7 @@ def setStepSize(self, value):
         """
         Sets the value of :py:attr:`stepSize`.
         """
-        self.paramMap[self.stepSize] = value
+        self._paramMap[self.stepSize] = value
         return self
 
     def getStepSize(self):
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 02020ebff94c2..f4655c513cae7 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -42,8 +42,6 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
     0.83...
     """
 
-    _java_class = "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator"
-
     # a placeholder to make it appear in the generated doc
     metricName = Param(Params._dummy(), "metricName",
                        "metric name in evaluation (areaUnderROC|areaUnderPR)")
@@ -56,6 +54,8 @@ def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
                  metricName="areaUnderROC")
         """
         super(BinaryClassificationEvaluator, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
         #: param for metric name in evaluation (areaUnderROC|areaUnderPR)
         self.metricName = Param(self, "metricName",
                                 "metric name in evaluation (areaUnderROC|areaUnderPR)")
@@ -68,7 +68,7 @@ def setMetricName(self, value):
         """
         Sets the value of :py:attr:`metricName`.
         """
-        self.paramMap[self.metricName] = value
+        self._paramMap[self.metricName] = value
         return self
 
     def getMetricName(self):
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 58e22190c7c3c..c8115cb5bcf63 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -43,7 +43,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
     1.0
     """
 
-    _java_class = "org.apache.spark.ml.feature.Binarizer"
     # a placeholder to make it appear in the generated doc
     threshold = Param(Params._dummy(), "threshold",
                       "threshold in binary classification prediction, in range [0, 1]")
@@ -54,6 +53,7 @@ def __init__(self, threshold=0.0, inputCol=None, outputCol=None):
         __init__(self, threshold=0.0, inputCol=None, outputCol=None)
         """
         super(Binarizer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid)
         self.threshold = Param(self, "threshold",
                                "threshold in binary classification prediction, in range [0, 1]")
         self._setDefault(threshold=0.0)
@@ -73,7 +73,7 @@ def setThreshold(self, value):
         """
         Sets the value of :py:attr:`threshold`.
         """
-        self.paramMap[self.threshold] = value
+        self._paramMap[self.threshold] = value
         return self
 
     def getThreshold(self):
@@ -104,7 +104,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
     0.0
     """
 
-    _java_class = "org.apache.spark.ml.feature.Bucketizer"
     # a placeholder to make it appear in the generated doc
     splits = \
         Param(Params._dummy(), "splits",
@@ -121,6 +120,7 @@ def __init__(self, splits=None, inputCol=None, outputCol=None):
         __init__(self, splits=None, inputCol=None, outputCol=None)
         """
         super(Bucketizer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
         #: param for Splitting points for mapping continuous features into buckets. With n+1 splits,
         #  there are n buckets. A bucket defined by splits x,y holds values in the range [x,y)
         #  except the last bucket, which also includes y. The splits should be strictly increasing.
@@ -150,7 +150,7 @@ def setSplits(self, value):
         """
         Sets the value of :py:attr:`splits`.
         """
-        self.paramMap[self.splits] = value
+        self._paramMap[self.splits] = value
         return self
 
     def getSplits(self):
@@ -177,14 +177,13 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures):
     SparseVector(5, {2: 1.0, 3: 1.0, 4: 1.0})
     """
 
-    _java_class = "org.apache.spark.ml.feature.HashingTF"
-
     @keyword_only
     def __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None):
         """
         __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None)
         """
         super(HashingTF, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid)
         self._setDefault(numFeatures=1 << 18)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -217,8 +216,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol):
     DenseVector([0.2877, 0.0])
     """
 
-    _java_class = "org.apache.spark.ml.feature.IDF"
-
     # a placeholder to make it appear in the generated doc
     minDocFreq = Param(Params._dummy(), "minDocFreq",
                        "minimum of documents in which a term should appear for filtering")
@@ -229,6 +226,7 @@ def __init__(self, minDocFreq=0, inputCol=None, outputCol=None):
         __init__(self, minDocFreq=0, inputCol=None, outputCol=None)
         """
         super(IDF, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid)
         self.minDocFreq = Param(self, "minDocFreq",
                                 "minimum of documents in which a term should appear for filtering")
         self._setDefault(minDocFreq=0)
@@ -248,7 +246,7 @@ def setMinDocFreq(self, value):
         """
         Sets the value of :py:attr:`minDocFreq`.
         """
-        self.paramMap[self.minDocFreq] = value
+        self._paramMap[self.minDocFreq] = value
         return self
 
     def getMinDocFreq(self):
@@ -257,6 +255,9 @@ def getMinDocFreq(self):
         """
         return self.getOrDefault(self.minDocFreq)
 
+    def _create_model(self, java_model):
+        return IDFModel(java_model)
+
 
 class IDFModel(JavaModel):
     """
@@ -285,14 +286,13 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
     # a placeholder to make it appear in the generated doc
     p = Param(Params._dummy(), "p", "the p norm value.")
 
-    _java_class = "org.apache.spark.ml.feature.Normalizer"
-
     @keyword_only
     def __init__(self, p=2.0, inputCol=None, outputCol=None):
         """
         __init__(self, p=2.0, inputCol=None, outputCol=None)
         """
         super(Normalizer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid)
         self.p = Param(self, "p", "the p norm value.")
         self._setDefault(p=2.0)
         kwargs = self.__init__._input_kwargs
@@ -311,7 +311,7 @@ def setP(self, value):
         """
         Sets the value of :py:attr:`p`.
         """
-        self.paramMap[self.p] = value
+        self._paramMap[self.p] = value
         return self
 
     def getP(self):
@@ -347,8 +347,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
     SparseVector(3, {0: 1.0})
     """
 
-    _java_class = "org.apache.spark.ml.feature.OneHotEncoder"
-
     # a placeholder to make it appear in the generated doc
     includeFirst = Param(Params._dummy(), "includeFirst", "include first category")
 
@@ -358,6 +356,7 @@ def __init__(self, includeFirst=True, inputCol=None, outputCol=None):
         __init__(self, includeFirst=True, inputCol=None, outputCol=None)
         """
         super(OneHotEncoder, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid)
         self.includeFirst = Param(self, "includeFirst", "include first category")
         self._setDefault(includeFirst=True)
         kwargs = self.__init__._input_kwargs
@@ -376,7 +375,7 @@ def setIncludeFirst(self, value):
         """
         Sets the value of :py:attr:`includeFirst`.
         """
-        self.paramMap[self.includeFirst] = value
+        self._paramMap[self.includeFirst] = value
         return self
 
     def getIncludeFirst(self):
@@ -404,8 +403,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
     DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
     """
 
-    _java_class = "org.apache.spark.ml.feature.PolynomialExpansion"
-
     # a placeholder to make it appear in the generated doc
     degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)")
 
@@ -415,6 +412,8 @@ def __init__(self, degree=2, inputCol=None, outputCol=None):
         __init__(self, degree=2, inputCol=None, outputCol=None)
         """
         super(PolynomialExpansion, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.feature.PolynomialExpansion", self.uid)
         self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)")
         self._setDefault(degree=2)
         kwargs = self.__init__._input_kwargs
@@ -433,7 +432,7 @@ def setDegree(self, value):
         """
         Sets the value of :py:attr:`degree`.
         """
-        self.paramMap[self.degree] = value
+        self._paramMap[self.degree] = value
         return self
 
     def getDegree(self):
@@ -471,7 +470,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     TypeError: Method setParams forces keyword arguments.
     """
 
-    _java_class = "org.apache.spark.ml.feature.RegexTokenizer"
     # a placeholder to make it appear in the generated doc
     minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
     gaps = Param(Params._dummy(), "gaps", "Set regex to match gaps or tokens")
@@ -485,7 +483,8 @@ def __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+"
                  inputCol=None, outputCol=None)
         """
         super(RegexTokenizer, self).__init__()
-        self.minTokenLength = Param(self, "minLength", "minimum token length (>= 0)")
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
+        self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
         self.gaps = Param(self, "gaps", "Set regex to match gaps or tokens")
         self.pattern = Param(self, "pattern", "regex pattern used for tokenizing")
         self._setDefault(minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+")
@@ -507,7 +506,7 @@ def setMinTokenLength(self, value):
         """
         Sets the value of :py:attr:`minTokenLength`.
         """
-        self.paramMap[self.minTokenLength] = value
+        self._paramMap[self.minTokenLength] = value
         return self
 
     def getMinTokenLength(self):
@@ -520,7 +519,7 @@ def setGaps(self, value):
         """
         Sets the value of :py:attr:`gaps`.
         """
-        self.paramMap[self.gaps] = value
+        self._paramMap[self.gaps] = value
         return self
 
     def getGaps(self):
@@ -533,7 +532,7 @@ def setPattern(self, value):
         """
         Sets the value of :py:attr:`pattern`.
         """
-        self.paramMap[self.pattern] = value
+        self._paramMap[self.pattern] = value
         return self
 
     def getPattern(self):
@@ -557,8 +556,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
     DenseVector([1.4142])
     """
 
-    _java_class = "org.apache.spark.ml.feature.StandardScaler"
-
     # a placeholder to make it appear in the generated doc
     withMean = Param(Params._dummy(), "withMean", "Center data with mean")
     withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation")
@@ -569,6 +566,7 @@ def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None):
         __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None)
         """
         super(StandardScaler, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid)
         self.withMean = Param(self, "withMean", "Center data with mean")
         self.withStd = Param(self, "withStd", "Scale to unit standard deviation")
         self._setDefault(withMean=False, withStd=True)
@@ -588,7 +586,7 @@ def setWithMean(self, value):
         """
         Sets the value of :py:attr:`withMean`.
         """
-        self.paramMap[self.withMean] = value
+        self._paramMap[self.withMean] = value
         return self
 
     def getWithMean(self):
@@ -601,7 +599,7 @@ def setWithStd(self, value):
         """
         Sets the value of :py:attr:`withStd`.
         """
-        self.paramMap[self.withStd] = value
+        self._paramMap[self.withStd] = value
         return self
 
     def getWithStd(self):
@@ -610,6 +608,9 @@ def getWithStd(self):
         """
         return self.getOrDefault(self.withStd)
 
+    def _create_model(self, java_model):
+        return StandardScalerModel(java_model)
+
 
 class StandardScalerModel(JavaModel):
     """
@@ -633,14 +634,13 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol):
     [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]
     """
 
-    _java_class = "org.apache.spark.ml.feature.StringIndexer"
-
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None):
         """
         __init__(self, inputCol=None, outputCol=None)
         """
         super(StringIndexer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -653,6 +653,9 @@ def setParams(self, inputCol=None, outputCol=None):
         kwargs = self.setParams._input_kwargs
         return self._set(**kwargs)
 
+    def _create_model(self, java_model):
+        return StringIndexerModel(java_model)
+
 
 class StringIndexerModel(JavaModel):
     """
@@ -686,14 +689,13 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     TypeError: Method setParams forces keyword arguments.
     """
 
-    _java_class = "org.apache.spark.ml.feature.Tokenizer"
-
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None):
         """
         __init__(self, inputCol=None, outputCol=None)
         """
         super(Tokenizer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Tokenizer", self.uid)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -723,14 +725,13 @@ class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol):
     DenseVector([0.0, 1.0])
     """
 
-    _java_class = "org.apache.spark.ml.feature.VectorAssembler"
-
     @keyword_only
     def __init__(self, inputCols=None, outputCol=None):
         """
         __init__(self, inputCols=None, outputCol=None)
         """
         super(VectorAssembler, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorAssembler", self.uid)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -797,7 +798,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
     DenseVector([1.0, 0.0])
     """
 
-    _java_class = "org.apache.spark.ml.feature.VectorIndexer"
     # a placeholder to make it appear in the generated doc
     maxCategories = Param(Params._dummy(), "maxCategories",
                           "Threshold for the number of values a categorical feature can take " +
@@ -810,6 +810,7 @@ def __init__(self, maxCategories=20, inputCol=None, outputCol=None):
         __init__(self, maxCategories=20, inputCol=None, outputCol=None)
         """
         super(VectorIndexer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid)
         self.maxCategories = Param(self, "maxCategories",
                                    "Threshold for the number of values a categorical feature " +
                                    "can take (>= 2). If a feature is found to have " +
@@ -831,7 +832,7 @@ def setMaxCategories(self, value):
         """
         Sets the value of :py:attr:`maxCategories`.
         """
-        self.paramMap[self.maxCategories] = value
+        self._paramMap[self.maxCategories] = value
         return self
 
     def getMaxCategories(self):
@@ -840,6 +841,15 @@ def getMaxCategories(self):
         """
         return self.getOrDefault(self.maxCategories)
 
+    def _create_model(self, java_model):
+        return VectorIndexerModel(java_model)
+
+
+class VectorIndexerModel(JavaModel):
+    """
+    Model fitted by VectorIndexer.
+    """
+
 
 @inherit_doc
 @ignore_unicode_prefix
@@ -855,7 +865,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
     DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276])
     """
 
-    _java_class = "org.apache.spark.ml.feature.Word2Vec"
     # a placeholder to make it appear in the generated doc
     vectorSize = Param(Params._dummy(), "vectorSize",
                        "the dimension of codes after transforming from words")
@@ -873,6 +882,7 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
                  seed=42, inputCol=None, outputCol=None)
         """
         super(Word2Vec, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
         self.vectorSize = Param(self, "vectorSize",
                                 "the dimension of codes after transforming from words")
         self.numPartitions = Param(self, "numPartitions",
@@ -900,7 +910,7 @@ def setVectorSize(self, value):
         """
         Sets the value of :py:attr:`vectorSize`.
         """
-        self.paramMap[self.vectorSize] = value
+        self._paramMap[self.vectorSize] = value
         return self
 
     def getVectorSize(self):
@@ -913,7 +923,7 @@ def setNumPartitions(self, value):
         """
         Sets the value of :py:attr:`numPartitions`.
         """
-        self.paramMap[self.numPartitions] = value
+        self._paramMap[self.numPartitions] = value
         return self
 
     def getNumPartitions(self):
@@ -926,7 +936,7 @@ def setMinCount(self, value):
         """
         Sets the value of :py:attr:`minCount`.
         """
-        self.paramMap[self.minCount] = value
+        self._paramMap[self.minCount] = value
         return self
 
     def getMinCount(self):
@@ -935,6 +945,9 @@ def getMinCount(self):
         """
         return self.getOrDefault(self.minCount)
 
+    def _create_model(self, java_model):
+        return Word2VecModel(java_model)
+
 
 class Word2VecModel(JavaModel):
     """
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index 49c20b4cf70cf..67fb6e3dc74fb 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -16,6 +16,7 @@
 #
 
 from abc import ABCMeta
+import copy
 
 from pyspark.ml.util import Identifiable
 
@@ -29,9 +30,9 @@ class Param(object):
     """
 
     def __init__(self, parent, name, doc):
-        if not isinstance(parent, Params):
-            raise TypeError("Parent must be a Params but got type %s." % type(parent))
-        self.parent = parent
+        if not isinstance(parent, Identifiable):
+            raise TypeError("Parent must be an Identifiable but got type %s." % type(parent))
+        self.parent = parent.uid
         self.name = str(name)
         self.doc = str(doc)
 
@@ -41,6 +42,15 @@ def __str__(self):
     def __repr__(self):
         return "Param(parent=%r, name=%r, doc=%r)" % (self.parent, self.name, self.doc)
 
+    def __hash__(self):
+        return hash(str(self))
+
+    def __eq__(self, other):
+        if isinstance(other, Param):
+            return self.parent == other.parent and self.name == other.name
+        else:
+            return False
+
 
 class Params(Identifiable):
     """
@@ -51,10 +61,13 @@ class Params(Identifiable):
     __metaclass__ = ABCMeta
 
     #: internal param map for user-supplied values param map
-    paramMap = {}
+    _paramMap = {}
 
     #: internal param map for default values
-    defaultParamMap = {}
+    _defaultParamMap = {}
+
+    #: value returned by :py:func:`params`
+    _params = None
 
     @property
     def params(self):
@@ -63,10 +76,12 @@ def params(self):
         uses :py:func:`dir` to get all attributes of type
         :py:class:`Param`.
         """
-        return list(filter(lambda attr: isinstance(attr, Param),
-                           [getattr(self, x) for x in dir(self) if x != "params"]))
+        if self._params is None:
+            self._params = list(filter(lambda attr: isinstance(attr, Param),
+                                       [getattr(self, x) for x in dir(self) if x != "params"]))
+        return self._params
 
-    def _explain(self, param):
+    def explainParam(self, param):
         """
         Explains a single param and returns its name, doc, and optional
         default value and user-supplied value in a string.
@@ -74,10 +89,10 @@ def _explain(self, param):
         param = self._resolveParam(param)
         values = []
         if self.isDefined(param):
-            if param in self.defaultParamMap:
-                values.append("default: %s" % self.defaultParamMap[param])
-            if param in self.paramMap:
-                values.append("current: %s" % self.paramMap[param])
+            if param in self._defaultParamMap:
+                values.append("default: %s" % self._defaultParamMap[param])
+            if param in self._paramMap:
+                values.append("current: %s" % self._paramMap[param])
         else:
             values.append("undefined")
         valueStr = "(" + ", ".join(values) + ")"
@@ -88,7 +103,7 @@ def explainParams(self):
         Returns the documentation of all params with their optionally
         default values and user-supplied values.
         """
-        return "\n".join([self._explain(param) for param in self.params])
+        return "\n".join([self.explainParam(param) for param in self.params])
 
     def getParam(self, paramName):
         """
@@ -105,56 +120,76 @@ def isSet(self, param):
         Checks whether a param is explicitly set by user.
         """
         param = self._resolveParam(param)
-        return param in self.paramMap
+        return param in self._paramMap
 
     def hasDefault(self, param):
         """
         Checks whether a param has a default value.
         """
         param = self._resolveParam(param)
-        return param in self.defaultParamMap
+        return param in self._defaultParamMap
 
     def isDefined(self, param):
         """
-        Checks whether a param is explicitly set by user or has a default value.
+        Checks whether a param is explicitly set by user or has
+        a default value.
         """
         return self.isSet(param) or self.hasDefault(param)
 
+    def hasParam(self, paramName):
+        """
+        Tests whether this instance contains a param with a given
+        (string) name.
+        """
+        param = self._resolveParam(paramName)
+        return param in self.params
+
     def getOrDefault(self, param):
         """
         Gets the value of a param in the user-supplied param map or its
         default value. Raises an error if either is set.
         """
-        if isinstance(param, Param):
-            if param in self.paramMap:
-                return self.paramMap[param]
-            else:
-                return self.defaultParamMap[param]
-        elif isinstance(param, str):
-            return self.getOrDefault(self.getParam(param))
+        param = self._resolveParam(param)
+        if param in self._paramMap:
+            return self._paramMap[param]
         else:
-            raise KeyError("Cannot recognize %r as a param." % param)
+            return self._defaultParamMap[param]
 
-    def extractParamMap(self, extraParamMap={}):
+    def extractParamMap(self, extra={}):
         """
         Extracts the embedded default param values and user-supplied
         values, and then merges them with extra values from input into
         a flat param map, where the latter value is used if there exist
         conflicts, i.e., with ordering: default param values <
-        user-supplied values < extraParamMap.
-        :param extraParamMap: extra param values
+        user-supplied values < extra.
+        :param extra: extra param values
         :return: merged param map
         """
-        paramMap = self.defaultParamMap.copy()
-        paramMap.update(self.paramMap)
-        paramMap.update(extraParamMap)
+        paramMap = self._defaultParamMap.copy()
+        paramMap.update(self._paramMap)
+        paramMap.update(extra)
         return paramMap
 
+    def copy(self, extra={}):
+        """
+        Creates a copy of this instance with the same uid and some
+        extra params. The default implementation creates a
+        shallow copy using :py:func:`copy.copy`, and then copies the
+        embedded and extra parameters over and returns the copy.
+        Subclasses should override this method if the default approach
+        is not sufficient.
+        :param extra: Extra parameters to copy to the new instance
+        :return: Copy of this instance
+        """
+        that = copy.copy(self)
+        that._paramMap = self.extractParamMap(extra)
+        return that
+
     def _shouldOwn(self, param):
         """
         Validates that the input param belongs to this Params instance.
         """
-        if param.parent is not self:
+        if not (self.uid == param.parent and self.hasParam(param.name)):
             raise ValueError("Param %r does not belong to %r." % (param, self))
 
     def _resolveParam(self, param):
@@ -175,7 +210,8 @@ def _resolveParam(self, param):
     @staticmethod
     def _dummy():
         """
-        Returns a dummy Params instance used as a placeholder to generate docs.
+        Returns a dummy Params instance used as a placeholder to
+        generate docs.
         """
         dummy = Params()
         dummy.uid = "undefined"
@@ -186,7 +222,7 @@ def _set(self, **kwargs):
         Sets user-supplied params.
         """
         for param, value in kwargs.items():
-            self.paramMap[getattr(self, param)] = value
+            self._paramMap[getattr(self, param)] = value
         return self
 
     def _setDefault(self, **kwargs):
@@ -194,5 +230,19 @@ def _setDefault(self, **kwargs):
         Sets default params.
         """
         for param, value in kwargs.items():
-            self.defaultParamMap[getattr(self, param)] = value
+            self._defaultParamMap[getattr(self, param)] = value
         return self
+
+    def _copyValues(self, to, extra={}):
+        """
+        Copies param values from this instance to another instance for
+        params shared by them.
+        :param to: the target instance
+        :param extra: extra params to be copied
+        :return: the target instance with param values copied
+        """
+        paramMap = self.extractParamMap(extra)
+        for p in self.params:
+            if p in paramMap and to.hasParam(p.name):
+                to._set(**{p.name: paramMap[p]})
+        return to
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 6fa9b8c2cf367..91e45ec373518 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -83,7 +83,7 @@ def set$Name(self, value):
         """
         Sets the value of :py:attr:`$name`.
         """
-        self.paramMap[self.$name] = value
+        self._paramMap[self.$name] = value
         return self
 
     def get$Name(self):
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index b116f05a068d3..a5dc9b7ef29ed 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -39,7 +39,7 @@ def setMaxIter(self, value):
         """
         Sets the value of :py:attr:`maxIter`.
         """
-        self.paramMap[self.maxIter] = value
+        self._paramMap[self.maxIter] = value
         return self
 
     def getMaxIter(self):
@@ -68,7 +68,7 @@ def setRegParam(self, value):
         """
         Sets the value of :py:attr:`regParam`.
         """
-        self.paramMap[self.regParam] = value
+        self._paramMap[self.regParam] = value
         return self
 
     def getRegParam(self):
@@ -97,7 +97,7 @@ def setFeaturesCol(self, value):
         """
         Sets the value of :py:attr:`featuresCol`.
         """
-        self.paramMap[self.featuresCol] = value
+        self._paramMap[self.featuresCol] = value
         return self
 
     def getFeaturesCol(self):
@@ -126,7 +126,7 @@ def setLabelCol(self, value):
         """
         Sets the value of :py:attr:`labelCol`.
         """
-        self.paramMap[self.labelCol] = value
+        self._paramMap[self.labelCol] = value
         return self
 
     def getLabelCol(self):
@@ -155,7 +155,7 @@ def setPredictionCol(self, value):
         """
         Sets the value of :py:attr:`predictionCol`.
         """
-        self.paramMap[self.predictionCol] = value
+        self._paramMap[self.predictionCol] = value
         return self
 
     def getPredictionCol(self):
@@ -184,7 +184,7 @@ def setProbabilityCol(self, value):
         """
         Sets the value of :py:attr:`probabilityCol`.
         """
-        self.paramMap[self.probabilityCol] = value
+        self._paramMap[self.probabilityCol] = value
         return self
 
     def getProbabilityCol(self):
@@ -213,7 +213,7 @@ def setRawPredictionCol(self, value):
         """
         Sets the value of :py:attr:`rawPredictionCol`.
         """
-        self.paramMap[self.rawPredictionCol] = value
+        self._paramMap[self.rawPredictionCol] = value
         return self
 
     def getRawPredictionCol(self):
@@ -242,7 +242,7 @@ def setInputCol(self, value):
         """
         Sets the value of :py:attr:`inputCol`.
         """
-        self.paramMap[self.inputCol] = value
+        self._paramMap[self.inputCol] = value
         return self
 
     def getInputCol(self):
@@ -271,7 +271,7 @@ def setInputCols(self, value):
         """
         Sets the value of :py:attr:`inputCols`.
         """
-        self.paramMap[self.inputCols] = value
+        self._paramMap[self.inputCols] = value
         return self
 
     def getInputCols(self):
@@ -300,7 +300,7 @@ def setOutputCol(self, value):
         """
         Sets the value of :py:attr:`outputCol`.
         """
-        self.paramMap[self.outputCol] = value
+        self._paramMap[self.outputCol] = value
         return self
 
     def getOutputCol(self):
@@ -329,7 +329,7 @@ def setNumFeatures(self, value):
         """
         Sets the value of :py:attr:`numFeatures`.
         """
-        self.paramMap[self.numFeatures] = value
+        self._paramMap[self.numFeatures] = value
         return self
 
     def getNumFeatures(self):
@@ -358,7 +358,7 @@ def setCheckpointInterval(self, value):
         """
         Sets the value of :py:attr:`checkpointInterval`.
         """
-        self.paramMap[self.checkpointInterval] = value
+        self._paramMap[self.checkpointInterval] = value
         return self
 
     def getCheckpointInterval(self):
@@ -387,7 +387,7 @@ def setSeed(self, value):
         """
         Sets the value of :py:attr:`seed`.
         """
-        self.paramMap[self.seed] = value
+        self._paramMap[self.seed] = value
         return self
 
     def getSeed(self):
@@ -416,7 +416,7 @@ def setTol(self, value):
         """
         Sets the value of :py:attr:`tol`.
         """
-        self.paramMap[self.tol] = value
+        self._paramMap[self.tol] = value
         return self
 
     def getTol(self):
@@ -445,7 +445,7 @@ def setStepSize(self, value):
         """
         Sets the value of :py:attr:`stepSize`.
         """
-        self.paramMap[self.stepSize] = value
+        self._paramMap[self.stepSize] = value
         return self
 
     def getStepSize(self):
@@ -487,7 +487,7 @@ def setMaxDepth(self, value):
         """
         Sets the value of :py:attr:`maxDepth`.
         """
-        self.paramMap[self.maxDepth] = value
+        self._paramMap[self.maxDepth] = value
         return self
 
     def getMaxDepth(self):
@@ -500,7 +500,7 @@ def setMaxBins(self, value):
         """
         Sets the value of :py:attr:`maxBins`.
         """
-        self.paramMap[self.maxBins] = value
+        self._paramMap[self.maxBins] = value
         return self
 
     def getMaxBins(self):
@@ -513,7 +513,7 @@ def setMinInstancesPerNode(self, value):
         """
         Sets the value of :py:attr:`minInstancesPerNode`.
         """
-        self.paramMap[self.minInstancesPerNode] = value
+        self._paramMap[self.minInstancesPerNode] = value
         return self
 
     def getMinInstancesPerNode(self):
@@ -526,7 +526,7 @@ def setMinInfoGain(self, value):
         """
         Sets the value of :py:attr:`minInfoGain`.
         """
-        self.paramMap[self.minInfoGain] = value
+        self._paramMap[self.minInfoGain] = value
         return self
 
     def getMinInfoGain(self):
@@ -539,7 +539,7 @@ def setMaxMemoryInMB(self, value):
         """
         Sets the value of :py:attr:`maxMemoryInMB`.
         """
-        self.paramMap[self.maxMemoryInMB] = value
+        self._paramMap[self.maxMemoryInMB] = value
         return self
 
     def getMaxMemoryInMB(self):
@@ -552,7 +552,7 @@ def setCacheNodeIds(self, value):
         """
         Sets the value of :py:attr:`cacheNodeIds`.
         """
-        self.paramMap[self.cacheNodeIds] = value
+        self._paramMap[self.cacheNodeIds] = value
         return self
 
     def getCacheNodeIds(self):
diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py
index a328bcf84a2e7..0f38e021273b0 100644
--- a/python/pyspark/ml/pipeline.py
+++ b/python/pyspark/ml/pipeline.py
@@ -31,18 +31,40 @@ class Estimator(Params):
     __metaclass__ = ABCMeta
 
     @abstractmethod
-    def fit(self, dataset, params={}):
+    def _fit(self, dataset):
         """
-        Fits a model to the input dataset with optional parameters.
+        Fits a model to the input dataset. This is called by the
+        default implementation of fit.
 
         :param dataset: input dataset, which is an instance of
                         :py:class:`pyspark.sql.DataFrame`
-        :param params: an optional param map that overwrites embedded
-                       params
         :returns: fitted model
         """
         raise NotImplementedError()
 
+    def fit(self, dataset, params={}):
+        """
+        Fits a model to the input dataset with optional parameters.
+
+        :param dataset: input dataset, which is an instance of
+                        :py:class:`pyspark.sql.DataFrame`
+        :param params: an optional param map that overrides embedded
+                       params. If a list/tuple of param maps is given,
+                       this calls fit on each param map and returns a
+                       list of models.
+        :returns: fitted model(s)
+        """
+        if isinstance(params, (list, tuple)):
+            return [self.fit(dataset, paramMap) for paramMap in params]
+        elif isinstance(params, dict):
+            if params:
+                return self.copy(params)._fit(dataset)
+            else:
+                return self._fit(dataset)
+        else:
+            raise ValueError("Params must be either a param map or a list/tuple of param maps, "
+                             "but got %s." % type(params))
+
 
 @inherit_doc
 class Transformer(Params):
@@ -54,18 +76,34 @@ class Transformer(Params):
     __metaclass__ = ABCMeta
 
     @abstractmethod
-    def transform(self, dataset, params={}):
+    def _transform(self, dataset):
         """
         Transforms the input dataset with optional parameters.
 
         :param dataset: input dataset, which is an instance of
                         :py:class:`pyspark.sql.DataFrame`
-        :param params: an optional param map that overwrites embedded
-                       params
         :returns: transformed dataset
         """
         raise NotImplementedError()
 
+    def transform(self, dataset, params={}):
+        """
+        Transforms the input dataset with optional parameters.
+
+        :param dataset: input dataset, which is an instance of
+                        :py:class:`pyspark.sql.DataFrame`
+        :param params: an optional param map that overrides embedded
+                       params.
+        :returns: transformed dataset
+        """
+        if isinstance(params, dict):
+            if params:
+                return self.copy(params,)._transform(dataset)
+            else:
+                return self._transform(dataset)
+        else:
+            raise ValueError("Params must be either a param map but got %s." % type(params))
+
 
 @inherit_doc
 class Model(Transformer):
@@ -113,15 +151,15 @@ def setStages(self, value):
         :param value: a list of transformers or estimators
         :return: the pipeline instance
         """
-        self.paramMap[self.stages] = value
+        self._paramMap[self.stages] = value
         return self
 
     def getStages(self):
         """
         Get pipeline stages.
         """
-        if self.stages in self.paramMap:
-            return self.paramMap[self.stages]
+        if self.stages in self._paramMap:
+            return self._paramMap[self.stages]
 
     @keyword_only
     def setParams(self, stages=[]):
@@ -132,9 +170,8 @@ def setParams(self, stages=[]):
         kwargs = self.setParams._input_kwargs
         return self._set(**kwargs)
 
-    def fit(self, dataset, params={}):
-        paramMap = self.extractParamMap(params)
-        stages = paramMap[self.stages]
+    def _fit(self, dataset):
+        stages = self.getStages()
         for stage in stages:
             if not (isinstance(stage, Estimator) or isinstance(stage, Transformer)):
                 raise TypeError(
@@ -148,16 +185,21 @@ def fit(self, dataset, params={}):
             if i <= indexOfLastEstimator:
                 if isinstance(stage, Transformer):
                     transformers.append(stage)
-                    dataset = stage.transform(dataset, paramMap)
+                    dataset = stage.transform(dataset)
                 else:  # must be an Estimator
-                    model = stage.fit(dataset, paramMap)
+                    model = stage.fit(dataset)
                     transformers.append(model)
                     if i < indexOfLastEstimator:
-                        dataset = model.transform(dataset, paramMap)
+                        dataset = model.transform(dataset)
             else:
                 transformers.append(stage)
         return PipelineModel(transformers)
 
+    def copy(self, extra={}):
+        that = Params.copy(self, extra)
+        stages = [stage.copy(extra) for stage in that.getStages()]
+        return that.setStages(stages)
+
 
 @inherit_doc
 class PipelineModel(Model):
@@ -165,16 +207,19 @@ class PipelineModel(Model):
     Represents a compiled pipeline with transformers and fitted models.
     """
 
-    def __init__(self, transformers):
+    def __init__(self, stages):
         super(PipelineModel, self).__init__()
-        self.transformers = transformers
+        self.stages = stages
 
-    def transform(self, dataset, params={}):
-        paramMap = self.extractParamMap(params)
-        for t in self.transformers:
-            dataset = t.transform(dataset, paramMap)
+    def _transform(self, dataset):
+        for t in self.stages:
+            dataset = t.transform(dataset)
         return dataset
 
+    def copy(self, extra={}):
+        stages = [stage.copy(extra) for stage in self.stages]
+        return PipelineModel(stages)
+
 
 class Evaluator(Params):
     """
@@ -184,14 +229,30 @@ class Evaluator(Params):
     __metaclass__ = ABCMeta
 
     @abstractmethod
-    def evaluate(self, dataset, params={}):
+    def _evaluate(self, dataset):
         """
         Evaluates the output.
 
+        :param dataset: a dataset that contains labels/observations and
+               predictions
+        :return: metric
+        """
+        raise NotImplementedError()
+
+    def evaluate(self, dataset, params={}):
+        """
+        Evaluates the output with optional parameters.
+
         :param dataset: a dataset that contains labels/observations and
                         predictions
         :param params: an optional param map that overrides embedded
                        params
         :return: metric
         """
-        raise NotImplementedError()
+        if isinstance(params, dict):
+            if params:
+                return self.copy(params)._evaluate(dataset)
+            else:
+                return self._evaluate(dataset)
+        else:
+            raise ValueError("Params must be a param map but got %s." % type(params))
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index b2439cbd96522..39c2527543774 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -74,7 +74,7 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
     >>> predictions[2]
     Row(user=2, item=0, prediction=-1.15...)
     """
-    _java_class = "org.apache.spark.ml.recommendation.ALS"
+
     # a placeholder to make it appear in the generated doc
     rank = Param(Params._dummy(), "rank", "rank of the factorization")
     numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user blocks")
@@ -97,6 +97,7 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB
                  ratingCol="rating", nonnegative=false, checkpointInterval=10)
         """
         super(ALS, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid)
         self.rank = Param(self, "rank", "rank of the factorization")
         self.numUserBlocks = Param(self, "numUserBlocks", "number of user blocks")
         self.numItemBlocks = Param(self, "numItemBlocks", "number of item blocks")
@@ -133,7 +134,7 @@ def setRank(self, value):
         """
         Sets the value of :py:attr:`rank`.
         """
-        self.paramMap[self.rank] = value
+        self._paramMap[self.rank] = value
         return self
 
     def getRank(self):
@@ -146,7 +147,7 @@ def setNumUserBlocks(self, value):
         """
         Sets the value of :py:attr:`numUserBlocks`.
         """
-        self.paramMap[self.numUserBlocks] = value
+        self._paramMap[self.numUserBlocks] = value
         return self
 
     def getNumUserBlocks(self):
@@ -159,7 +160,7 @@ def setNumItemBlocks(self, value):
         """
         Sets the value of :py:attr:`numItemBlocks`.
         """
-        self.paramMap[self.numItemBlocks] = value
+        self._paramMap[self.numItemBlocks] = value
         return self
 
     def getNumItemBlocks(self):
@@ -172,14 +173,14 @@ def setNumBlocks(self, value):
         """
         Sets both :py:attr:`numUserBlocks` and :py:attr:`numItemBlocks` to the specific value.
         """
-        self.paramMap[self.numUserBlocks] = value
-        self.paramMap[self.numItemBlocks] = value
+        self._paramMap[self.numUserBlocks] = value
+        self._paramMap[self.numItemBlocks] = value
 
     def setImplicitPrefs(self, value):
         """
         Sets the value of :py:attr:`implicitPrefs`.
         """
-        self.paramMap[self.implicitPrefs] = value
+        self._paramMap[self.implicitPrefs] = value
         return self
 
     def getImplicitPrefs(self):
@@ -192,7 +193,7 @@ def setAlpha(self, value):
         """
         Sets the value of :py:attr:`alpha`.
         """
-        self.paramMap[self.alpha] = value
+        self._paramMap[self.alpha] = value
         return self
 
     def getAlpha(self):
@@ -205,7 +206,7 @@ def setUserCol(self, value):
         """
         Sets the value of :py:attr:`userCol`.
         """
-        self.paramMap[self.userCol] = value
+        self._paramMap[self.userCol] = value
         return self
 
     def getUserCol(self):
@@ -218,7 +219,7 @@ def setItemCol(self, value):
         """
         Sets the value of :py:attr:`itemCol`.
         """
-        self.paramMap[self.itemCol] = value
+        self._paramMap[self.itemCol] = value
         return self
 
     def getItemCol(self):
@@ -231,7 +232,7 @@ def setRatingCol(self, value):
         """
         Sets the value of :py:attr:`ratingCol`.
         """
-        self.paramMap[self.ratingCol] = value
+        self._paramMap[self.ratingCol] = value
         return self
 
     def getRatingCol(self):
@@ -244,7 +245,7 @@ def setNonnegative(self, value):
         """
         Sets the value of :py:attr:`nonnegative`.
         """
-        self.paramMap[self.nonnegative] = value
+        self._paramMap[self.nonnegative] = value
         return self
 
     def getNonnegative(self):
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index ef77e19327188..ff809cdafdf51 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -62,7 +62,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
         ...
     TypeError: Method setParams forces keyword arguments.
     """
-    _java_class = "org.apache.spark.ml.regression.LinearRegression"
+
     # a placeholder to make it appear in the generated doc
     elasticNetParam = \
         Param(Params._dummy(), "elasticNetParam",
@@ -77,6 +77,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
         """
         super(LinearRegression, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.regression.LinearRegression", self.uid)
         #: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty
         #  is an L2 penalty. For alpha = 1, it is an L1 penalty.
         self.elasticNetParam = \
@@ -105,7 +107,7 @@ def setElasticNetParam(self, value):
         """
         Sets the value of :py:attr:`elasticNetParam`.
         """
-        self.paramMap[self.elasticNetParam] = value
+        self._paramMap[self.elasticNetParam] = value
         return self
 
     def getElasticNetParam(self):
@@ -178,7 +180,6 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     1.0
     """
 
-    _java_class = "org.apache.spark.ml.regression.DecisionTreeRegressor"
     # a placeholder to make it appear in the generated doc
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation (case-insensitive). " +
@@ -194,6 +195,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance")
         """
         super(DecisionTreeRegressor, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.regression.DecisionTreeRegressor", self.uid)
         #: param for Criterion used for information gain calculation (case-insensitive).
         self.impurity = \
             Param(self, "impurity",
@@ -226,7 +229,7 @@ def setImpurity(self, value):
         """
         Sets the value of :py:attr:`impurity`.
         """
-        self.paramMap[self.impurity] = value
+        self._paramMap[self.impurity] = value
         return self
 
     def getImpurity(self):
@@ -264,7 +267,6 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     0.5
     """
 
-    _java_class = "org.apache.spark.ml.regression.RandomForestRegressor"
     # a placeholder to make it appear in the generated doc
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation (case-insensitive). " +
@@ -290,6 +292,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=42)
         """
         super(RandomForestRegressor, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.regression.RandomForestRegressor", self.uid)
         #: param for Criterion used for information gain calculation (case-insensitive).
         self.impurity = \
             Param(self, "impurity",
@@ -335,7 +339,7 @@ def setImpurity(self, value):
         """
         Sets the value of :py:attr:`impurity`.
         """
-        self.paramMap[self.impurity] = value
+        self._paramMap[self.impurity] = value
         return self
 
     def getImpurity(self):
@@ -348,7 +352,7 @@ def setSubsamplingRate(self, value):
         """
         Sets the value of :py:attr:`subsamplingRate`.
         """
-        self.paramMap[self.subsamplingRate] = value
+        self._paramMap[self.subsamplingRate] = value
         return self
 
     def getSubsamplingRate(self):
@@ -361,7 +365,7 @@ def setNumTrees(self, value):
         """
         Sets the value of :py:attr:`numTrees`.
         """
-        self.paramMap[self.numTrees] = value
+        self._paramMap[self.numTrees] = value
         return self
 
     def getNumTrees(self):
@@ -374,7 +378,7 @@ def setFeatureSubsetStrategy(self, value):
         """
         Sets the value of :py:attr:`featureSubsetStrategy`.
         """
-        self.paramMap[self.featureSubsetStrategy] = value
+        self._paramMap[self.featureSubsetStrategy] = value
         return self
 
     def getFeatureSubsetStrategy(self):
@@ -412,7 +416,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
     1.0
     """
 
-    _java_class = "org.apache.spark.ml.regression.GBTRegressor"
     # a placeholder to make it appear in the generated doc
     lossType = Param(Params._dummy(), "lossType",
                      "Loss function which GBT tries to minimize (case-insensitive). " +
@@ -436,6 +439,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  lossType="squared", maxIter=20, stepSize=0.1)
         """
         super(GBTRegressor, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid)
         #: param for Loss function which GBT tries to minimize (case-insensitive).
         self.lossType = Param(self, "lossType",
                               "Loss function which GBT tries to minimize (case-insensitive). " +
@@ -477,7 +481,7 @@ def setLossType(self, value):
         """
         Sets the value of :py:attr:`lossType`.
         """
-        self.paramMap[self.lossType] = value
+        self._paramMap[self.lossType] = value
         return self
 
     def getLossType(self):
@@ -490,7 +494,7 @@ def setSubsamplingRate(self, value):
         """
         Sets the value of :py:attr:`subsamplingRate`.
         """
-        self.paramMap[self.subsamplingRate] = value
+        self._paramMap[self.subsamplingRate] = value
         return self
 
     def getSubsamplingRate(self):
@@ -503,7 +507,7 @@ def setStepSize(self, value):
         """
         Sets the value of :py:attr:`stepSize`.
         """
-        self.paramMap[self.stepSize] = value
+        self._paramMap[self.stepSize] = value
         return self
 
     def getStepSize(self):
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index ba6478dcd58a9..10fe0ef8db38f 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -31,10 +31,12 @@
     import unittest
 
 from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
-from pyspark.sql import DataFrame
-from pyspark.ml.param import Param
+from pyspark.sql import DataFrame, SQLContext
+from pyspark.ml.param import Param, Params
 from pyspark.ml.param.shared import HasMaxIter, HasInputCol
-from pyspark.ml.pipeline import Estimator, Model, Pipeline, Transformer
+from pyspark.ml import Estimator, Model, Pipeline, Transformer
+from pyspark.ml.feature import *
+from pyspark.mllib.linalg import DenseVector
 
 
 class MockDataset(DataFrame):
@@ -43,44 +45,43 @@ def __init__(self):
         self.index = 0
 
 
-class MockTransformer(Transformer):
+class HasFake(Params):
+
+    def __init__(self):
+        super(HasFake, self).__init__()
+        self.fake = Param(self, "fake", "fake param")
+
+    def getFake(self):
+        return self.getOrDefault(self.fake)
+
+
+class MockTransformer(Transformer, HasFake):
 
     def __init__(self):
         super(MockTransformer, self).__init__()
-        self.fake = Param(self, "fake", "fake")
         self.dataset_index = None
-        self.fake_param_value = None
 
-    def transform(self, dataset, params={}):
+    def _transform(self, dataset):
         self.dataset_index = dataset.index
-        if self.fake in params:
-            self.fake_param_value = params[self.fake]
         dataset.index += 1
         return dataset
 
 
-class MockEstimator(Estimator):
+class MockEstimator(Estimator, HasFake):
 
     def __init__(self):
         super(MockEstimator, self).__init__()
-        self.fake = Param(self, "fake", "fake")
         self.dataset_index = None
-        self.fake_param_value = None
-        self.model = None
 
-    def fit(self, dataset, params={}):
+    def _fit(self, dataset):
         self.dataset_index = dataset.index
-        if self.fake in params:
-            self.fake_param_value = params[self.fake]
         model = MockModel()
-        self.model = model
+        self._copyValues(model)
         return model
 
 
-class MockModel(MockTransformer, Model):
-
-    def __init__(self):
-        super(MockModel, self).__init__()
+class MockModel(MockTransformer, Model, HasFake):
+    pass
 
 
 class PipelineTests(PySparkTestCase):
@@ -91,19 +92,17 @@ def test_pipeline(self):
         transformer1 = MockTransformer()
         estimator2 = MockEstimator()
         transformer3 = MockTransformer()
-        pipeline = Pipeline() \
-            .setStages([estimator0, transformer1, estimator2, transformer3])
+        pipeline = Pipeline(stages=[estimator0, transformer1, estimator2, transformer3])
         pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1})
-        self.assertEqual(0, estimator0.dataset_index)
-        self.assertEqual(0, estimator0.fake_param_value)
-        model0 = estimator0.model
+        model0, transformer1, model2, transformer3 = pipeline_model.stages
         self.assertEqual(0, model0.dataset_index)
+        self.assertEqual(0, model0.getFake())
         self.assertEqual(1, transformer1.dataset_index)
-        self.assertEqual(1, transformer1.fake_param_value)
-        self.assertEqual(2, estimator2.dataset_index)
-        model2 = estimator2.model
-        self.assertIsNone(model2.dataset_index, "The model produced by the last estimator should "
-                                                "not be called during fit.")
+        self.assertEqual(1, transformer1.getFake())
+        self.assertEqual(2, dataset.index)
+        self.assertIsNone(model2.dataset_index, "The last model shouldn't be called in fit.")
+        self.assertIsNone(transformer3.dataset_index,
+                          "The last transformer shouldn't be called in fit.")
         dataset = pipeline_model.transform(dataset)
         self.assertEqual(2, model0.dataset_index)
         self.assertEqual(3, transformer1.dataset_index)
@@ -129,7 +128,7 @@ def test_param(self):
         maxIter = testParams.maxIter
         self.assertEqual(maxIter.name, "maxIter")
         self.assertEqual(maxIter.doc, "max number of iterations (>= 0)")
-        self.assertTrue(maxIter.parent is testParams)
+        self.assertTrue(maxIter.parent == testParams.uid)
 
     def test_params(self):
         testParams = TestParams()
@@ -139,6 +138,7 @@ def test_params(self):
         params = testParams.params
         self.assertEqual(params, [inputCol, maxIter])
 
+        self.assertTrue(testParams.hasParam(maxIter))
         self.assertTrue(testParams.hasDefault(maxIter))
         self.assertFalse(testParams.isSet(maxIter))
         self.assertTrue(testParams.isDefined(maxIter))
@@ -147,6 +147,7 @@ def test_params(self):
         self.assertTrue(testParams.isSet(maxIter))
         self.assertEquals(testParams.getMaxIter(), 100)
 
+        self.assertTrue(testParams.hasParam(inputCol))
         self.assertFalse(testParams.hasDefault(inputCol))
         self.assertFalse(testParams.isSet(inputCol))
         self.assertFalse(testParams.isDefined(inputCol))
@@ -159,5 +160,45 @@ def test_params(self):
                        "maxIter: max number of iterations (>= 0) (default: 10, current: 100)"]))
 
 
+class FeatureTests(PySparkTestCase):
+
+    def test_binarizer(self):
+        b0 = Binarizer()
+        self.assertListEqual(b0.params, [b0.inputCol, b0.outputCol, b0.threshold])
+        self.assertTrue(all([~b0.isSet(p) for p in b0.params]))
+        self.assertTrue(b0.hasDefault(b0.threshold))
+        self.assertEqual(b0.getThreshold(), 0.0)
+        b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
+        self.assertTrue(all([b0.isSet(p) for p in b0.params]))
+        self.assertEqual(b0.getThreshold(), 1.0)
+        self.assertEqual(b0.getInputCol(), "input")
+        self.assertEqual(b0.getOutputCol(), "output")
+
+        b0c = b0.copy({b0.threshold: 2.0})
+        self.assertEqual(b0c.uid, b0.uid)
+        self.assertListEqual(b0c.params, b0.params)
+        self.assertEqual(b0c.getThreshold(), 2.0)
+
+        b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
+        self.assertNotEqual(b1.uid, b0.uid)
+        self.assertEqual(b1.getThreshold(), 2.0)
+        self.assertEqual(b1.getInputCol(), "input")
+        self.assertEqual(b1.getOutputCol(), "output")
+
+    def test_idf(self):
+        sqlContext = SQLContext(self.sc)
+        dataset = sqlContext.createDataFrame([
+            (DenseVector([1.0, 2.0]),),
+            (DenseVector([0.0, 1.0]),),
+            (DenseVector([3.0, 0.2]),)], ["tf"])
+        idf0 = IDF(inputCol="tf")
+        self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol])
+        idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
+        self.assertEqual(idf0m.uid, idf0.uid,
+                         "Model should inherit the UID from its parent estimator.")
+        output = idf0m.transform(dataset)
+        self.assertIsNotNone(output.head().idf)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 86f4dc7368be0..497841b6c8ce6 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -155,7 +155,7 @@ def setEstimator(self, value):
         """
         Sets the value of :py:attr:`estimator`.
         """
-        self.paramMap[self.estimator] = value
+        self._paramMap[self.estimator] = value
         return self
 
     def getEstimator(self):
@@ -168,7 +168,7 @@ def setEstimatorParamMaps(self, value):
         """
         Sets the value of :py:attr:`estimatorParamMaps`.
         """
-        self.paramMap[self.estimatorParamMaps] = value
+        self._paramMap[self.estimatorParamMaps] = value
         return self
 
     def getEstimatorParamMaps(self):
@@ -181,7 +181,7 @@ def setEvaluator(self, value):
         """
         Sets the value of :py:attr:`evaluator`.
         """
-        self.paramMap[self.evaluator] = value
+        self._paramMap[self.evaluator] = value
         return self
 
     def getEvaluator(self):
@@ -194,7 +194,7 @@ def setNumFolds(self, value):
         """
         Sets the value of :py:attr:`numFolds`.
         """
-        self.paramMap[self.numFolds] = value
+        self._paramMap[self.numFolds] = value
         return self
 
     def getNumFolds(self):
@@ -203,13 +203,12 @@ def getNumFolds(self):
         """
         return self.getOrDefault(self.numFolds)
 
-    def fit(self, dataset, params={}):
-        paramMap = self.extractParamMap(params)
-        est = paramMap[self.estimator]
-        epm = paramMap[self.estimatorParamMaps]
+    def _fit(self, dataset):
+        est = self.getOrDefault(self.estimator)
+        epm = self.getOrDefault(self.estimatorParamMaps)
         numModels = len(epm)
-        eva = paramMap[self.evaluator]
-        nFolds = paramMap[self.numFolds]
+        eva = self.getOrDefault(self.evaluator)
+        nFolds = self.getOrDefault(self.numFolds)
         h = 1.0 / nFolds
         randCol = self.uid + "_rand"
         df = dataset.select("*", rand(0).alias(randCol))
@@ -229,6 +228,15 @@ def fit(self, dataset, params={}):
         bestModel = est.fit(dataset, epm[bestIndex])
         return CrossValidatorModel(bestModel)
 
+    def copy(self, extra={}):
+        newCV = Params.copy(self, extra)
+        if self.isSet(self.estimator):
+            newCV.setEstimator(self.getEstimator().copy(extra))
+        # estimatorParamMaps remain the same
+        if self.isSet(self.evaluator):
+            newCV.setEvaluator(self.getEvaluator().copy(extra))
+        return newCV
+
 
 class CrossValidatorModel(Model):
     """
@@ -240,8 +248,19 @@ def __init__(self, bestModel):
         #: best model from cross validation
         self.bestModel = bestModel
 
-    def transform(self, dataset, params={}):
-        return self.bestModel.transform(dataset, params)
+    def _transform(self, dataset):
+        return self.bestModel.transform(dataset)
+
+    def copy(self, extra={}):
+        """
+        Creates a copy of this instance with a randomly generated uid
+        and some extra params. This copies the underlying bestModel,
+        creates a deep copy of the embedded paramMap, and
+        copies the embedded and extra parameters over.
+        :param extra: Extra parameters to copy to the new instance
+        :return: Copy of this instance
+        """
+        return CrossValidatorModel(self.bestModel.copy(extra))
 
 
 if __name__ == "__main__":
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index d3cb100a9efa5..cee9d67b05325 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -39,9 +39,16 @@ class Identifiable(object):
     """
 
     def __init__(self):
-        #: A unique id for the object. The default implementation
-        #: concatenates the class name, "_", and 8 random hex chars.
-        self.uid = type(self).__name__ + "_" + uuid.uuid4().hex[:8]
+        #: A unique id for the object.
+        self.uid = self._randomUID()
 
     def __repr__(self):
         return self.uid
+
+    @classmethod
+    def _randomUID(cls):
+        """
+        Generate a unique id for the object. The default implementation
+        concatenates the class name, "_", and 12 random hex chars.
+        """
+        return cls.__name__ + "_" + uuid.uuid4().hex[12:]
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index dda6c6aba3049..4419e16184da8 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -45,46 +45,61 @@ class JavaWrapper(Params):
 
     __metaclass__ = ABCMeta
 
-    #: Fully-qualified class name of the wrapped Java component.
-    _java_class = None
+    #: The wrapped Java companion object. Subclasses should initialize
+    #: it properly. The param values in the Java object should be
+    #: synced with the Python wrapper in fit/transform/evaluate/copy.
+    _java_obj = None
 
-    def _java_obj(self):
+    @staticmethod
+    def _new_java_obj(java_class, *args):
         """
-        Returns or creates a Java object.
+        Construct a new Java object.
         """
+        sc = SparkContext._active_spark_context
         java_obj = _jvm()
-        for name in self._java_class.split("."):
+        for name in java_class.split("."):
             java_obj = getattr(java_obj, name)
-        return java_obj()
+        java_args = [_py2java(sc, arg) for arg in args]
+        return java_obj(*java_args)
 
-    def _transfer_params_to_java(self, params, java_obj):
+    def _make_java_param_pair(self, param, value):
         """
-        Transforms the embedded params and additional params to the
-        input Java object.
-        :param params: additional params (overwriting embedded values)
-        :param java_obj: Java object to receive the params
+        Makes a Java parm pair.
+        """
+        sc = SparkContext._active_spark_context
+        param = self._resolveParam(param)
+        java_param = self._java_obj.getParam(param.name)
+        java_value = _py2java(sc, value)
+        return java_param.w(java_value)
+
+    def _transfer_params_to_java(self):
+        """
+        Transforms the embedded params to the companion Java object.
         """
-        paramMap = self.extractParamMap(params)
+        paramMap = self.extractParamMap()
         for param in self.params:
             if param in paramMap:
-                value = paramMap[param]
-                java_param = java_obj.getParam(param.name)
-                java_obj.set(java_param.w(value))
+                pair = self._make_java_param_pair(param, paramMap[param])
+                self._java_obj.set(pair)
+
+    def _transfer_params_from_java(self):
+        """
+        Transforms the embedded params from the companion Java object.
+        """
+        sc = SparkContext._active_spark_context
+        for param in self.params:
+            if self._java_obj.hasParam(param.name):
+                java_param = self._java_obj.getParam(param.name)
+                value = _java2py(sc, self._java_obj.getOrDefault(java_param))
+                self._paramMap[param] = value
 
-    def _empty_java_param_map(self):
+    @staticmethod
+    def _empty_java_param_map():
         """
         Returns an empty Java ParamMap reference.
         """
         return _jvm().org.apache.spark.ml.param.ParamMap()
 
-    def _create_java_param_map(self, params, java_obj):
-        paramMap = self._empty_java_param_map()
-        for param, value in params.items():
-            if param.parent is self:
-                java_param = java_obj.getParam(param.name)
-                paramMap.put(java_param.w(value))
-        return paramMap
-
 
 @inherit_doc
 class JavaEstimator(Estimator, JavaWrapper):
@@ -99,9 +114,9 @@ def _create_model(self, java_model):
         """
         Creates a model from the input Java model reference.
         """
-        return JavaModel(java_model)
+        raise NotImplementedError()
 
-    def _fit_java(self, dataset, params={}):
+    def _fit_java(self, dataset):
         """
         Fits a Java model to the input dataset.
         :param dataset: input dataset, which is an instance of
@@ -109,12 +124,11 @@ def _fit_java(self, dataset, params={}):
         :param params: additional params (overwriting embedded values)
         :return: fitted Java model
         """
-        java_obj = self._java_obj()
-        self._transfer_params_to_java(params, java_obj)
-        return java_obj.fit(dataset._jdf, self._empty_java_param_map())
+        self._transfer_params_to_java()
+        return self._java_obj.fit(dataset._jdf)
 
-    def fit(self, dataset, params={}):
-        java_model = self._fit_java(dataset, params)
+    def _fit(self, dataset):
+        java_model = self._fit_java(dataset)
         return self._create_model(java_model)
 
 
@@ -127,30 +141,47 @@ class JavaTransformer(Transformer, JavaWrapper):
 
     __metaclass__ = ABCMeta
 
-    def transform(self, dataset, params={}):
-        java_obj = self._java_obj()
-        self._transfer_params_to_java(params, java_obj)
-        return DataFrame(java_obj.transform(dataset._jdf), dataset.sql_ctx)
+    def _transform(self, dataset):
+        self._transfer_params_to_java()
+        return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
 
 
 @inherit_doc
 class JavaModel(Model, JavaTransformer):
     """
     Base class for :py:class:`Model`s that wrap Java/Scala
-    implementations.
+    implementations. Subclasses should inherit this class before
+    param mix-ins, because this sets the UID from the Java model.
     """
 
     __metaclass__ = ABCMeta
 
     def __init__(self, java_model):
-        super(JavaTransformer, self).__init__()
-        self._java_model = java_model
+        """
+        Initialize this instance with a Java model object.
+        Subclasses should call this constructor, initialize params,
+        and then call _transformer_params_from_java.
+        """
+        super(JavaModel, self).__init__()
+        self._java_obj = java_model
+        self.uid = java_model.uid()
 
-    def _java_obj(self):
-        return self._java_model
+    def copy(self, extra={}):
+        """
+        Creates a copy of this instance with the same uid and some
+        extra params. This implementation first calls Params.copy and
+        then make a copy of the companion Java model with extra params.
+        So both the Python wrapper and the Java model get copied.
+        :param extra: Extra parameters to copy to the new instance
+        :return: Copy of this instance
+        """
+        that = super(JavaModel, self).copy(extra)
+        that._java_obj = self._java_obj.copy(self._empty_java_param_map())
+        that._transfer_params_to_java()
+        return that
 
     def _call_java(self, name, *args):
-        m = getattr(self._java_model, name)
+        m = getattr(self._java_obj, name)
         sc = SparkContext._active_spark_context
         java_args = [_py2java(sc, arg) for arg in args]
         return _java2py(sc, m(*java_args))
@@ -165,7 +196,11 @@ class JavaEvaluator(Evaluator, JavaWrapper):
 
     __metaclass__ = ABCMeta
 
-    def evaluate(self, dataset, params={}):
-        java_obj = self._java_obj()
-        self._transfer_params_to_java(params, java_obj)
-        return java_obj.evaluate(dataset._jdf, self._empty_java_param_map())
+    def _evaluate(self, dataset):
+        """
+        Evaluates the output.
+        :param dataset: a dataset that contains labels/observations and predictions.
+        :return: evaluation metric
+        """
+        self._transfer_params_to_java()
+        return self._java_obj.evaluate(dataset._jdf)

From aa31e431fc09f0477f1c2351c6275769a31aca90 Mon Sep 17 00:00:00 2001
From: Zhan Zhang <zhazhan@gmail.com>
Date: Mon, 18 May 2015 12:03:27 -0700
Subject: [PATCH 048/525] [SPARK-2883] [SQL] ORC data source for Spark SQL

This PR updates PR #6135 authored by zhzhan from Hortonworks.

----

This PR implements a Spark SQL data source for accessing ORC files.

> **NOTE**
>
> Although ORC is now an Apache TLP, the codebase is still tightly coupled with Hive.  That's why the new ORC data source is under `org.apache.spark.sql.hive` package, and must be used with `HiveContext`.  However, it doesn't require existing Hive installation to access ORC files.

1.  Saving/loading ORC files without contacting Hive metastore

1.  Support for complex data types (i.e. array, map, and struct)

1.  Aware of common optimizations provided by Spark SQL:

    - Column pruning
    - Partitioning pruning
    - Filter push-down

1.  Schema evolution support
1.  Hive metastore table conversion

This PR also include initial work done by scwf from Huawei (PR #3753).

Author: Zhan Zhang <zhazhan@gmail.com>
Author: Cheng Lian <lian@databricks.com>

Closes #6194 from liancheng/polishing-orc and squashes the following commits:

55ecd96 [Cheng Lian] Reorganizes ORC test suites
d4afeed [Cheng Lian] Addresses comments
21ada22 [Cheng Lian] Adds @since and @Experimental annotations
128bd3b [Cheng Lian] ORC filter bug fix
d734496 [Cheng Lian] Polishes the ORC data source
2650a42 [Zhan Zhang] resolve review comments
3c9038e [Zhan Zhang] resolve review comments
7b3c7c5 [Zhan Zhang] save mode fix
f95abfd [Zhan Zhang] reuse test suite
7cc2c64 [Zhan Zhang] predicate fix
4e61c16 [Zhan Zhang] minor change
305418c [Zhan Zhang] orc data source support
---
 .../scala/org/apache/spark/sql/SQLConf.scala  |   7 +-
 .../spark/sql/parquet/ParquetTest.scala       |  61 +---
 .../org/apache/spark/sql/sources/ddl.scala    |  18 +-
 .../apache/spark/sql/test/SQLTestUtils.scala  |  81 +++++
 .../spark/sql/hive/HiveInspectors.scala       |  40 ++-
 .../spark/sql/hive/orc/OrcFileOperator.scala  |  69 ++++
 .../spark/sql/hive/orc/OrcFilters.scala       | 144 +++++++++
 .../spark/sql/hive/orc/OrcRelation.scala      | 290 +++++++++++++++++
 .../hive/orc/OrcHadoopFsRelationSuite.scala   |  59 ++++
 .../hive/orc/OrcPartitionDiscoverySuite.scala | 256 +++++++++++++++
 .../spark/sql/hive/orc/OrcQuerySuite.scala    | 294 ++++++++++++++++++
 .../spark/sql/hive/orc/OrcSourceSuite.scala   | 146 +++++++++
 .../apache/spark/sql/hive/orc/OrcTest.scala   |  82 +++++
 .../sql/sources/hadoopFsRelationSuites.scala  |   6 +-
 14 files changed, 1477 insertions(+), 76 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala
 create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
 create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
 create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index f07bb196c11ec..6da910e332e9b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -43,6 +43,8 @@ private[spark] object SQLConf {
   val PARQUET_FILTER_PUSHDOWN_ENABLED = "spark.sql.parquet.filterPushdown"
   val PARQUET_USE_DATA_SOURCE_API = "spark.sql.parquet.useDataSourceApi"
 
+  val ORC_FILTER_PUSHDOWN_ENABLED = "spark.sql.orc.filterPushdown"
+
   val HIVE_VERIFY_PARTITIONPATH = "spark.sql.hive.verifyPartitionPath"
 
   val COLUMN_NAME_OF_CORRUPT_RECORD = "spark.sql.columnNameOfCorruptRecord"
@@ -143,6 +145,9 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
   private[spark] def parquetUseDataSourceApi =
     getConf(PARQUET_USE_DATA_SOURCE_API, "true").toBoolean
 
+  private[spark] def orcFilterPushDown =
+    getConf(ORC_FILTER_PUSHDOWN_ENABLED, "false").toBoolean
+
   /** When true uses verifyPartitionPath to prune the path which is not exists. */
   private[spark] def verifyPartitionPath =
     getConf(HIVE_VERIFY_PARTITIONPATH, "true").toBoolean
@@ -254,7 +259,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
 
   private[spark] def dataFrameRetainGroupColumns: Boolean =
     getConf(DATAFRAME_RETAIN_GROUP_COLUMNS, "true").toBoolean
-  
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala
index 7a73b6f1ac601..516ba373f41d2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala
@@ -21,10 +21,9 @@ import java.io.File
 
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
-import scala.util.Try
 
-import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
-import org.apache.spark.util.Utils
+import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.{DataFrame, SaveMode}
 
 /**
  * A helper trait that provides convenient facilities for Parquet testing.
@@ -33,54 +32,9 @@ import org.apache.spark.util.Utils
  * convenient to use tuples rather than special case classes when writing test cases/suites.
  * Especially, `Tuple1.apply` can be used to easily wrap a single type/value.
  */
-private[sql] trait ParquetTest {
-  val sqlContext: SQLContext
-
+private[sql] trait ParquetTest extends SQLTestUtils {
   import sqlContext.implicits.{localSeqToDataFrameHolder, rddToDataFrameHolder}
-  import sqlContext.{conf, sparkContext}
-
-  protected def configuration = sparkContext.hadoopConfiguration
-
-  /**
-   * Sets all SQL configurations specified in `pairs`, calls `f`, and then restore all SQL
-   * configurations.
-   *
-   * @todo Probably this method should be moved to a more general place
-   */
-  protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = {
-    val (keys, values) = pairs.unzip
-    val currentValues = keys.map(key => Try(conf.getConf(key)).toOption)
-    (keys, values).zipped.foreach(conf.setConf)
-    try f finally {
-      keys.zip(currentValues).foreach {
-        case (key, Some(value)) => conf.setConf(key, value)
-        case (key, None) => conf.unsetConf(key)
-      }
-    }
-  }
-
-  /**
-   * Generates a temporary path without creating the actual file/directory, then pass it to `f`. If
-   * a file/directory is created there by `f`, it will be delete after `f` returns.
-   *
-   * @todo Probably this method should be moved to a more general place
-   */
-  protected def withTempPath(f: File => Unit): Unit = {
-    val path = Utils.createTempDir()
-    path.delete()
-    try f(path) finally Utils.deleteRecursively(path)
-  }
-
-  /**
-   * Creates a temporary directory, which is then passed to `f` and will be deleted after `f`
-   * returns.
-   *
-   * @todo Probably this method should be moved to a more general place
-   */
-  protected def withTempDir(f: File => Unit): Unit = {
-    val dir = Utils.createTempDir().getCanonicalFile
-    try f(dir) finally Utils.deleteRecursively(dir)
-  }
+  import sqlContext.sparkContext
 
   /**
    * Writes `data` to a Parquet file, which is then passed to `f` and will be deleted after `f`
@@ -105,13 +59,6 @@ private[sql] trait ParquetTest {
     withParquetFile(data)(path => f(sqlContext.read.parquet(path)))
   }
 
-  /**
-   * Drops temporary table `tableName` after calling `f`.
-   */
-  protected def withTempTable(tableName: String)(f: => Unit): Unit = {
-    try f finally sqlContext.dropTempTable(tableName)
-  }
-
   /**
    * Writes `data` to a Parquet file, reads it back as a [[DataFrame]] and registers it as a
    * temporary table named `tableName`, then call `f`. The temporary table together with the
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index 37a569db311ea..a13ab74852ff3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -188,18 +188,20 @@ private[sql] class DDLParser(
 private[sql] object ResolvedDataSource {
 
   private val builtinSources = Map(
-    "jdbc" -> classOf[org.apache.spark.sql.jdbc.DefaultSource],
-    "json" -> classOf[org.apache.spark.sql.json.DefaultSource],
-    "parquet" -> classOf[org.apache.spark.sql.parquet.DefaultSource]
+    "jdbc" -> "org.apache.spark.sql.jdbc.DefaultSource",
+    "json" -> "org.apache.spark.sql.json.DefaultSource",
+    "parquet" -> "org.apache.spark.sql.parquet.DefaultSource",
+    "orc" -> "org.apache.spark.sql.hive.orc.DefaultSource"
   )
 
   /** Given a provider name, look up the data source class definition. */
   def lookupDataSource(provider: String): Class[_] = {
+    val loader = Utils.getContextOrSparkClassLoader
+
     if (builtinSources.contains(provider)) {
-      return builtinSources(provider)
+      return loader.loadClass(builtinSources(provider))
     }
 
-    val loader = Utils.getContextOrSparkClassLoader
     try {
       loader.loadClass(provider)
     } catch {
@@ -208,7 +210,11 @@ private[sql] object ResolvedDataSource {
           loader.loadClass(provider + ".DefaultSource")
         } catch {
           case cnf: java.lang.ClassNotFoundException =>
-            sys.error(s"Failed to load class for data source: $provider")
+            if (provider.startsWith("org.apache.spark.sql.hive.orc")) {
+              sys.error("The ORC data source must be used with Hive support enabled.")
+            } else {
+              sys.error(s"Failed to load class for data source: $provider")
+            }
         }
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala
new file mode 100644
index 0000000000000..75d290625ec38
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.test
+
+import java.io.File
+
+import scala.util.Try
+
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.util.Utils
+
+trait SQLTestUtils {
+  val sqlContext: SQLContext
+
+  import sqlContext.{conf, sparkContext}
+
+  protected def configuration = sparkContext.hadoopConfiguration
+
+  /**
+   * Sets all SQL configurations specified in `pairs`, calls `f`, and then restore all SQL
+   * configurations.
+   *
+   * @todo Probably this method should be moved to a more general place
+   */
+  protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = {
+    val (keys, values) = pairs.unzip
+    val currentValues = keys.map(key => Try(conf.getConf(key)).toOption)
+    (keys, values).zipped.foreach(conf.setConf)
+    try f finally {
+      keys.zip(currentValues).foreach {
+        case (key, Some(value)) => conf.setConf(key, value)
+        case (key, None) => conf.unsetConf(key)
+      }
+    }
+  }
+
+  /**
+   * Generates a temporary path without creating the actual file/directory, then pass it to `f`. If
+   * a file/directory is created there by `f`, it will be delete after `f` returns.
+   *
+   * @todo Probably this method should be moved to a more general place
+   */
+  protected def withTempPath(f: File => Unit): Unit = {
+    val path = Utils.createTempDir()
+    path.delete()
+    try f(path) finally Utils.deleteRecursively(path)
+  }
+
+  /**
+   * Creates a temporary directory, which is then passed to `f` and will be deleted after `f`
+   * returns.
+   *
+   * @todo Probably this method should be moved to a more general place
+   */
+  protected def withTempDir(f: File => Unit): Unit = {
+    val dir = Utils.createTempDir().getCanonicalFile
+    try f(dir) finally Utils.deleteRecursively(dir)
+  }
+
+  /**
+   * Drops temporary table `tableName` after calling `f`.
+   */
+  protected def withTempTable(tableName: String)(f: => Unit): Unit = {
+    try f finally sqlContext.dropTempTable(tableName)
+  }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 7c7666f6e4b7c..0a694c70e4e5c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -18,8 +18,8 @@
 package org.apache.spark.sql.hive
 
 import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar}
-import org.apache.hadoop.hive.serde2.objectinspector._
 import org.apache.hadoop.hive.serde2.objectinspector.primitive._
+import org.apache.hadoop.hive.serde2.objectinspector.{StructField => HiveStructField, _}
 import org.apache.hadoop.hive.serde2.{io => hiveIo}
 import org.apache.hadoop.{io => hadoopIo}
 
@@ -122,7 +122,7 @@ import scala.collection.JavaConversions._
  *                                 even a normal java object (POJO)
  *   UnionObjectInspector: (tag: Int, object data) (TODO: not supported by SparkSQL yet)
  *
- * 3) ConstantObjectInspector: 
+ * 3) ConstantObjectInspector:
  * Constant object inspector can be either primitive type or Complex type, and it bundles a
  * constant value as its property, usually the value is created when the constant object inspector
  * constructed.
@@ -133,7 +133,7 @@ import scala.collection.JavaConversions._
     }
   }}}
  * Hive provides 3 built-in constant object inspectors:
- * Primitive Object Inspectors: 
+ * Primitive Object Inspectors:
  *     WritableConstantStringObjectInspector
  *     WritableConstantHiveVarcharObjectInspector
  *     WritableConstantHiveDecimalObjectInspector
@@ -147,9 +147,9 @@ import scala.collection.JavaConversions._
  *     WritableConstantByteObjectInspector
  *     WritableConstantBinaryObjectInspector
  *     WritableConstantDateObjectInspector
- * Map Object Inspector: 
+ * Map Object Inspector:
  *     StandardConstantMapObjectInspector
- * List Object Inspector: 
+ * List Object Inspector:
  *     StandardConstantListObjectInspector]]
  * Struct Object Inspector: Hive doesn't provide the built-in constant object inspector for Struct
  * Union Object Inspector: Hive doesn't provide the built-in constant object inspector for Union
@@ -250,9 +250,9 @@ private[hive] trait HiveInspectors {
         poi.getWritableConstantValue.getHiveDecimal)
     case poi: WritableConstantTimestampObjectInspector =>
       poi.getWritableConstantValue.getTimestamp.clone()
-    case poi: WritableConstantIntObjectInspector => 
+    case poi: WritableConstantIntObjectInspector =>
       poi.getWritableConstantValue.get()
-    case poi: WritableConstantDoubleObjectInspector => 
+    case poi: WritableConstantDoubleObjectInspector =>
       poi.getWritableConstantValue.get()
     case poi: WritableConstantBooleanObjectInspector =>
       poi.getWritableConstantValue.get()
@@ -306,7 +306,7 @@ private[hive] trait HiveInspectors {
         // In order to keep backward-compatible, we have to copy the
         // bytes with old apis
         val bw = x.getPrimitiveWritableObject(data)
-        val result = new Array[Byte](bw.getLength()) 
+        val result = new Array[Byte](bw.getLength())
         System.arraycopy(bw.getBytes(), 0, result, 0, bw.getLength())
         result
       case x: DateObjectInspector if x.preferWritable() =>
@@ -394,6 +394,30 @@ private[hive] trait HiveInspectors {
       identity[Any]
   }
 
+  /**
+   * Builds specific unwrappers ahead of time according to object inspector
+   * types to avoid pattern matching and branching costs per row.
+   */
+  def unwrapperFor(field: HiveStructField): (Any, MutableRow, Int) => Unit =
+    field.getFieldObjectInspector match {
+      case oi: BooleanObjectInspector =>
+        (value: Any, row: MutableRow, ordinal: Int) => row.setBoolean(ordinal, oi.get(value))
+      case oi: ByteObjectInspector =>
+        (value: Any, row: MutableRow, ordinal: Int) => row.setByte(ordinal, oi.get(value))
+      case oi: ShortObjectInspector =>
+        (value: Any, row: MutableRow, ordinal: Int) => row.setShort(ordinal, oi.get(value))
+      case oi: IntObjectInspector =>
+        (value: Any, row: MutableRow, ordinal: Int) => row.setInt(ordinal, oi.get(value))
+      case oi: LongObjectInspector =>
+        (value: Any, row: MutableRow, ordinal: Int) => row.setLong(ordinal, oi.get(value))
+      case oi: FloatObjectInspector =>
+        (value: Any, row: MutableRow, ordinal: Int) => row.setFloat(ordinal, oi.get(value))
+      case oi: DoubleObjectInspector =>
+        (value: Any, row: MutableRow, ordinal: Int) => row.setDouble(ordinal, oi.get(value))
+      case oi =>
+        (value: Any, row: MutableRow, ordinal: Int) => row(ordinal) = unwrap(value, oi)
+    }
+
   /**
    * Converts native catalyst types to the types expected by Hive
    * @param a the value to be wrapped
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
new file mode 100644
index 0000000000000..1e51173a19882
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
+
+import org.apache.spark.Logging
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.sql.hive.HiveMetastoreTypes
+import org.apache.spark.sql.types.StructType
+
+private[orc] object OrcFileOperator extends Logging{
+  def getFileReader(pathStr: String, config: Option[Configuration] = None ): Reader = {
+    val conf = config.getOrElse(new Configuration)
+    val fspath = new Path(pathStr)
+    val fs = fspath.getFileSystem(conf)
+    val orcFiles = listOrcFiles(pathStr, conf)
+
+    // TODO Need to consider all files when schema evolution is taken into account.
+    OrcFile.createReader(fs, orcFiles.head)
+  }
+
+  def readSchema(path: String, conf: Option[Configuration]): StructType = {
+    val reader = getFileReader(path, conf)
+    val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
+    val schema = readerInspector.getTypeName
+    HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType]
+  }
+
+  def getObjectInspector(path: String, conf: Option[Configuration]): StructObjectInspector = {
+    getFileReader(path, conf).getObjectInspector.asInstanceOf[StructObjectInspector]
+  }
+
+  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
+    val origPath = new Path(pathStr)
+    val fs = origPath.getFileSystem(conf)
+    val path = origPath.makeQualified(fs)
+    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
+      .filterNot(_.isDir)
+      .map(_.getPath)
+      .filterNot(_.getName.startsWith("_"))
+      .filterNot(_.getName.startsWith("."))
+
+    if (paths == null || paths.size == 0) {
+      throw new IllegalArgumentException(
+        s"orcFileOperator: path $path does not have valid orc files matching the pattern")
+    }
+
+    paths
+  }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
new file mode 100644
index 0000000000000..250e73a4dba92
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import org.apache.hadoop.hive.common.`type`.{HiveChar, HiveDecimal, HiveVarchar}
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder
+import org.apache.hadoop.hive.serde2.io.DateWritable
+
+import org.apache.spark.Logging
+import org.apache.spark.sql.sources._
+
+/**
+ * It may be optimized by push down partial filters. But we are conservative here.
+ * Because if some filters fail to be parsed, the tree may be corrupted,
+ * and cannot be used anymore.
+ */
+private[orc] object OrcFilters extends Logging {
+  def createFilter(expr: Array[Filter]): Option[SearchArgument] = {
+    expr.reduceOption(And).flatMap { conjunction =>
+      val builder = SearchArgument.FACTORY.newBuilder()
+      buildSearchArgument(conjunction, builder).map(_.build())
+    }
+  }
+
+  private def buildSearchArgument(expression: Filter, builder: Builder): Option[Builder] = {
+    def newBuilder = SearchArgument.FACTORY.newBuilder()
+
+    def isSearchableLiteral(value: Any) = value match {
+      // These are types recognized by the `SearchArgumentImpl.BuilderImpl.boxLiteral()` method.
+      case _: String | _: Long | _: Double | _: DateWritable | _: HiveDecimal | _: HiveChar |
+           _: HiveVarchar | _: Byte | _: Short | _: Integer | _: Float => true
+      case _ => false
+    }
+
+    // lian: I probably missed something here, and had to end up with a pretty weird double-checking
+    // pattern when converting `And`/`Or`/`Not` filters.
+    //
+    // The annoying part is that, `SearchArgument` builder methods like `startAnd()` `startOr()`,
+    // and `startNot()` mutate internal state of the builder instance.  This forces us to translate
+    // all convertible filters with a single builder instance. However, before actually converting a
+    // filter, we've no idea whether it can be recognized by ORC or not. Thus, when an inconvertible
+    // filter is found, we may already end up with a builder whose internal state is inconsistent.
+    //
+    // For example, to convert an `And` filter with builder `b`, we call `b.startAnd()` first, and
+    // then try to convert its children.  Say we convert `left` child successfully, but find that
+    // `right` child is inconvertible.  Alas, `b.startAnd()` call can't be rolled back, and `b` is
+    // inconsistent now.
+    //
+    // The workaround employed here is that, for `And`/`Or`/`Not`, we first try to convert their
+    // children with brand new builders, and only do the actual conversion with the right builder
+    // instance when the children are proven to be convertible.
+    //
+    // P.S.: Hive seems to use `SearchArgument` together with `ExprNodeGenericFuncDesc` only.
+    // Usage of builder methods mentioned above can only be found in test code, where all tested
+    // filters are known to be convertible.
+
+    expression match {
+      case And(left, right) =>
+        val tryLeft = buildSearchArgument(left, newBuilder)
+        val tryRight = buildSearchArgument(right, newBuilder)
+
+        val conjunction = for {
+          _ <- tryLeft
+          _ <- tryRight
+          lhs <- buildSearchArgument(left, builder.startAnd())
+          rhs <- buildSearchArgument(right, lhs)
+        } yield rhs.end()
+
+        // For filter `left AND right`, we can still push down `left` even if `right` is not
+        // convertible, and vice versa.
+        conjunction
+          .orElse(tryLeft.flatMap(_ => buildSearchArgument(left, builder)))
+          .orElse(tryRight.flatMap(_ => buildSearchArgument(right, builder)))
+
+      case Or(left, right) =>
+        for {
+          _ <- buildSearchArgument(left, newBuilder)
+          _ <- buildSearchArgument(right, newBuilder)
+          lhs <- buildSearchArgument(left, builder.startOr())
+          rhs <- buildSearchArgument(right, lhs)
+        } yield rhs.end()
+
+      case Not(child) =>
+        for {
+          _ <- buildSearchArgument(child, newBuilder)
+          negate <- buildSearchArgument(child, builder.startNot())
+        } yield negate.end()
+
+      case EqualTo(attribute, value) =>
+        Option(value)
+          .filter(isSearchableLiteral)
+          .map(builder.equals(attribute, _))
+
+      case LessThan(attribute, value) =>
+        Option(value)
+          .filter(isSearchableLiteral)
+          .map(builder.lessThan(attribute, _))
+
+      case LessThanOrEqual(attribute, value) =>
+        Option(value)
+          .filter(isSearchableLiteral)
+          .map(builder.lessThanEquals(attribute, _))
+
+      case GreaterThan(attribute, value) =>
+        Option(value)
+          .filter(isSearchableLiteral)
+          .map(builder.startNot().lessThanEquals(attribute, _).end())
+
+      case GreaterThanOrEqual(attribute, value) =>
+        Option(value)
+          .filter(isSearchableLiteral)
+          .map(builder.startNot().lessThan(attribute, _).end())
+
+      case IsNull(attribute) =>
+        Some(builder.isNull(attribute))
+
+      case IsNotNull(attribute) =>
+        Some(builder.startNot().isNull(attribute).end())
+
+      case In(attribute, values) =>
+        Option(values)
+          .filter(_.forall(isSearchableLiteral))
+          .map(builder.in(attribute, _))
+
+      case _ => None
+    }
+  }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
new file mode 100644
index 0000000000000..9708199f07349
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import java.util.{Objects, Properties}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars
+import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde, OrcSplit}
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils
+import org.apache.hadoop.io.{NullWritable, Writable}
+import org.apache.hadoop.mapred.{InputFormat => MapRedInputFormat, JobConf, RecordWriter, Reporter}
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.mapred.SparkHadoopMapRedUtil
+import org.apache.spark.rdd.{HadoopRDD, RDD}
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.hive.{HiveContext, HiveInspectors, HiveMetastoreTypes, HiveShim}
+import org.apache.spark.sql.sources.{Filter, _}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.{Logging, SerializableWritable}
+
+/* Implicit conversions */
+import scala.collection.JavaConversions._
+
+private[sql] class DefaultSource extends HadoopFsRelationProvider {
+  def createRelation(
+      sqlContext: SQLContext,
+      paths: Array[String],
+      schema: Option[StructType],
+      partitionColumns: Option[StructType],
+      parameters: Map[String, String]): HadoopFsRelation = {
+    assert(
+      sqlContext.isInstanceOf[HiveContext],
+      "The ORC data source can only be used with HiveContext.")
+
+    val partitionSpec = partitionColumns.map(PartitionSpec(_, Seq.empty[Partition]))
+    OrcRelation(paths, parameters, schema, partitionSpec)(sqlContext)
+  }
+}
+
+private[orc] class OrcOutputWriter(
+    path: String,
+    dataSchema: StructType,
+    context: TaskAttemptContext)
+  extends OutputWriter with SparkHadoopMapRedUtil with HiveInspectors {
+
+  private val serializer = {
+    val table = new Properties()
+    table.setProperty("columns", dataSchema.fieldNames.mkString(","))
+    table.setProperty("columns.types", dataSchema.map { f =>
+      HiveMetastoreTypes.toMetastoreType(f.dataType)
+    }.mkString(":"))
+
+    val serde = new OrcSerde
+    serde.initialize(context.getConfiguration, table)
+    serde
+  }
+
+  // Object inspector converted from the schema of the relation to be written.
+  private val structOI = {
+    val typeInfo =
+      TypeInfoUtils.getTypeInfoFromTypeString(
+        HiveMetastoreTypes.toMetastoreType(dataSchema))
+
+    TypeInfoUtils
+      .getStandardJavaObjectInspectorFromTypeInfo(typeInfo)
+      .asInstanceOf[StructObjectInspector]
+  }
+
+  // Used to hold temporary `Writable` fields of the next row to be written.
+  private val reusableOutputBuffer = new Array[Any](dataSchema.length)
+
+  // Used to convert Catalyst values into Hadoop `Writable`s.
+  private val wrappers = structOI.getAllStructFieldRefs.map { ref =>
+    wrapperFor(ref.getFieldObjectInspector)
+  }.toArray
+
+  // `OrcRecordWriter.close()` creates an empty file if no rows are written at all.  We use this
+  // flag to decide whether `OrcRecordWriter.close()` needs to be called.
+  private var recordWriterInstantiated = false
+
+  private lazy val recordWriter: RecordWriter[NullWritable, Writable] = {
+    recordWriterInstantiated = true
+
+    val conf = context.getConfiguration
+    val partition = context.getTaskAttemptID.getTaskID.getId
+    val filename = f"part-r-$partition%05d-${System.currentTimeMillis}%015d.orc"
+
+    new OrcOutputFormat().getRecordWriter(
+      new Path(path, filename).getFileSystem(conf),
+      conf.asInstanceOf[JobConf],
+      new Path(path, filename).toUri.getPath,
+      Reporter.NULL
+    ).asInstanceOf[RecordWriter[NullWritable, Writable]]
+  }
+
+  override def write(row: Row): Unit = {
+    var i = 0
+    while (i < row.length) {
+      reusableOutputBuffer(i) = wrappers(i)(row(i))
+      i += 1
+    }
+
+    recordWriter.write(
+      NullWritable.get(),
+      serializer.serialize(reusableOutputBuffer, structOI))
+  }
+
+  override def close(): Unit = {
+    if (recordWriterInstantiated) {
+      recordWriter.close(Reporter.NULL)
+    }
+  }
+}
+
+@DeveloperApi
+private[sql] case class OrcRelation(
+    override val paths: Array[String],
+    parameters: Map[String, String],
+    maybeSchema: Option[StructType] = None,
+    maybePartitionSpec: Option[PartitionSpec] = None)(
+    @transient val sqlContext: SQLContext)
+  extends HadoopFsRelation(maybePartitionSpec)
+  with Logging {
+
+  override val dataSchema: StructType = maybeSchema.getOrElse {
+    OrcFileOperator.readSchema(
+      paths.head, Some(sqlContext.sparkContext.hadoopConfiguration))
+  }
+
+  override def userDefinedPartitionColumns: Option[StructType] =
+    maybePartitionSpec.map(_.partitionColumns)
+
+  override def needConversion: Boolean = false
+
+  override def equals(other: Any): Boolean = other match {
+    case that: OrcRelation =>
+      paths.toSet == that.paths.toSet &&
+        dataSchema == that.dataSchema &&
+        schema == that.schema &&
+        partitionColumns == that.partitionColumns
+    case _ => false
+  }
+
+  override def hashCode(): Int = {
+    Objects.hashCode(
+      paths.toSet,
+      dataSchema,
+      schema,
+      maybePartitionSpec)
+  }
+
+  override def buildScan(requiredColumns: Array[String],
+      filters: Array[Filter],
+      inputPaths: Array[String]): RDD[Row] = {
+    val output = StructType(requiredColumns.map(dataSchema(_))).toAttributes
+    OrcTableScan(output, this, filters, inputPaths).execute()
+  }
+
+  override def prepareJobForWrite(job: Job): OutputWriterFactory = {
+    new OutputWriterFactory {
+      override def newInstance(
+          path: String,
+          dataSchema: StructType,
+          context: TaskAttemptContext): OutputWriter = {
+        new OrcOutputWriter(path, dataSchema, context)
+      }
+    }
+  }
+}
+
+private[orc] case class OrcTableScan(
+    attributes: Seq[Attribute],
+    @transient relation: OrcRelation,
+    filters: Array[Filter],
+    inputPaths: Array[String])
+  extends Logging
+  with HiveInspectors {
+
+  @transient private val sqlContext = relation.sqlContext
+
+  private def addColumnIds(
+      output: Seq[Attribute],
+      relation: OrcRelation,
+      conf: Configuration): Unit = {
+    val ids = output.map(a => relation.dataSchema.fieldIndex(a.name): Integer)
+    val (sortedIds, sortedNames) = ids.zip(attributes.map(_.name)).sorted.unzip
+    HiveShim.appendReadColumns(conf, sortedIds, sortedNames)
+  }
+
+  // Transform all given raw `Writable`s into `Row`s.
+  private def fillObject(
+      path: String,
+      conf: Configuration,
+      iterator: Iterator[Writable],
+      nonPartitionKeyAttrs: Seq[(Attribute, Int)],
+      mutableRow: MutableRow): Iterator[Row] = {
+    val deserializer = new OrcSerde
+    val soi = OrcFileOperator.getObjectInspector(path, Some(conf))
+    val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map {
+      case (attr, ordinal) =>
+        soi.getStructFieldRef(attr.name.toLowerCase) -> ordinal
+    }.unzip
+    val unwrappers = fieldRefs.map(unwrapperFor)
+    // Map each tuple to a row object
+    iterator.map { value =>
+      val raw = deserializer.deserialize(value)
+      var i = 0
+      while (i < fieldRefs.length) {
+        val fieldValue = soi.getStructFieldData(raw, fieldRefs(i))
+        if (fieldValue == null) {
+          mutableRow.setNullAt(fieldOrdinals(i))
+        } else {
+          unwrappers(i)(fieldValue, mutableRow, fieldOrdinals(i))
+        }
+        i += 1
+      }
+      mutableRow: Row
+    }
+  }
+
+  def execute(): RDD[Row] = {
+    val job = new Job(sqlContext.sparkContext.hadoopConfiguration)
+    val conf = job.getConfiguration
+
+    // Tries to push down filters if ORC filter push-down is enabled
+    if (sqlContext.conf.orcFilterPushDown) {
+      OrcFilters.createFilter(filters).foreach { f =>
+        conf.set(OrcTableScan.SARG_PUSHDOWN, f.toKryo)
+        conf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true)
+      }
+    }
+
+    // Sets requested columns
+    addColumnIds(attributes, relation, conf)
+
+    if (inputPaths.nonEmpty) {
+      FileInputFormat.setInputPaths(job, inputPaths.map(new Path(_)): _*)
+    }
+
+    val inputFormatClass =
+      classOf[OrcInputFormat]
+        .asInstanceOf[Class[_ <: MapRedInputFormat[NullWritable, Writable]]]
+
+    val rdd = sqlContext.sparkContext.hadoopRDD(
+      conf.asInstanceOf[JobConf],
+      inputFormatClass,
+      classOf[NullWritable],
+      classOf[Writable]
+    ).asInstanceOf[HadoopRDD[NullWritable, Writable]]
+
+    val wrappedConf = new SerializableWritable(conf)
+
+    rdd.mapPartitionsWithInputSplit { case (split: OrcSplit, iterator) =>
+      val mutableRow = new SpecificMutableRow(attributes.map(_.dataType))
+      fillObject(
+        split.getPath.toString,
+        wrappedConf.value,
+        iterator.map(_._2),
+        attributes.zipWithIndex,
+        mutableRow)
+    }
+  }
+}
+
+private[orc] object OrcTableScan {
+  // This constant duplicates `OrcInputFormat.SARG_PUSHDOWN`, which is unfortunately not public.
+  private[orc] val SARG_PUSHDOWN = "sarg.pushdown"
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
new file mode 100644
index 0000000000000..080af5bb23c16
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.sql.sources.HadoopFsRelationTest
+import org.apache.spark.sql.types._
+
+class OrcHadoopFsRelationSuite extends HadoopFsRelationTest {
+  override val dataSourceName: String = classOf[DefaultSource].getCanonicalName
+
+  import sqlContext._
+  import sqlContext.implicits._
+
+  test("save()/load() - partitioned table - simple queries - partition columns in data") {
+    withTempDir { file =>
+      val basePath = new Path(file.getCanonicalPath)
+      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
+      val qualifiedBasePath = fs.makeQualified(basePath)
+
+      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
+        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
+        sparkContext
+          .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1))
+          .toDF("a", "b", "p1")
+          .write
+          .format("orc")
+          .save(partitionDir.toString)
+      }
+
+      val dataSchemaWithPartition =
+        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))
+
+      checkQueries(
+        load(
+          source = dataSourceName,
+          options = Map(
+            "path" -> file.getCanonicalPath,
+            "dataSchema" -> dataSchemaWithPartition.json)))
+    }
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
new file mode 100644
index 0000000000000..88c99e35260d9
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import java.io.File
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.hive.test.TestHive._
+import org.apache.spark.sql.hive.test.TestHive.implicits._
+import org.apache.spark.util.Utils
+import org.scalatest.{BeforeAndAfterAll, FunSuiteLike}
+
+import scala.reflect.ClassTag
+import scala.reflect.runtime.universe.TypeTag
+
+
+// The data where the partitioning key exists only in the directory structure.
+case class OrcParData(intField: Int, stringField: String)
+
+// The data that also includes the partitioning key
+case class OrcParDataWithKey(intField: Int, pi: Int, stringField: String, ps: String)
+
+// TODO This test suite duplicates ParquetPartitionDiscoverySuite a lot
+class OrcPartitionDiscoverySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
+  val defaultPartitionName = ConfVars.DEFAULTPARTITIONNAME.defaultVal
+
+  def withTempDir(f: File => Unit): Unit = {
+    val dir = Utils.createTempDir().getCanonicalFile
+    try f(dir) finally Utils.deleteRecursively(dir)
+  }
+
+  def makeOrcFile[T <: Product: ClassTag: TypeTag](
+      data: Seq[T], path: File): Unit = {
+    data.toDF().write.format("orc").mode("overwrite").save(path.getCanonicalPath)
+  }
+
+
+  def makeOrcFile[T <: Product: ClassTag: TypeTag](
+      df: DataFrame, path: File): Unit = {
+    df.write.format("orc").mode("overwrite").save(path.getCanonicalPath)
+  }
+
+  protected def withTempTable(tableName: String)(f: => Unit): Unit = {
+    try f finally TestHive.dropTempTable(tableName)
+  }
+
+  protected def makePartitionDir(
+      basePath: File,
+      defaultPartitionName: String,
+      partitionCols: (String, Any)*): File = {
+    val partNames = partitionCols.map { case (k, v) =>
+      val valueString = if (v == null || v == "") defaultPartitionName else v.toString
+      s"$k=$valueString"
+    }
+
+    val partDir = partNames.foldLeft(basePath) { (parent, child) =>
+      new File(parent, child)
+    }
+
+    assert(partDir.mkdirs(), s"Couldn't create directory $partDir")
+    partDir
+  }
+
+  test("read partitioned table - normal case") {
+    withTempDir { base =>
+      for {
+        pi <- Seq(1, 2)
+        ps <- Seq("foo", "bar")
+      } {
+        makeOrcFile(
+          (1 to 10).map(i => OrcParData(i, i.toString)),
+          makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
+      }
+
+      read.format("orc").load(base.getCanonicalPath).registerTempTable("t")
+
+      withTempTable("t") {
+        checkAnswer(
+          sql("SELECT * FROM t"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, 2)
+            ps <- Seq("foo", "bar")
+          } yield Row(i, i.toString, pi, ps))
+
+        checkAnswer(
+          sql("SELECT intField, pi FROM t"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, 2)
+            _ <- Seq("foo", "bar")
+          } yield Row(i, pi))
+
+        checkAnswer(
+          sql("SELECT * FROM t WHERE pi = 1"),
+          for {
+            i <- 1 to 10
+            ps <- Seq("foo", "bar")
+          } yield Row(i, i.toString, 1, ps))
+
+        checkAnswer(
+          sql("SELECT * FROM t WHERE ps = 'foo'"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, 2)
+          } yield Row(i, i.toString, pi, "foo"))
+      }
+    }
+  }
+
+  test("read partitioned table - partition key included in orc file") {
+    withTempDir { base =>
+      for {
+        pi <- Seq(1, 2)
+        ps <- Seq("foo", "bar")
+      } {
+        makeOrcFile(
+          (1 to 10).map(i => OrcParDataWithKey(i, pi, i.toString, ps)),
+          makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
+      }
+
+      read.format("orc").load(base.getCanonicalPath).registerTempTable("t")
+
+      withTempTable("t") {
+        checkAnswer(
+          sql("SELECT * FROM t"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, 2)
+            ps <- Seq("foo", "bar")
+          } yield Row(i, pi, i.toString, ps))
+
+        checkAnswer(
+          sql("SELECT intField, pi FROM t"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, 2)
+            _ <- Seq("foo", "bar")
+          } yield Row(i, pi))
+
+        checkAnswer(
+          sql("SELECT * FROM t WHERE pi = 1"),
+          for {
+            i <- 1 to 10
+            ps <- Seq("foo", "bar")
+          } yield Row(i, 1, i.toString, ps))
+
+        checkAnswer(
+          sql("SELECT * FROM t WHERE ps = 'foo'"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, 2)
+          } yield Row(i, pi, i.toString, "foo"))
+      }
+    }
+  }
+
+
+  test("read partitioned table - with nulls") {
+    withTempDir { base =>
+      for {
+      // Must be `Integer` rather than `Int` here. `null.asInstanceOf[Int]` results in a zero...
+        pi <- Seq(1, null.asInstanceOf[Integer])
+        ps <- Seq("foo", null.asInstanceOf[String])
+      } {
+        makeOrcFile(
+          (1 to 10).map(i => OrcParData(i, i.toString)),
+          makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
+      }
+
+      read
+        .format("orc")
+        .option(ConfVars.DEFAULTPARTITIONNAME.varname, defaultPartitionName)
+        .load(base.getCanonicalPath)
+        .registerTempTable("t")
+
+      withTempTable("t") {
+        checkAnswer(
+          sql("SELECT * FROM t"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, null.asInstanceOf[Integer])
+            ps <- Seq("foo", null.asInstanceOf[String])
+          } yield Row(i, i.toString, pi, ps))
+
+        checkAnswer(
+          sql("SELECT * FROM t WHERE pi IS NULL"),
+          for {
+            i <- 1 to 10
+            ps <- Seq("foo", null.asInstanceOf[String])
+          } yield Row(i, i.toString, null, ps))
+
+        checkAnswer(
+          sql("SELECT * FROM t WHERE ps IS NULL"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, null.asInstanceOf[Integer])
+          } yield Row(i, i.toString, pi, null))
+      }
+    }
+  }
+
+  test("read partitioned table - with nulls and partition keys are included in Orc file") {
+    withTempDir { base =>
+      for {
+        pi <- Seq(1, 2)
+        ps <- Seq("foo", null.asInstanceOf[String])
+      } {
+        makeOrcFile(
+          (1 to 10).map(i => OrcParDataWithKey(i, pi, i.toString, ps)),
+          makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
+      }
+
+      read
+        .format("orc")
+        .option(ConfVars.DEFAULTPARTITIONNAME.varname, defaultPartitionName)
+        .load(base.getCanonicalPath)
+        .registerTempTable("t")
+
+      withTempTable("t") {
+        checkAnswer(
+          sql("SELECT * FROM t"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, 2)
+            ps <- Seq("foo", null.asInstanceOf[String])
+          } yield Row(i, pi, i.toString, ps))
+
+        checkAnswer(
+          sql("SELECT * FROM t WHERE ps IS NULL"),
+          for {
+            i <- 1 to 10
+            pi <- Seq(1, 2)
+          } yield Row(i, pi, i.toString, null))
+      }
+    }
+  }
+}
+
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
new file mode 100644
index 0000000000000..cdd6e705f4a2c
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import java.io.File
+
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars
+import org.apache.hadoop.hive.ql.io.orc.CompressionKind
+import org.scalatest.{BeforeAndAfterAll, FunSuiteLike}
+
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.hive.test.TestHive._
+import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+case class AllDataTypesWithNonPrimitiveType(
+    stringField: String,
+    intField: Int,
+    longField: Long,
+    floatField: Float,
+    doubleField: Double,
+    shortField: Short,
+    byteField: Byte,
+    booleanField: Boolean,
+    array: Seq[Int],
+    arrayContainsNull: Seq[Option[Int]],
+    map: Map[Int, Long],
+    mapValueContainsNull: Map[Int, Option[Long]],
+    data: (Seq[Int], (Int, String)))
+
+case class BinaryData(binaryData: Array[Byte])
+
+case class Contact(name: String, phone: String)
+
+case class Person(name: String, age: Int, contacts: Seq[Contact])
+
+class OrcQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll with OrcTest {
+  override val sqlContext = TestHive
+
+  import TestHive.read
+
+  def getTempFilePath(prefix: String, suffix: String = ""): File = {
+    val tempFile = File.createTempFile(prefix, suffix)
+    tempFile.delete()
+    tempFile
+  }
+
+  test("Read/write All Types") {
+    val data = (0 to 255).map { i =>
+      (s"$i", i, i.toLong, i.toFloat, i.toDouble, i.toShort, i.toByte, i % 2 == 0)
+    }
+
+    withOrcFile(data) { file =>
+      checkAnswer(
+        read.format("orc").load(file),
+        data.toDF().collect())
+    }
+  }
+
+  test("Read/write binary data") {
+    withOrcFile(BinaryData("test".getBytes("utf8")) :: Nil) { file =>
+      val bytes = read.format("orc").load(file).head().getAs[Array[Byte]](0)
+      assert(new String(bytes, "utf8") === "test")
+    }
+  }
+
+  test("Read/write all types with non-primitive type") {
+    val data = (0 to 255).map { i =>
+      AllDataTypesWithNonPrimitiveType(
+        s"$i", i, i.toLong, i.toFloat, i.toDouble, i.toShort, i.toByte, i % 2 == 0,
+        0 until i,
+        (0 until i).map(Option(_).filter(_ % 3 == 0)),
+        (0 until i).map(i => i -> i.toLong).toMap,
+        (0 until i).map(i => i -> Option(i.toLong)).toMap + (i -> None),
+        (0 until i, (i, s"$i")))
+    }
+
+    withOrcFile(data) { file =>
+      checkAnswer(
+        read.format("orc").load(file),
+        data.toDF().collect())
+    }
+  }
+
+  test("Creating case class RDD table") {
+    val data = (1 to 100).map(i => (i, s"val_$i"))
+    sparkContext.parallelize(data).toDF().registerTempTable("t")
+    withTempTable("t") {
+      checkAnswer(sql("SELECT * FROM t"), data.toDF().collect())
+    }
+  }
+
+  test("Simple selection form ORC table") {
+    val data = (1 to 10).map { i =>
+      Person(s"name_$i", i, (0 to 1).map { m => Contact(s"contact_$m", s"phone_$m") })
+    }
+
+    withOrcTable(data, "t") {
+      // ppd:
+      // leaf-0 = (LESS_THAN_EQUALS age 5)
+      // expr = leaf-0
+      assert(sql("SELECT name FROM t WHERE age <= 5").count() === 5)
+
+      // ppd:
+      // leaf-0 = (LESS_THAN_EQUALS age 5)
+      // expr = (not leaf-0)
+      assertResult(10) {
+        sql("SELECT name, contacts FROM t where age > 5")
+          .flatMap(_.getAs[Seq[_]]("contacts"))
+          .count()
+      }
+
+      // ppd:
+      // leaf-0 = (LESS_THAN_EQUALS age 5)
+      // leaf-1 = (LESS_THAN age 8)
+      // expr = (and (not leaf-0) leaf-1)
+      {
+        val df = sql("SELECT name, contacts FROM t WHERE age > 5 AND age < 8")
+        assert(df.count() === 2)
+        assertResult(4) {
+          df.flatMap(_.getAs[Seq[_]]("contacts")).count()
+        }
+      }
+
+      // ppd:
+      // leaf-0 = (LESS_THAN age 2)
+      // leaf-1 = (LESS_THAN_EQUALS age 8)
+      // expr = (or leaf-0 (not leaf-1))
+      {
+        val df = sql("SELECT name, contacts FROM t WHERE age < 2 OR age > 8")
+        assert(df.count() === 3)
+        assertResult(6) {
+          df.flatMap(_.getAs[Seq[_]]("contacts")).count()
+        }
+      }
+    }
+  }
+
+  test("save and load case class RDD with `None`s as orc") {
+    val data = (
+      None: Option[Int],
+      None: Option[Long],
+      None: Option[Float],
+      None: Option[Double],
+      None: Option[Boolean]
+    ) :: Nil
+
+    withOrcFile(data) { file =>
+      checkAnswer(
+        read.format("orc").load(file),
+        Row(Seq.fill(5)(null): _*))
+    }
+  }
+
+  // We only support zlib in Hive 0.12.0 now
+  test("Default compression options for writing to an ORC file") {
+    withOrcFile((1 to 100).map(i => (i, s"val_$i"))) { file =>
+      assertResult(CompressionKind.ZLIB) {
+        OrcFileOperator.getFileReader(file).getCompression
+      }
+    }
+  }
+
+  // Following codec is supported in hive-0.13.1, ignore it now
+  ignore("Other compression options for writing to an ORC file - 0.13.1 and above") {
+    val data = (1 to 100).map(i => (i, s"val_$i"))
+    val conf = sparkContext.hadoopConfiguration
+
+    conf.set(ConfVars.HIVE_ORC_DEFAULT_COMPRESS.varname, "SNAPPY")
+    withOrcFile(data) { file =>
+      assertResult(CompressionKind.SNAPPY) {
+        OrcFileOperator.getFileReader(file).getCompression
+      }
+    }
+
+    conf.set(ConfVars.HIVE_ORC_DEFAULT_COMPRESS.varname, "NONE")
+    withOrcFile(data) { file =>
+      assertResult(CompressionKind.NONE) {
+        OrcFileOperator.getFileReader(file).getCompression
+      }
+    }
+
+    conf.set(ConfVars.HIVE_ORC_DEFAULT_COMPRESS.varname, "LZO")
+    withOrcFile(data) { file =>
+      assertResult(CompressionKind.LZO) {
+        OrcFileOperator.getFileReader(file).getCompression
+      }
+    }
+  }
+
+  test("simple select queries") {
+    withOrcTable((0 until 10).map(i => (i, i.toString)), "t") {
+      checkAnswer(
+        sql("SELECT `_1` FROM t where t.`_1` > 5"),
+        (6 until 10).map(Row.apply(_)))
+
+      checkAnswer(
+        sql("SELECT `_1` FROM t as tmp where tmp.`_1` < 5"),
+        (0 until 5).map(Row.apply(_)))
+    }
+  }
+
+  test("appending") {
+    val data = (0 until 10).map(i => (i, i.toString))
+    createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp")
+    withOrcTable(data, "t") {
+      sql("INSERT INTO TABLE t SELECT * FROM tmp")
+      checkAnswer(table("t"), (data ++ data).map(Row.fromTuple))
+    }
+    catalog.unregisterTable(Seq("tmp"))
+  }
+
+  test("overwriting") {
+    val data = (0 until 10).map(i => (i, i.toString))
+    createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp")
+    withOrcTable(data, "t") {
+      sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp")
+      checkAnswer(table("t"), data.map(Row.fromTuple))
+    }
+    catalog.unregisterTable(Seq("tmp"))
+  }
+
+  test("self-join") {
+    // 4 rows, cells of column 1 of row 2 and row 4 are null
+    val data = (1 to 4).map { i =>
+      val maybeInt = if (i % 2 == 0) None else Some(i)
+      (maybeInt, i.toString)
+    }
+
+    withOrcTable(data, "t") {
+      val selfJoin = sql("SELECT * FROM t x JOIN t y WHERE x.`_1` = y.`_1`")
+      val queryOutput = selfJoin.queryExecution.analyzed.output
+
+      assertResult(4, "Field count mismatches")(queryOutput.size)
+      assertResult(2, "Duplicated expression ID in query plan:\n $selfJoin") {
+        queryOutput.filter(_.name == "_1").map(_.exprId).size
+      }
+
+      checkAnswer(selfJoin, List(Row(1, "1", 1, "1"), Row(3, "3", 3, "3")))
+    }
+  }
+
+  test("nested data - struct with array field") {
+    val data = (1 to 10).map(i => Tuple1((i, Seq("val_$i"))))
+    withOrcTable(data, "t") {
+      checkAnswer(sql("SELECT `_1`.`_2`[0] FROM t"), data.map {
+        case Tuple1((_, Seq(string))) => Row(string)
+      })
+    }
+  }
+
+  test("nested data - array of struct") {
+    val data = (1 to 10).map(i => Tuple1(Seq(i -> "val_$i")))
+    withOrcTable(data, "t") {
+      checkAnswer(sql("SELECT `_1`[0].`_2` FROM t"), data.map {
+        case Tuple1(Seq((_, string))) => Row(string)
+      })
+    }
+  }
+
+  test("columns only referenced by pushed down filters should remain") {
+    withOrcTable((1 to 10).map(Tuple1.apply), "t") {
+      checkAnswer(sql("SELECT `_1` FROM t WHERE `_1` < 10"), (1 to 9).map(Row.apply(_)))
+    }
+  }
+
+  test("SPARK-5309 strings stored using dictionary compression in orc") {
+    withOrcTable((0 until 1000).map(i => ("same", "run_" + i / 100, 1)), "t") {
+      checkAnswer(
+        sql("SELECT `_1`, `_2`, SUM(`_3`) FROM t GROUP BY `_1`, `_2`"),
+        (0 until 10).map(i => Row("same", "run_" + i, 100)))
+
+      checkAnswer(
+        sql("SELECT `_1`, `_2`, SUM(`_3`) FROM t WHERE `_2` = 'run_5' GROUP BY `_1`, `_2`"),
+        List(Row("same", "run_5", 100)))
+    }
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
new file mode 100644
index 0000000000000..82e08caf46457
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import java.io.File
+
+import org.scalatest.BeforeAndAfterAll
+
+import org.apache.spark.sql.hive.test.TestHive._
+import org.apache.spark.sql.{QueryTest, Row}
+
+case class OrcData(intField: Int, stringField: String)
+
+abstract class OrcSuite extends QueryTest with BeforeAndAfterAll {
+  var orcTableDir: File = null
+  var orcTableAsDir: File = null
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+
+    orcTableAsDir = File.createTempFile("orctests", "sparksql")
+    orcTableAsDir.delete()
+    orcTableAsDir.mkdir()
+
+    // Hack: to prepare orc data files using hive external tables
+    orcTableDir = File.createTempFile("orctests", "sparksql")
+    orcTableDir.delete()
+    orcTableDir.mkdir()
+    import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+    sparkContext
+      .makeRDD(1 to 10)
+      .map(i => OrcData(i, s"part-$i"))
+      .toDF()
+      .registerTempTable(s"orc_temp_table")
+
+    sql(
+      s"""CREATE EXTERNAL TABLE normal_orc(
+         |  intField INT,
+         |  stringField STRING
+         |)
+         |STORED AS ORC
+         |LOCATION '${orcTableAsDir.getCanonicalPath}'
+       """.stripMargin)
+
+    sql(
+      s"""INSERT INTO TABLE normal_orc
+         |SELECT intField, stringField FROM orc_temp_table
+       """.stripMargin)
+  }
+
+  override def afterAll(): Unit = {
+    orcTableDir.delete()
+    orcTableAsDir.delete()
+  }
+
+  test("create temporary orc table") {
+    checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10))
+
+    checkAnswer(
+      sql("SELECT * FROM normal_orc_source"),
+      (1 to 10).map(i => Row(i, s"part-$i")))
+
+    checkAnswer(
+      sql("SELECT * FROM normal_orc_source where intField > 5"),
+      (6 to 10).map(i => Row(i, s"part-$i")))
+
+    checkAnswer(
+      sql("SELECT COUNT(intField), stringField FROM normal_orc_source GROUP BY stringField"),
+      (1 to 10).map(i => Row(1, s"part-$i")))
+  }
+
+  test("create temporary orc table as") {
+    checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_as_source"), Row(10))
+
+    checkAnswer(
+      sql("SELECT * FROM normal_orc_source"),
+      (1 to 10).map(i => Row(i, s"part-$i")))
+
+    checkAnswer(
+      sql("SELECT * FROM normal_orc_source WHERE intField > 5"),
+      (6 to 10).map(i => Row(i, s"part-$i")))
+
+    checkAnswer(
+      sql("SELECT COUNT(intField), stringField FROM normal_orc_source GROUP BY stringField"),
+      (1 to 10).map(i => Row(1, s"part-$i")))
+  }
+
+  test("appending insert") {
+    sql("INSERT INTO TABLE normal_orc_source SELECT * FROM orc_temp_table WHERE intField > 5")
+
+    checkAnswer(
+      sql("SELECT * FROM normal_orc_source"),
+      (1 to 5).map(i => Row(i, s"part-$i")) ++ (6 to 10).flatMap { i =>
+        Seq.fill(2)(Row(i, s"part-$i"))
+      })
+  }
+
+  test("overwrite insert") {
+    sql(
+      """INSERT OVERWRITE TABLE normal_orc_as_source
+        |SELECT * FROM orc_temp_table WHERE intField > 5
+      """.stripMargin)
+
+    checkAnswer(
+      sql("SELECT * FROM normal_orc_as_source"),
+      (6 to 10).map(i => Row(i, s"part-$i")))
+  }
+}
+
+class OrcSourceSuite extends OrcSuite {
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+
+    sql(
+      s"""CREATE TEMPORARY TABLE normal_orc_source
+         |USING org.apache.spark.sql.hive.orc
+         |OPTIONS (
+         |  PATH '${new File(orcTableAsDir.getAbsolutePath).getCanonicalPath}'
+         |)
+       """.stripMargin)
+
+    sql(
+      s"""CREATE TEMPORARY TABLE normal_orc_as_source
+         |USING org.apache.spark.sql.hive.orc
+         |OPTIONS (
+         |  PATH '${new File(orcTableAsDir.getAbsolutePath).getCanonicalPath}'
+         |)
+       """.stripMargin)
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
new file mode 100644
index 0000000000000..750f0b04aaa87
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import java.io.File
+
+import scala.reflect.ClassTag
+import scala.reflect.runtime.universe.TypeTag
+
+import org.apache.spark.sql.hive.HiveContext
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql._
+
+private[sql] trait OrcTest extends SQLTestUtils {
+  protected def hiveContext = sqlContext.asInstanceOf[HiveContext]
+
+  import sqlContext.sparkContext
+  import sqlContext.implicits._
+
+  /**
+   * Writes `data` to a Orc file, which is then passed to `f` and will be deleted after `f`
+   * returns.
+   */
+  protected def withOrcFile[T <: Product: ClassTag: TypeTag]
+      (data: Seq[T])
+      (f: String => Unit): Unit = {
+    withTempPath { file =>
+      sparkContext.parallelize(data).toDF().write.format("orc").save(file.getCanonicalPath)
+      f(file.getCanonicalPath)
+    }
+  }
+
+  /**
+   * Writes `data` to a Orc file and reads it back as a [[DataFrame]],
+   * which is then passed to `f`. The Orc file will be deleted after `f` returns.
+   */
+  protected def withOrcDataFrame[T <: Product: ClassTag: TypeTag]
+      (data: Seq[T])
+      (f: DataFrame => Unit): Unit = {
+    withOrcFile(data)(path => f(hiveContext.read.format("orc").load(path)))
+  }
+
+  /**
+   * Writes `data` to a Orc file, reads it back as a [[DataFrame]] and registers it as a
+   * temporary table named `tableName`, then call `f`. The temporary table together with the
+   * Orc file will be dropped/deleted after `f` returns.
+   */
+  protected def withOrcTable[T <: Product: ClassTag: TypeTag]
+      (data: Seq[T], tableName: String)
+      (f: => Unit): Unit = {
+    withOrcDataFrame(data) { df =>
+      hiveContext.registerDataFrameAsTable(df, tableName)
+      withTempTable(tableName)(f)
+    }
+  }
+
+  protected def makeOrcFile[T <: Product: ClassTag: TypeTag](
+      data: Seq[T], path: File): Unit = {
+    data.toDF().write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath)
+  }
+
+  protected def makeOrcFile[T <: Product: ClassTag: TypeTag](
+      df: DataFrame, path: File): Unit = {
+    df.write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath)
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index 9d9b436cabe3c..ad4a4826c6b45 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -23,12 +23,10 @@ import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.parquet.ParquetTest
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 
-// TODO Don't extend ParquetTest
-// This test suite extends ParquetTest for some convenient utility methods. These methods should be
-// moved to some more general places, maybe QueryTest.
-class HadoopFsRelationTest extends QueryTest with ParquetTest {
+abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
   override val sqlContext: SQLContext = TestHive
 
   import sqlContext._

From fc2480ed13742a99470b5012ca3a75ab91e5a5e5 Mon Sep 17 00:00:00 2001
From: scwf <wangfei1@huawei.com>
Date: Mon, 18 May 2015 12:05:14 -0700
Subject: [PATCH 049/525] [SPARK-7631] [SQL] treenode argString should not
 print children

spark-sql>
> explain extended
> select * from (
> select key from src union all
> select key from src) t;

now the spark plan will print children in argString
```
== Physical Plan ==
Union[ HiveTableScan key#1, (MetastoreRelation default, src, None), None,
HiveTableScan key#3, (MetastoreRelation default, src, None), None]
HiveTableScan key#1, (MetastoreRelation default, src, None), None
HiveTableScan key#3, (MetastoreRelation default, src, None), None
```

after this patch:
```
== Physical Plan ==
Union
 HiveTableScan [key#1], (MetastoreRelation default, src, None), None
 HiveTableScan [key#3], (MetastoreRelation default, src, None), None
```

I have tested this locally

Author: scwf <wangfei1@huawei.com>

Closes #6144 from scwf/fix-argString and squashes the following commits:

1a642e0 [scwf] fix treenode argString
---
 .../scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index bc2ad34523d2c..28e15566f0961 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -385,6 +385,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
   def argString: String = productIterator.flatMap {
     case tn: TreeNode[_] if children contains tn => Nil
     case tn: TreeNode[_] if tn.toString contains "\n" => s"(${tn.simpleString})" :: Nil
+    case seq: Seq[BaseType] if seq.toSet.subsetOf(children.toSet) => Nil
     case seq: Seq[_] => seq.mkString("[", ",", "]") :: Nil
     case set: Set[_] => set.mkString("{", ",", "}") :: Nil
     case other => other :: Nil

From 103c863c2ef3d9e6186cfc7d95251a9515e9f180 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 18 May 2015 12:08:28 -0700
Subject: [PATCH 050/525] [SPARK-7269] [SQL] Incorrect analysis for
 aggregation(use semanticEquals)

A modified version of https://github.com/apache/spark/pull/6110, use `semanticEquals` to make it more efficient.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6173 from cloud-fan/7269 and squashes the following commits:

e4a3cc7 [Wenchen Fan] address comments
cc02045 [Wenchen Fan] consider elements length equal
d7ff8f4 [Wenchen Fan] fix 7269
---
 .../sql/catalyst/analysis/Analyzer.scala      | 29 +++++--------------
 .../sql/catalyst/analysis/CheckAnalysis.scala |  4 +--
 .../sql/catalyst/expressions/Expression.scala | 13 +++++++++
 .../expressions/namedExpressions.scala        |  5 ++++
 .../sql/catalyst/planning/patterns.scala      |  5 ++--
 .../sql/hive/execution/SQLQuerySuite.scala    | 18 ++++++++++++
 6 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 0b6e1d44b9c4d..dfa4215f2efe5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -25,7 +25,6 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.types._
-import org.apache.spark.util.collection.OpenHashSet
 
 /**
  * A trivial [[Analyzer]] with an [[EmptyCatalog]] and [[EmptyFunctionRegistry]]. Used for testing
@@ -142,25 +141,6 @@ class Analyzer(
   }
 
   object ResolveGroupingAnalytics extends Rule[LogicalPlan] {
-    /**
-     * Extract attribute set according to the grouping id
-     * @param bitmask bitmask to represent the selected of the attribute sequence
-     * @param exprs the attributes in sequence
-     * @return the attributes of non selected specified via bitmask (with the bit set to 1)
-     */
-    private def buildNonSelectExprSet(bitmask: Int, exprs: Seq[Expression])
-    : OpenHashSet[Expression] = {
-      val set = new OpenHashSet[Expression](2)
-
-      var bit = exprs.length - 1
-      while (bit >= 0) {
-        if (((bitmask >> bit) & 1) == 0) set.add(exprs(bit))
-        bit -= 1
-      }
-
-      set
-    }
-
     /*
      *  GROUP BY a, b, c WITH ROLLUP
      *  is equivalent to
@@ -197,10 +177,15 @@ class Analyzer(
 
       g.bitmasks.foreach { bitmask =>
         // get the non selected grouping attributes according to the bit mask
-        val nonSelectedGroupExprSet = buildNonSelectExprSet(bitmask, g.groupByExprs)
+        val nonSelectedGroupExprs = ArrayBuffer.empty[Expression]
+        var bit = g.groupByExprs.length - 1
+        while (bit >= 0) {
+          if (((bitmask >> bit) & 1) == 0) nonSelectedGroupExprs += g.groupByExprs(bit)
+          bit -= 1
+        }
 
         val substitution = (g.child.output :+ g.gid).map(expr => expr transformDown {
-          case x: Expression if nonSelectedGroupExprSet.contains(x) =>
+          case x: Expression if nonSelectedGroupExprs.find(_ semanticEquals x).isDefined =>
             // if the input attribute in the Invalid Grouping Expression set of for this group
             // replace it with constant null
             Literal.create(null, expr.dataType)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index f104e742c90fe..06a0504359f6e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -86,12 +86,12 @@ trait CheckAnalysis {
           case Aggregate(groupingExprs, aggregateExprs, child) =>
             def checkValidAggregateExpression(expr: Expression): Unit = expr match {
               case _: AggregateExpression => // OK
-              case e: Attribute if !groupingExprs.contains(e) =>
+              case e: Attribute if groupingExprs.find(_ semanticEquals e).isEmpty =>
                 failAnalysis(
                   s"expression '${e.prettyString}' is neither present in the group by, " +
                     s"nor is it an aggregate function. " +
                     "Add to group by or wrap in first() if you don't care which value you get.")
-              case e if groupingExprs.contains(e) => // OK
+              case e if groupingExprs.find(_ semanticEquals e).isDefined => // OK
               case e if e.references.isEmpty => // OK
               case e => e.children.foreach(checkValidAggregateExpression)
             }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 0837a3179d897..c7ae9da7fce49 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -76,6 +76,19 @@ abstract class Expression extends TreeNode[Expression] {
       case u: UnresolvedAttribute => PrettyAttribute(u.name)
     }.toString
   }
+
+  /**
+   * Returns true when two expressions will always compute the same result, even if they differ
+   * cosmetically (i.e. capitalization of names in attributes may be different).
+   */
+  def semanticEquals(other: Expression): Boolean = this.getClass == other.getClass && {
+    val elements1 = this.productIterator.toSeq
+    val elements2 = other.asInstanceOf[Product].productIterator.toSeq
+    elements1.length == elements2.length && elements1.zip(elements2).forall {
+      case (e1: Expression, e2: Expression) => e1 semanticEquals e2
+      case (i1, i2) => i1 == i2
+    }
+  }
 }
 
 abstract class BinaryExpression extends Expression with trees.BinaryNode[Expression] {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index a9170589f8c6c..50be26d0b08b5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -181,6 +181,11 @@ case class AttributeReference(
     case _ => false
   }
 
+  override def semanticEquals(other: Expression): Boolean = other match {
+    case ar: AttributeReference => sameRef(ar)
+    case _ => false
+  }
+
   override def hashCode: Int = {
     // See http://stackoverflow.com/questions/113511/hash-code-implementation
     var h = 17
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
index cd54d04814ea4..1dd75a8846303 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
@@ -159,9 +159,10 @@ object PartialAggregation {
             // Should trim aliases around `GetField`s. These aliases are introduced while
             // resolving struct field accesses, because `GetField` is not a `NamedExpression`.
             // (Should we just turn `GetField` into a `NamedExpression`?)
+            val trimmed = e.transform { case Alias(g: ExtractValue, _) => g }
             namedGroupingExpressions
-              .get(e.transform { case Alias(g: ExtractValue, _) => g })
-              .map(_.toAttribute)
+              .find { case (k, v) => k semanticEquals trimmed }
+              .map(_._2.toAttribute)
               .getOrElse(e)
         }).asInstanceOf[Seq[NamedExpression]]
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index ca2c4b4019c55..e60d00e63574d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -773,4 +773,22 @@ class SQLQuerySuite extends QueryTest {
         | select * from v2 order by key limit 1
       """.stripMargin), Row(0, 3))
   }
+
+  test("SPARK-7269 Check analysis failed in case in-sensitive") {
+    Seq(1, 2, 3).map { i =>
+      (i.toString, i.toString)
+    }.toDF("key", "value").registerTempTable("df_analysis")
+    sql("SELECT kEy from df_analysis group by key").collect()
+    sql("SELECT kEy+3 from df_analysis group by key+3").collect()
+    sql("SELECT kEy+3, a.kEy, A.kEy from df_analysis A group by key").collect()
+    sql("SELECT cast(kEy+1 as Int) from df_analysis A group by cast(key+1 as int)").collect()
+    sql("SELECT cast(kEy+1 as Int) from df_analysis A group by key+1").collect()
+    sql("SELECT 2 from df_analysis A group by key+1").collect()
+    intercept[AnalysisException] {
+      sql("SELECT kEy+1 from df_analysis group by key+3")
+    }
+    intercept[AnalysisException] {
+      sql("SELECT cast(key+2 as Int) from df_analysis A group by cast(key+1 as int)")
+    }
+  }
 }

From 530397ba2f5c0fcabb86ba73048c95177ed0b9fc Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Mon, 18 May 2015 12:17:10 -0700
Subject: [PATCH 051/525] [SPARK-7567] [SQL] [follow-up] Use a new flag to set
 output committer based on mapreduce apis

cc liancheng marmbrus

Author: Yin Huai <yhuai@databricks.com>

Closes #6130 from yhuai/directOutput and squashes the following commits:

312b07d [Yin Huai] A data source can use spark.sql.sources.outputCommitterClass to override the output committer.
---
 .../scala/org/apache/spark/sql/SQLConf.scala  |  4 +++
 .../apache/spark/sql/parquet/newParquet.scala |  2 +-
 .../apache/spark/sql/sources/commands.scala   | 29 ++++++++++++++-----
 .../apache/spark/sql/sources/interfaces.scala |  3 +-
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 6da910e332e9b..77c6af27d1007 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -71,6 +71,10 @@ private[spark] object SQLConf {
   // Whether to perform partition discovery when loading external data sources.  Default to true.
   val PARTITION_DISCOVERY_ENABLED = "spark.sql.sources.partitionDiscovery.enabled"
 
+  // The output committer class used by FSBasedRelation. The specified class needs to be a
+  // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
+  val OUTPUT_COMMITTER_CLASS = "spark.sql.sources.outputCommitterClass"
+
   // Whether to perform eager analysis when constructing a dataframe.
   // Set to false when debugging requires the ability to look at invalid query plans.
   val DATAFRAME_EAGER_ANALYSIS = "spark.sql.eagerAnalysis"
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index bcbdb1ebd236a..fea54a251461d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -197,7 +197,7 @@ private[sql] class ParquetRelation2(
         classOf[ParquetOutputCommitter])
 
     conf.setClass(
-      "mapred.output.committer.class",
+      SQLConf.OUTPUT_COMMITTER_CLASS,
       committerClass,
       classOf[ParquetOutputCommitter])
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index a09bb08de736a..d54dbb0831444 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -23,7 +23,7 @@ import scala.collection.mutable
 
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, FileOutputFormat}
+import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat}
 import org.apache.hadoop.util.Shell
 import parquet.hadoop.util.ContextUtil
 
@@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.RunnableCommand
-import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
+import org.apache.spark.sql.{SQLConf, DataFrame, SQLContext, SaveMode}
 
 private[sql] case class InsertIntoDataSource(
     logicalRelation: LogicalRelation,
@@ -287,24 +287,39 @@ private[sql] abstract class BaseWriterContainer(
   protected def getWorkPath: String = {
     outputCommitter match {
       // FileOutputCommitter writes to a temporary location returned by `getWorkPath`.
-      case f: FileOutputCommitter => f.getWorkPath.toString
+      case f: MapReduceFileOutputCommitter => f.getWorkPath.toString
       case _ => outputPath
     }
   }
 
   private def newOutputCommitter(context: TaskAttemptContext): OutputCommitter = {
     val committerClass = context.getConfiguration.getClass(
-      "mapred.output.committer.class", null, classOf[OutputCommitter])
+      SQLConf.OUTPUT_COMMITTER_CLASS, null, classOf[OutputCommitter])
 
     Option(committerClass).map { clazz =>
-      val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext])
-      ctor.newInstance(new Path(outputPath), context)
+      // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat
+      // has an associated output committer. To override this output committer,
+      // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS.
+      // If a data source needs to override the output committer, it needs to set the
+      // output committer in prepareForWrite method.
+      if (classOf[MapReduceFileOutputCommitter].isAssignableFrom(clazz)) {
+        // The specified output committer is a FileOutputCommitter.
+        // So, we will use the FileOutputCommitter-specified constructor.
+        val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext])
+        ctor.newInstance(new Path(outputPath), context)
+      } else {
+        // The specified output committer is just a OutputCommitter.
+        // So, we will use the no-argument constructor.
+        val ctor = clazz.getDeclaredConstructor()
+        ctor.newInstance()
+      }
     }.getOrElse {
+      // If output committer class is not set, we will use the one associated with the
+      // file output format.
       outputFormatClass.newInstance().getOutputCommitter(context)
     }
   }
 
-
   private def setupIDs(jobId: Int, splitId: Int, attemptId: Int): Unit = {
     this.jobId = SparkHadoopWriter.createJobID(new Date, jobId)
     this.taskId = new TaskID(this.jobId, true, splitId)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 274ab4485217a..a82a6758d2537 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -527,7 +527,8 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
 
   /**
    * Prepares a write job and returns an [[OutputWriterFactory]].  Client side job preparation can
-   * be put here.  For example, user defined output committer can be configured here.
+   * be put here.  For example, user defined output committer can be configured here
+   * by setting the output committer class in the conf of spark.sql.sources.outputCommitterClass.
    *
    * Note that the only side effect expected here is mutating `job` via its setters.  Especially,
    * Spark SQL caches [[BaseRelation]] instances for performance, mutating relation internal states

From 9dadf019b93038e1e18336ccd06c5eecb4bae32f Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Mon, 18 May 2015 12:45:37 -0700
Subject: [PATCH 052/525] [SPARK-7673] [SQL] WIP: HadoopFsRelation and
 ParquetRelation2 performance optimizations

This PR introduces several performance optimizations to `HadoopFsRelation` and `ParquetRelation2`:

1.  Moving `FileStatus` listing from `DataSourceStrategy` into a cache within `HadoopFsRelation`.

    This new cache generalizes and replaces the one used in `ParquetRelation2`.

    This also introduces an interface change: to reuse cached `FileStatus` objects, `HadoopFsRelation.buildScan` methods now receive `Array[FileStatus]` instead of `Array[String]`.

1.  When Parquet task side metadata reading is enabled, skip reading row group information when reading Parquet footers.

    This is basically what PR #5334 does. Also, now we uses `ParquetFileReader.readAllFootersInParallel` to read footers in parallel.

Another optimization in question is, instead of asking `HadoopFsRelation.buildScan` to return an `RDD[Row]` for a single selected partition and then union them all, we ask it to return an `RDD[Row]` for all selected partitions. This optimization is based on the fact that Hadoop configuration broadcasting used in `NewHadoopRDD` takes 34% time in the following microbenchmark.  However, this complicates data source user code because user code must merge partition values manually.

To check the cost of broadcasting in `NewHadoopRDD`, I also did microbenchmark after removing the `broadcast` call in `NewHadoopRDD`.  All results are shown below.

### Microbenchmark

#### Preparation code

Generating a partitioned table with 50k partitions, 1k rows per partition:

```scala
import sqlContext._
import sqlContext.implicits._

for (n <- 0 until 500) {
  val data = for {
    p <- (n * 10) until ((n + 1) * 10)
    i <- 0 until 1000
  } yield (i, f"val_$i%04d", f"$p%04d")

  data.
    toDF("a", "b", "p").
    write.
    partitionBy("p").
    mode("append").
    parquet(path)
}
```

#### Benchmarking code

```scala
import sqlContext._
import sqlContext.implicits._

import org.apache.spark.sql.types._
import com.google.common.base.Stopwatch

val path = "hdfs://localhost:9000/user/lian/5k"

def benchmark(n: Int)(f: => Unit) {
  val stopwatch = new Stopwatch()

  def run() = {
    stopwatch.reset()
    stopwatch.start()
    f
    stopwatch.stop()
    stopwatch.elapsedMillis()
  }

  val records = (0 until n).map(_ => run())

  (0 until n).foreach(i => println(s"Round $i: ${records(i)} ms"))
  println(s"Average: ${records.sum / n.toDouble} ms")
}

benchmark(3) { read.parquet(path).explain(extended = true) }
```

#### Results

Before:

```
Round 0: 72528 ms
Round 1: 68938 ms
Round 2: 65372 ms
Average: 68946.0 ms
```

After:

```
Round 0: 59499 ms
Round 1: 53645 ms
Round 2: 53844 ms
Round 3: 49093 ms
Round 4: 50555 ms
Average: 53327.2 ms
```

Also removing Hadoop configuration broadcasting:

(Note that I was testing on a local laptop, thus network cost is pretty low.)

```
Round 0: 15806 ms
Round 1: 14394 ms
Round 2: 14699 ms
Round 3: 15334 ms
Round 4: 14123 ms
Average: 14871.2 ms
```

Author: Cheng Lian <lian@databricks.com>

Closes #6225 from liancheng/spark-7673 and squashes the following commits:

2d58a2b [Cheng Lian] Skips reading row group information when using task side metadata reading
7aa3748 [Cheng Lian] Optimizes FileStatusCache by introducing a map from parent directories to child files
ba41250 [Cheng Lian] Reuses HadoopFsRelation FileStatusCache in ParquetRelation2
3d278f7 [Cheng Lian] Fixes a bug when reading a single Parquet data file
b84612a [Cheng Lian] Fixes Scala style issue
6a08b02 [Cheng Lian] WIP: Moves file status cache into HadoopFSRelation
---
 .../apache/spark/sql/parquet/newParquet.scala |  61 +++++-----
 .../sql/sources/DataSourceStrategy.scala      |  37 ++-----
 .../apache/spark/sql/sources/interfaces.scala | 104 ++++++++++++++----
 .../sql/sources/SimpleTextRelation.scala      |   6 +-
 4 files changed, 117 insertions(+), 91 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index fea54a251461d..7ca44f7b81a2d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -23,12 +23,11 @@ import scala.collection.JavaConversions._
 import scala.util.Try
 
 import com.google.common.base.Objects
-import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
+import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 import parquet.filter2.predicate.FilterApi
-import parquet.format.converter.ParquetMetadataConverter
 import parquet.hadoop._
 import parquet.hadoop.metadata.CompressionCodecName
 import parquet.hadoop.util.ContextUtil
@@ -175,8 +174,8 @@ private[sql] class ParquetRelation2(
   override def dataSchema: StructType = metadataCache.dataSchema
 
   override private[sql] def refresh(): Unit = {
-    metadataCache.refresh()
     super.refresh()
+    metadataCache.refresh()
   }
 
   // Parquet data source always uses Catalyst internal representations.
@@ -234,15 +233,15 @@ private[sql] class ParquetRelation2(
   override def buildScan(
       requiredColumns: Array[String],
       filters: Array[Filter],
-      inputPaths: Array[String]): RDD[Row] = {
+      inputFiles: Array[FileStatus]): RDD[Row] = {
 
     val job = new Job(SparkHadoopUtil.get.conf)
     val conf = ContextUtil.getConfiguration(job)
 
     ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport])
 
-    if (inputPaths.nonEmpty) {
-      FileInputFormat.setInputPaths(job, inputPaths.map(new Path(_)): _*)
+    if (inputFiles.nonEmpty) {
+      FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*)
     }
 
     // Try to push down filters when filter push-down is enabled.
@@ -269,10 +268,7 @@ private[sql] class ParquetRelation2(
     val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean
     conf.set(SQLConf.PARQUET_CACHE_METADATA, useMetadataCache.toString)
 
-    val inputFileStatuses =
-      metadataCache.dataStatuses.filter(f => inputPaths.contains(f.getPath.toString))
-
-    val footers = inputFileStatuses.map(metadataCache.footers)
+    val footers = inputFiles.map(f => metadataCache.footers(f.getPath))
 
     // TODO Stop using `FilteringParquetRowInputFormat` and overriding `getPartition`.
     // After upgrading to Parquet 1.6.0, we should be able to stop caching `FileStatus` objects and
@@ -287,7 +283,7 @@ private[sql] class ParquetRelation2(
 
       val cacheMetadata = useMetadataCache
 
-      @transient val cachedStatuses = inputFileStatuses.map { f =>
+      @transient val cachedStatuses = inputFiles.map { f =>
         // In order to encode the authority of a Path containing special characters such as /,
         // we need to use the string returned by the URI of the path to create a new Path.
         val pathWithAuthority = new Path(f.getPath.toUri.toString)
@@ -333,7 +329,7 @@ private[sql] class ParquetRelation2(
     private var commonMetadataStatuses: Array[FileStatus] = _
 
     // Parquet footer cache.
-    var footers: Map[FileStatus, Footer] = _
+    var footers: Map[Path, Footer] = _
 
     // `FileStatus` objects of all data files (Parquet part-files).
     var dataStatuses: Array[FileStatus] = _
@@ -349,35 +345,30 @@ private[sql] class ParquetRelation2(
      * Refreshes `FileStatus`es, footers, partition spec, and table schema.
      */
     def refresh(): Unit = {
-      // Support either reading a collection of raw Parquet part-files, or a collection of folders
-      // containing Parquet files (e.g. partitioned Parquet table).
-      val baseStatuses = paths.distinct.flatMap { p =>
-        val path = new Path(p)
-        val fs = path.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
-        val qualified = path.makeQualified(fs.getUri, fs.getWorkingDirectory)
-        Try(fs.getFileStatus(qualified)).toOption
-      }
-      assert(baseStatuses.forall(!_.isDir) || baseStatuses.forall(_.isDir))
-
       // Lists `FileStatus`es of all leaf nodes (files) under all base directories.
-      val leaves = baseStatuses.flatMap { f =>
-        val fs = FileSystem.get(f.getPath.toUri, SparkHadoopUtil.get.conf)
-        SparkHadoopUtil.get.listLeafStatuses(fs, f.getPath).filter { f =>
-          isSummaryFile(f.getPath) ||
-            !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith("."))
-        }
-      }
+      val leaves = cachedLeafStatuses().filter { f =>
+        isSummaryFile(f.getPath) ||
+          !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith("."))
+      }.toArray
 
       dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath))
       metadataStatuses = leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE)
       commonMetadataStatuses =
         leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
 
-      footers = (dataStatuses ++ metadataStatuses ++ commonMetadataStatuses).par.map { f =>
-        val parquetMetadata = ParquetFileReader.readFooter(
-          SparkHadoopUtil.get.conf, f, ParquetMetadataConverter.NO_FILTER)
-        f -> new Footer(f.getPath, parquetMetadata)
-      }.seq.toMap
+      footers = {
+        val conf = SparkHadoopUtil.get.conf
+        val taskSideMetaData = conf.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
+        val rawFooters = if (shouldMergeSchemas) {
+          ParquetFileReader.readAllFootersInParallel(
+            conf, seqAsJavaList(leaves), taskSideMetaData)
+        } else {
+          ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(
+            conf, seqAsJavaList(leaves), taskSideMetaData)
+        }
+
+        rawFooters.map(footer => footer.getFile -> footer).toMap
+      }
 
       // If we already get the schema, don't need to re-compute it since the schema merging is
       // time-consuming.
@@ -448,7 +439,7 @@ private[sql] class ParquetRelation2(
         "No schema defined, " +
           s"and no Parquet data file or summary file found under ${paths.mkString(", ")}.")
 
-      ParquetRelation2.readSchema(filesToTouch.map(footers.apply), sqlContext)
+      ParquetRelation2.readSchema(filesToTouch.map(f => footers.apply(f.getPath)), sqlContext)
     }
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index e6324b20b3065..1615a6dcbdb2a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -17,20 +17,16 @@
 
 package org.apache.spark.sql.sources
 
-import org.apache.hadoop.fs.Path
-
 import org.apache.spark.Logging
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.rdd.{UnionRDD, RDD}
-import org.apache.spark.sql.Row
+import org.apache.spark.rdd.{RDD, UnionRDD}
 import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.types.{StructType, UTF8String, StringType}
-import org.apache.spark.sql._
+import org.apache.spark.sql.types.{StringType, StructType, UTF8String}
+import org.apache.spark.sql.{SaveMode, Strategy, execution, sources}
 
 /**
  * A Strategy for planning scans over data sources defined using the sources API.
@@ -58,7 +54,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         filters,
         (a, _) => t.buildScan(a)) :: Nil
 
-    // Scanning partitioned FSBasedRelation
+    // Scanning partitioned HadoopFsRelation
     case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: HadoopFsRelation))
         if t.partitionSpec.partitionColumns.nonEmpty =>
       val selectedPartitions = prunePartitions(filters, t.partitionSpec).toArray
@@ -86,22 +82,13 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         t.partitionSpec.partitionColumns,
         selectedPartitions) :: Nil
 
-    // Scanning non-partitioned FSBasedRelation
+    // Scanning non-partitioned HadoopFsRelation
     case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: HadoopFsRelation)) =>
-      val inputPaths = t.paths.map(new Path(_)).flatMap { path =>
-        val fs = path.getFileSystem(t.sqlContext.sparkContext.hadoopConfiguration)
-        val qualifiedPath = path.makeQualified(fs.getUri, fs.getWorkingDirectory)
-        SparkHadoopUtil.get.listLeafStatuses(fs, qualifiedPath).map(_.getPath).filterNot { path =>
-          val name = path.getName
-          name.startsWith("_") || name.startsWith(".")
-        }.map(fs.makeQualified(_).toString)
-      }
-
       pruneFilterProject(
         l,
         projectList,
         filters,
-        (a, f) => t.buildScan(a, f, inputPaths)) :: Nil
+        (a, f) => t.buildScan(a, f, t.paths)) :: Nil
 
     case l @ LogicalRelation(t: TableScan) =>
       createPhysicalRDD(l.relation, l.output, t.buildScan()) :: Nil
@@ -130,16 +117,6 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
 
     // Builds RDD[Row]s for each selected partition.
     val perPartitionRows = partitions.map { case Partition(partitionValues, dir) =>
-      // Paths to all data files within this partition
-      val dataFilePaths = {
-        val dirPath = new Path(dir)
-        val fs = dirPath.getFileSystem(SparkHadoopUtil.get.conf)
-        fs.listStatus(dirPath).map(_.getPath).filterNot { path =>
-          val name = path.getName
-          name.startsWith("_") || name.startsWith(".")
-        }.map(fs.makeQualified(_).toString)
-      }
-
       // The table scan operator (PhysicalRDD) which retrieves required columns from data files.
       // Notice that the schema of data files, represented by `relation.dataSchema`, may contain
       // some partition column(s).
@@ -155,7 +132,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
             // assuming partition columns data stored in data files are always consistent with those
             // partition values encoded in partition directory paths.
             val nonPartitionColumns = requiredColumns.filterNot(partitionColNames.contains)
-            val dataRows = relation.buildScan(nonPartitionColumns, filters, dataFilePaths)
+            val dataRows = relation.buildScan(nonPartitionColumns, filters, Array(dir))
 
             // Merges data values with partition values.
             mergeWithPartitionValues(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index a82a6758d2537..9b52d1be3df2d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -17,14 +17,14 @@
 
 package org.apache.spark.sql.sources
 
+import scala.collection.mutable
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
 import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
-import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.expressions._
@@ -368,18 +368,61 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
 
   private var _partitionSpec: PartitionSpec = _
 
+  private class FileStatusCache {
+    var leafFiles = mutable.Map.empty[Path, FileStatus]
+
+    var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]
+
+    var leafDirs = mutable.Map.empty[Path, FileStatus]
+
+    def refresh(): Unit = {
+      def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
+        val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
+        val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
+        files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+      }
+
+      leafDirs.clear()
+      leafFiles.clear()
+
+      // We don't filter files/directories like _temporary/_SUCCESS here, as specific data sources
+      // may take advantages over them (e.g. Parquet _metadata and _common_metadata files).
+      val statuses = paths.flatMap { path =>
+        val hdfsPath = new Path(path)
+        val fs = hdfsPath.getFileSystem(hadoopConf)
+        val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
+        Try(fs.getFileStatus(qualified)).toOption.toArray.flatMap(listLeafFilesAndDirs(fs, _))
+      }
+
+      val (dirs, files) = statuses.partition(_.isDir)
+      leafDirs ++= dirs.map(d => d.getPath -> d).toMap
+      leafFiles ++= files.map(f => f.getPath -> f).toMap
+      leafDirToChildrenFiles ++= files.groupBy(_.getPath.getParent)
+    }
+  }
+
+  private lazy val fileStatusCache = {
+    val cache = new FileStatusCache
+    cache.refresh()
+    cache
+  }
+
+  protected def cachedLeafStatuses(): Set[FileStatus] = {
+    fileStatusCache.leafFiles.values.toSet
+  }
+
   final private[sql] def partitionSpec: PartitionSpec = {
     if (_partitionSpec == null) {
       _partitionSpec = maybePartitionSpec
         .map(spec => spec.copy(partitionColumns = spec.partitionColumns.asNullable))
         .orElse(userDefinedPartitionColumns.map(PartitionSpec(_, Array.empty[Partition])))
         .getOrElse {
-        if (sqlContext.conf.partitionDiscoveryEnabled()) {
-          discoverPartitions()
-        } else {
-          PartitionSpec(StructType(Nil), Array.empty[Partition])
+          if (sqlContext.conf.partitionDiscoveryEnabled()) {
+            discoverPartitions()
+          } else {
+            PartitionSpec(StructType(Nil), Array.empty[Partition])
+          }
         }
-      }
     }
     _partitionSpec
   }
@@ -409,20 +452,14 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   def userDefinedPartitionColumns: Option[StructType] = None
 
   private[sql] def refresh(): Unit = {
+    fileStatusCache.refresh()
     if (sqlContext.conf.partitionDiscoveryEnabled()) {
       _partitionSpec = discoverPartitions()
     }
   }
 
   private def discoverPartitions(): PartitionSpec = {
-    val basePaths = paths.map(new Path(_))
-    val leafDirs = basePaths.flatMap { path =>
-      val fs = path.getFileSystem(hadoopConf)
-      Try(fs.getFileStatus(path.makeQualified(fs.getUri, fs.getWorkingDirectory)))
-        .filter(_.isDir)
-        .map(SparkHadoopUtil.get.listLeafDirStatuses(fs, _))
-        .getOrElse(Seq.empty[FileStatus])
-    }.map(_.getPath)
+    val leafDirs = fileStatusCache.leafDirs.keys.toSeq
 
     if (leafDirs.nonEmpty) {
       PartitioningUtils.parsePartitions(leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME)
@@ -444,6 +481,27 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
     })
   }
 
+  private[sources] final def buildScan(
+      requiredColumns: Array[String],
+      filters: Array[Filter],
+      inputPaths: Array[String]): RDD[Row] = {
+    val inputStatuses = inputPaths.flatMap { input =>
+      val path = new Path(input)
+
+      // First assumes `input` is a directory path, and tries to get all files contained in it.
+      fileStatusCache.leafDirToChildrenFiles.getOrElse(
+        path,
+        // Otherwise, `input` might be a file path
+        fileStatusCache.leafFiles.get(path).toArray
+      ).filter { status =>
+        val name = status.getPath.getName
+        !name.startsWith("_") && !name.startsWith(".")
+      }
+    }
+
+    buildScan(requiredColumns, filters, inputStatuses)
+  }
+
   /**
    * Specifies schema of actual data files.  For partitioned relations, if one or more partitioned
    * columns are contained in the data files, they should also appear in `dataSchema`.
@@ -457,13 +515,13 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
    * this relation. For partitioned relations, this method is called for each selected partition,
    * and builds an `RDD[Row]` containing all rows within that single partition.
    *
-   * @param inputPaths For a non-partitioned relation, it contains paths of all data files in the
+   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
    *        relation. For a partitioned relation, it contains paths of all data files in a single
    *        selected partition.
    *
    * @since 1.4.0
    */
-  def buildScan(inputPaths: Array[String]): RDD[Row] = {
+  def buildScan(inputFiles: Array[FileStatus]): RDD[Row] = {
     throw new UnsupportedOperationException(
       "At least one buildScan() method should be overridden to read the relation.")
   }
@@ -474,13 +532,13 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
    * and builds an `RDD[Row]` containing all rows within that single partition.
    *
    * @param requiredColumns Required columns.
-   * @param inputPaths For a non-partitioned relation, it contains paths of all data files in the
+   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
    *        relation. For a partitioned relation, it contains paths of all data files in a single
    *        selected partition.
    *
    * @since 1.4.0
    */
-  def buildScan(requiredColumns: Array[String], inputPaths: Array[String]): RDD[Row] = {
+  def buildScan(requiredColumns: Array[String], inputFiles: Array[FileStatus]): RDD[Row] = {
     // Yeah, to workaround serialization...
     val dataSchema = this.dataSchema
     val codegenEnabled = this.codegenEnabled
@@ -490,7 +548,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
       BoundReference(dataSchema.fieldIndex(col), field.dataType, field.nullable)
     }.toSeq
 
-    buildScan(inputPaths).mapPartitions { rows =>
+    buildScan(inputFiles).mapPartitions { rows =>
       val buildProjection = if (codegenEnabled) {
         GenerateMutableProjection.generate(requiredOutput, dataSchema.toAttributes)
       } else {
@@ -512,7 +570,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
    *        of all `filters`.  The pushed down filters are currently purely an optimization as they
    *        will all be evaluated again. This means it is safe to use them with methods that produce
    *        false positives such as filtering partitions based on a bloom filter.
-   * @param inputPaths For a non-partitioned relation, it contains paths of all data files in the
+   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
    *        relation. For a partitioned relation, it contains paths of all data files in a single
    *        selected partition.
    *
@@ -521,8 +579,8 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   def buildScan(
       requiredColumns: Array[String],
       filters: Array[Filter],
-      inputPaths: Array[String]): RDD[Row] = {
-    buildScan(requiredColumns, inputPaths)
+      inputFiles: Array[FileStatus]): RDD[Row] = {
+    buildScan(requiredColumns, inputFiles)
   }
 
   /**
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
index 29b21586f9c2a..09eed6646c55a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
@@ -21,7 +21,7 @@ import java.text.NumberFormat
 import java.util.UUID
 
 import com.google.common.base.Objects
-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.io.{NullWritable, Text}
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
 import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
@@ -101,10 +101,10 @@ class SimpleTextRelation(
   override def hashCode(): Int =
     Objects.hashCode(paths, maybeDataSchema, dataSchema)
 
-  override def buildScan(inputPaths: Array[String]): RDD[Row] = {
+  override def buildScan(inputStatuses: Array[FileStatus]): RDD[Row] = {
     val fields = dataSchema.map(_.dataType)
 
-    sparkContext.textFile(inputPaths.mkString(",")).map { record =>
+    sparkContext.textFile(inputStatuses.map(_.getPath).mkString(",")).map { record =>
       Row(record.split(",").zip(fields).map { case (value, dataType) =>
         Cast(Literal(value), dataType).eval()
       }: _*)

From 32fbd297dd651ba3ce4ce52aeb0488233149cdf9 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 18 May 2015 12:55:13 -0700
Subject: [PATCH 053/525] [SPARK-6216] [PYSPARK] check python version of worker
 with driver

This PR revert #5404, change to pass the version of python in driver into JVM, check it in worker before deserializing closure, then it can works with different major version of Python.

Author: Davies Liu <davies@databricks.com>

Closes #6203 from davies/py_version and squashes the following commits:

b8fb76e [Davies Liu] fix test
6ce5096 [Davies Liu] use string for version
47c6278 [Davies Liu] check python version of worker with driver
---
 .../org/apache/spark/api/python/PythonRDD.scala      |  3 +++
 python/pyspark/context.py                            |  1 +
 python/pyspark/rdd.py                                |  4 ++--
 python/pyspark/sql/context.py                        |  1 +
 python/pyspark/sql/functions.py                      |  4 ++--
 python/pyspark/tests.py                              |  6 +++---
 python/pyspark/worker.py                             | 12 +++++++-----
 .../scala/org/apache/spark/sql/UDFRegistration.scala |  2 ++
 .../org/apache/spark/sql/UserDefinedFunction.scala   |  5 +++--
 .../org/apache/spark/sql/execution/pythonUdfs.scala  |  2 ++
 10 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 7409dc2d866f6..2d92f6a42b308 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -47,6 +47,7 @@ private[spark] class PythonRDD(
     pythonIncludes: JList[String],
     preservePartitoning: Boolean,
     pythonExec: String,
+    pythonVer: String,
     broadcastVars: JList[Broadcast[PythonBroadcast]],
     accumulator: Accumulator[JList[Array[Byte]]])
   extends RDD[Array[Byte]](parent) {
@@ -210,6 +211,8 @@ private[spark] class PythonRDD(
         val dataOut = new DataOutputStream(stream)
         // Partition index
         dataOut.writeInt(split.index)
+        // Python version of driver
+        PythonRDD.writeUTF(pythonVer, dataOut)
         // sparkFilesDir
         PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut)
         // Python includes (*.zip and *.egg files)
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 31992795a9e45..d25ee855235be 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -173,6 +173,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize,
             self._jvm.PythonAccumulatorParam(host, port))
 
         self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
+        self.pythonVer = "%d.%d" % sys.version_info[:2]
 
         # Broadcast's __reduce__ method stores Broadcast instances here.
         # This allows other code to determine which Broadcast instances have
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 545c5ad20cb96..70db4bbe4cbc5 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2260,7 +2260,7 @@ def toLocalIterator(self):
 def _prepare_for_python_RDD(sc, command, obj=None):
     # the serialized command will be compressed by broadcast
     ser = CloudPickleSerializer()
-    pickled_command = ser.dumps((command, sys.version_info[:2]))
+    pickled_command = ser.dumps(command)
     if len(pickled_command) > (1 << 20):  # 1M
         # The broadcast will have same life cycle as created PythonRDD
         broadcast = sc.broadcast(pickled_command)
@@ -2344,7 +2344,7 @@ def _jrdd(self):
         python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(),
                                              bytearray(pickled_cmd),
                                              env, includes, self.preservesPartitioning,
-                                             self.ctx.pythonExec,
+                                             self.ctx.pythonExec, self.ctx.pythonVer,
                                              bvars, self.ctx._javaAccumulator)
         self._jrdd_val = python_rdd.asJavaRDD()
 
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index f6f107ca32d2f..0bde7191242ab 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -157,6 +157,7 @@ def registerFunction(self, name, f, returnType=StringType()):
                                             env,
                                             includes,
                                             self._sc.pythonExec,
+                                            self._sc.pythonVer,
                                             bvars,
                                             self._sc._javaAccumulator,
                                             returnType.json())
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 8d0e766ecd3b4..fbe9bf5b526af 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -353,8 +353,8 @@ def _create_judf(self):
         ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
         jdt = ssql_ctx.parseDataType(self.returnType.json())
         fname = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
-        judf = sc._jvm.UserDefinedPythonFunction(fname, bytearray(pickled_command), env,
-                                                 includes, sc.pythonExec, broadcast_vars,
+        judf = sc._jvm.UserDefinedPythonFunction(fname, bytearray(pickled_command), env, includes,
+                                                 sc.pythonExec, sc.pythonVer, broadcast_vars,
                                                  sc._javaAccumulator, jdt)
         return judf
 
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 09de4d159fdcf..5e023f6c53517 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -1543,13 +1543,13 @@ def count():
     def test_with_different_versions_of_python(self):
         rdd = self.sc.parallelize(range(10))
         rdd.count()
-        version = sys.version_info
-        sys.version_info = (2, 0, 0)
+        version = self.sc.pythonVer
+        self.sc.pythonVer = "2.0"
         try:
             with QuietTest(self.sc):
                 self.assertRaises(Py4JJavaError, lambda: rdd.count())
         finally:
-            sys.version_info = version
+            self.sc.pythonVer = version
 
 
 class SparkSubmitTests(unittest.TestCase):
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index fbdaf3a5814cd..93df9002be377 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -57,6 +57,12 @@ def main(infile, outfile):
         if split_index == -1:  # for unit tests
             exit(-1)
 
+        version = utf8_deserializer.loads(infile)
+        if version != "%d.%d" % sys.version_info[:2]:
+            raise Exception(("Python in worker has different version %s than that in " +
+                             "driver %s, PySpark cannot run with different minor versions") %
+                            ("%d.%d" % sys.version_info[:2], version))
+
         # initialize global state
         shuffle.MemoryBytesSpilled = 0
         shuffle.DiskBytesSpilled = 0
@@ -92,11 +98,7 @@ def main(infile, outfile):
         command = pickleSer._read_with_length(infile)
         if isinstance(command, Broadcast):
             command = pickleSer.loads(command.value)
-        (func, profiler, deserializer, serializer), version = command
-        if version != sys.version_info[:2]:
-            raise Exception(("Python in worker has different version %s than that in " +
-                            "driver %s, PySpark cannot run with different minor versions") %
-                            (sys.version_info[:2], version))
+        func, profiler, deserializer, serializer = command
         init_time = time.time()
 
         def process():
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
index dc3389c41bbfa..3cc5c2441d8a5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
@@ -46,6 +46,7 @@ class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging {
       envVars: JMap[String, String],
       pythonIncludes: JList[String],
       pythonExec: String,
+      pythonVer: String,
       broadcastVars: JList[Broadcast[PythonBroadcast]],
       accumulator: Accumulator[JList[Array[Byte]]],
       stringDataType: String): Unit = {
@@ -70,6 +71,7 @@ class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging {
         envVars,
         pythonIncludes,
         pythonExec,
+        pythonVer,
         broadcastVars,
         accumulator,
         dataType,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala
index 505ab1301ec96..a02e202d2eebc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala
@@ -58,14 +58,15 @@ private[sql] case class UserDefinedPythonFunction(
     envVars: JMap[String, String],
     pythonIncludes: JList[String],
     pythonExec: String,
+    pythonVer: String,
     broadcastVars: JList[Broadcast[PythonBroadcast]],
     accumulator: Accumulator[JList[Array[Byte]]],
     dataType: DataType) {
 
   /** Returns a [[Column]] that will evaluate to calling this UDF with the given input. */
   def apply(exprs: Column*): Column = {
-    val udf = PythonUDF(name, command, envVars, pythonIncludes, pythonExec, broadcastVars,
-      accumulator, dataType, exprs.map(_.expr))
+    val udf = PythonUDF(name, command, envVars, pythonIncludes, pythonExec, pythonVer,
+      broadcastVars, accumulator, dataType, exprs.map(_.expr))
     Column(udf)
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
index 65dd7ba020fa3..11b2897f76786 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
@@ -46,6 +46,7 @@ private[spark] case class PythonUDF(
     envVars: JMap[String, String],
     pythonIncludes: JList[String],
     pythonExec: String,
+    pythonVer: String,
     broadcastVars: JList[Broadcast[PythonBroadcast]],
     accumulator: Accumulator[JList[Array[Byte]]],
     dataType: DataType,
@@ -251,6 +252,7 @@ case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child:
       udf.pythonIncludes,
       false,
       udf.pythonExec,
+      udf.pythonVer,
       udf.broadcastVars,
       udf.accumulator
     ).mapPartitions { iter =>

From 0b6f503d5337a8387c37cc2c8e544f67c68f7dad Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Mon, 18 May 2015 13:34:43 -0700
Subject: [PATCH 054/525] [SPARK-7658] [STREAMING] [WEBUI] Update the mouse
 behaviors for the timeline graphs

1. If the user click one point of a batch, scroll down to the corresponding batch row and highlight it. And recovery the batch row after 3 seconds if necessary.

2. Add "#batches" in the histogram graphs.

![screen shot 2015-05-14 at 7 36 19 pm](https://cloud.githubusercontent.com/assets/1000778/7646108/84f4a014-fa73-11e4-8c13-1903d267e60f.png)

![screen shot 2015-05-14 at 7 36 53 pm](https://cloud.githubusercontent.com/assets/1000778/7646109/8b11154a-fa73-11e4-820b-8ece9fa6ee3e.png)

![screen shot 2015-05-14 at 7 36 34 pm](https://cloud.githubusercontent.com/assets/1000778/7646111/93828272-fa73-11e4-89f8-580670144d3c.png)

Author: zsxwing <zsxwing@gmail.com>

Closes #6168 from zsxwing/SPARK-7658 and squashes the following commits:

c242b00 [zsxwing] Change 5 seconds to 3 seconds
31fd0aa [zsxwing] Remove the mouseover highlight feature
06c6f6f [zsxwing] Merge branch 'master' into SPARK-7658
2eaff06 [zsxwing] Merge branch 'master' into SPARK-7658
108d56c [zsxwing] Update the mouse behaviors for the timeline graphs
---
 .../streaming/ui/static/streaming-page.css    |  4 ++
 .../streaming/ui/static/streaming-page.js     | 42 ++++++++++++++++++-
 .../spark/streaming/ui/AllBatchesTable.scala  |  3 +-
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css
index 19abe889ad3c1..b22c884bfebdb 100644
--- a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css
+++ b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css
@@ -60,3 +60,7 @@
 span.expand-input-rate {
   cursor: pointer;
 }
+
+tr.batch-table-cell-highlight > td {
+  background-color: #D6FFE4 !important;
+}
diff --git a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js
index 0ee6752b29e9a..75251f493ad22 100644
--- a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js
+++ b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js
@@ -146,6 +146,12 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) {
         .attr("class", "line")
         .attr("d", line);
 
+    // If the user click one point in the graphs, jump to the batch row and highlight it. And
+    // recovery the batch row after 3 seconds if necessary.
+    // We need to remember the last clicked batch so that we can recovery it.
+    var lastClickedBatch = null;
+    var lastTimeout = null;
+
     // Add points to the line. However, we make it invisible at first. But when the user moves mouse
     // over a point, it will be displayed with its detail.
     svg.selectAll(".point")
@@ -154,6 +160,7 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) {
             .attr("stroke", "white") // white and opacity = 0 make it invisible
             .attr("fill", "white")
             .attr("opacity", "0")
+            .style("cursor", "pointer")
             .attr("cx", function(d) { return x(d.x); })
             .attr("cy", function(d) { return y(d.y); })
             .attr("r", function(d) { return 3; })
@@ -175,7 +182,29 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) {
                     .attr("opacity", "0");
             })
             .on("click", function(d) {
-                window.location.href = "batch/?id=" + d.x;
+                if (lastTimeout != null) {
+                    window.clearTimeout(lastTimeout);
+                }
+                if (lastClickedBatch != null) {
+                    clearBatchRow(lastClickedBatch);
+                    lastClickedBatch = null;
+                }
+                lastClickedBatch = d.x;
+                highlightBatchRow(lastClickedBatch)
+                lastTimeout = window.setTimeout(function () {
+                    lastTimeout = null;
+                    if (lastClickedBatch != null) {
+                        clearBatchRow(lastClickedBatch);
+                        lastClickedBatch = null;
+                    }
+                }, 3000); // Clean up after 3 seconds
+
+                var batchSelector = $("#batch-" + d.x);
+                var topOffset = batchSelector.offset().top - 15;
+                if (topOffset < 0) {
+                    topOffset = 0;
+                }
+                $('html,body').animate({scrollTop: topOffset}, 200);
             });
 }
 
@@ -218,6 +247,9 @@ function drawHistogram(id, values, minY, maxY, unitY, batchInterval) {
     svg.append("g")
         .attr("class", "x axis")
         .call(xAxis)
+        .append("text")
+            .attr("transform", "translate(" + (margin.left + width - 40) + ", 15)")
+            .text("#batches");
 
     svg.append("g")
         .attr("class", "y axis")
@@ -279,3 +311,11 @@ $(function() {
         $(this).find('.expand-input-rate-arrow').toggleClass('arrow-open').toggleClass('arrow-closed');
     }
 });
+
+function highlightBatchRow(batch) {
+    $("#batch-" + batch).parent().addClass("batch-table-cell-highlight");
+}
+
+function clearBatchRow(batch) {
+    $("#batch-" + batch).parent().removeClass("batch-table-cell-highlight");
+}
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala
index 00cc47d6a3ca5..f702bd5bc9466 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala
@@ -44,8 +44,9 @@ private[ui] abstract class BatchTableBase(tableId: String, batchInterval: Long)
     val formattedSchedulingDelay = schedulingDelay.map(SparkUIUtils.formatDuration).getOrElse("-")
     val processingTime = batch.processingDelay
     val formattedProcessingTime = processingTime.map(SparkUIUtils.formatDuration).getOrElse("-")
+    val batchTimeId = s"batch-$batchTime"
 
-    <td sorttable_customkey={batchTime.toString}>
+    <td id={batchTimeId} sorttable_customkey={batchTime.toString}>
       <a href={s"batch?id=$batchTime"}>
         {formattedBatchTime}
       </a>

From fcf90b75ccf222bd2f1939addc3f8f052d2bd3ff Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Mon, 18 May 2015 14:04:04 -0700
Subject: [PATCH 055/525] [HOTFIX] Fix ORC build break

Fix break caused by merging #6225 and #6194.

Author: Michael Armbrust <michael@databricks.com>

Closes #6244 from marmbrus/fixOrcBuildBreak and squashes the following commits:

b10e47b [Michael Armbrust] [HOTFIX] Fix ORC Build break
---
 .../org/apache/spark/sql/hive/orc/OrcRelation.scala   | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 9708199f07349..e10d3a0b6846c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.hive.orc
 import java.util.{Objects, Properties}
 
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde, OrcSplit}
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
@@ -171,9 +171,10 @@ private[sql] case class OrcRelation(
       maybePartitionSpec)
   }
 
-  override def buildScan(requiredColumns: Array[String],
+  override def buildScan(
+      requiredColumns: Array[String],
       filters: Array[Filter],
-      inputPaths: Array[String]): RDD[Row] = {
+      inputPaths: Array[FileStatus]): RDD[Row] = {
     val output = StructType(requiredColumns.map(dataSchema(_))).toAttributes
     OrcTableScan(output, this, filters, inputPaths).execute()
   }
@@ -194,7 +195,7 @@ private[orc] case class OrcTableScan(
     attributes: Seq[Attribute],
     @transient relation: OrcRelation,
     filters: Array[Filter],
-    inputPaths: Array[String])
+    inputPaths: Array[FileStatus])
   extends Logging
   with HiveInspectors {
 
@@ -256,7 +257,7 @@ private[orc] case class OrcTableScan(
     addColumnIds(attributes, relation, conf)
 
     if (inputPaths.nonEmpty) {
-      FileInputFormat.setInputPaths(job, inputPaths.map(new Path(_)): _*)
+      FileInputFormat.setInputPaths(job, inputPaths.map(_.getPath): _*)
     }
 
     val inputFormatClass =

From b93c97d79b42a06b48d2a8d98beccc636442541e Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Mon, 18 May 2015 14:33:33 -0700
Subject: [PATCH 056/525] [SPARK-7501] [STREAMING] DAG visualization: show
 DStream operations

This is similar to #5999, but for streaming. Roughly 200 lines are tests.

One thing to note here is that we already do some kind of scoping thing for call sites, so this patch adds the new RDD operation scoping logic in the same place. Also, this patch adds a `try finally` block to set the relevant variables in a safer way.

tdas zsxwing

------------------------
**Before**
<img src="https://cloud.githubusercontent.com/assets/2133137/7625996/d88211b8-f9b4-11e4-90b9-e11baa52d6d7.png" width="450px"/>

--------------------------
**After**
<img src="https://cloud.githubusercontent.com/assets/2133137/7625997/e0878f8c-f9b4-11e4-8df3-7dd611b13c87.png" width="650px"/>

Author: Andrew Or <andrew@databricks.com>

Closes #6034 from andrewor14/dag-viz-streaming and squashes the following commits:

932a64a [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-streaming
e685df9 [Andrew Or] Rename createRDDWith
84d0656 [Andrew Or] Review feedback
697c086 [Andrew Or] Fix tests
53b9936 [Andrew Or] Set scopes for foreachRDD properly
1881802 [Andrew Or] Refactor DStream scope names again
af4ba8d [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-streaming
fd07d22 [Andrew Or] Make MQTT lower case
f6de871 [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-streaming
0ca1801 [Andrew Or] Remove a few unnecessary withScopes on aliases
fa4e5fb [Andrew Or] Pass in input stream name rather than defining it from within
1af0b0e [Andrew Or] Fix style
074c00b [Andrew Or] Review comments
d25a324 [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-streaming
e4a93ac [Andrew Or] Fix tests?
25416dc [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-streaming
9113183 [Andrew Or] Add tests for DStream scopes
b3806ab [Andrew Or] Fix test
bb80bbb [Andrew Or] Fix MIMA?
5c30360 [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-streaming
5703939 [Andrew Or] Rename operations that create InputDStreams
7c4513d [Andrew Or] Group RDDs by DStream operations and batches
bf0ab6e [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-streaming
05c2676 [Andrew Or] Wrap many more methods in withScope
c121047 [Andrew Or] Merge branch 'master' of github.com:apache/spark into dag-viz-streaming
65ef3e9 [Andrew Or] Fix NPE
a0d3263 [Andrew Or] Scope streaming operations instead of RDD operations
---
 .../apache/spark/ui/static/dagre-d3.min.js    |   2 +-
 .../scala/org/apache/spark/SparkContext.scala |   2 +-
 .../apache/spark/rdd/RDDOperationScope.scala  |  24 ++-
 .../spark/ui/scope/RDDOperationGraph.scala    |   6 +-
 .../spark/rdd/RDDOperationScopeSuite.scala    |  12 +-
 .../kafka/DirectKafkaInputDStream.scala       |   3 +
 .../spark/streaming/kafka/KafkaUtils.scala    |  17 +-
 .../streaming/mqtt/MQTTInputDStream.scala     |   3 +-
 .../spark/streaming/StreamingContext.scala    |  48 +++--
 .../spark/streaming/dstream/DStream.scala     | 177 +++++++++++-----
 .../streaming/dstream/ForEachDStream.scala    |   2 +-
 .../streaming/dstream/InputDStream.scala      |  32 ++-
 .../dstream/PairDStreamFunctions.scala        | 111 +++++-----
 .../spark/streaming/DStreamScopeSuite.scala   | 190 ++++++++++++++++++
 14 files changed, 484 insertions(+), 145 deletions(-)
 create mode 100644 streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala

diff --git a/core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js b/core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js
index c55f752620dfd..2d9262b972a59 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js
@@ -20,7 +20,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-module.exports={graphlib:require("./lib/graphlib"),dagre:require("./lib/dagre"),intersect:require("./lib/intersect"),render:require("./lib/render"),util:require("./lib/util"),version:require("./lib/version")}},{"./lib/dagre":8,"./lib/graphlib":9,"./lib/intersect":10,"./lib/render":23,"./lib/util":25,"./lib/version":26}],2:[function(require,module,exports){var util=require("./util");module.exports={"default":normal,normal:normal,vee:vee,undirected:undirected};function normal(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 0 L 10 5 L 0 10 z").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}function vee(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 0 L 10 5 L 0 10 L 4 5 z").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}function undirected(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 5 L 10 5").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}},{"./util":25}],3:[function(require,module,exports){var _=require("./lodash"),addLabel=require("./label/add-label"),util=require("./util");module.exports=createClusters;function createClusters(selection,g){var clusters=g.nodes().filter(function(v){return util.isSubgraph(g,v)}),svgClusters=selection.selectAll("g.cluster").data(clusters,function(v){return v});var makeClusterIdentifier=function(v){return"cluster_"+v.replace(/^cluster/,"")};svgClusters.enter().append("g").attr("class",makeClusterIdentifier).attr("name",function(v){return g.node(v).label}).classed("cluster",true).style("opacity",0).append("rect");var sortedClusters=util.orderByRank(g,svgClusters.data());for(var i=0;i<sortedClusters.length;i++){var v=sortedClusters[i];var node=g.node(v);if(node.label){var thisGroup=selection.select("g.cluster."+makeClusterIdentifier(v));labelGroup=thisGroup.append("g").attr("class","label"),labelDom=addLabel(labelGroup,node),bbox=_.pick(labelDom.node().getBBox(),"width","height");node.paddingTop+=bbox.height;node.paddingTop+=util.getMaxChildPaddingTop(g,v)}}util.applyTransition(svgClusters.exit(),g).style("opacity",0).remove();util.applyTransition(svgClusters,g).style("opacity",1);util.applyTransition(svgClusters.selectAll("rect"),g).attr("width",function(v){var node=g.node(v);return node.width+node.paddingLeft+node.paddingRight}).attr("height",function(v){var node=g.node(v);return node.height+node.paddingTop+node.paddingBottom}).attr("x",function(v){var node=g.node(v);return node.x-node.width/2-node.paddingLeft}).attr("y",function(v){var node=g.node(v);return node.y-node.height/2-node.paddingTop});svgClusters.each(function(){var cluster=d3.select(this),label=cluster.select("g.label"),rect=cluster.select("rect"),bbox=label.node().getBBox(),labelW=bbox.width,labelH=bbox.height;var num=function(x){return parseFloat(x.toString().replace(/px$/,""))};var labelX=num(rect.attr("x"))+num(rect.attr("width"))-labelH/2-labelW/2;var labelY=num(rect.attr("y"))+labelH;label.attr("transform","translate("+labelX+","+labelY+")")})}},{"./label/add-label":18,"./lodash":20,"./util":25}],4:[function(require,module,exports){"use strict";var _=require("./lodash"),addLabel=require("./label/add-label"),util=require("./util"),d3=require("./d3");module.exports=createEdgeLabels;function createEdgeLabels(selection,g){var svgEdgeLabels=selection.selectAll("g.edgeLabel").data(g.edges(),function(e){return util.edgeToId(e)}).classed("update",true);svgEdgeLabels.selectAll("*").remove();svgEdgeLabels.enter().append("g").classed("edgeLabel",true).style("opacity",0);svgEdgeLabels.each(function(e){var edge=g.edge(e),label=addLabel(d3.select(this),g.edge(e),0,0).classed("label",true),bbox=label.node().getBBox();if(edge.labelId){label.attr("id",edge.labelId)}if(!_.has(edge,"width")){edge.width=bbox.width}if(!_.has(edge,"height")){edge.height=bbox.height}});util.applyTransition(svgEdgeLabels.exit(),g).style("opacity",0).remove();return svgEdgeLabels}},{"./d3":7,"./label/add-label":18,"./lodash":20,"./util":25}],5:[function(require,module,exports){"use strict";var _=require("./lodash"),intersectNode=require("./intersect/intersect-node"),util=require("./util"),d3=require("./d3");module.exports=createEdgePaths;function createEdgePaths(selection,g,arrows){var svgPaths=selection.selectAll("g.edgePath").data(g.edges(),function(e){return util.edgeToId(e)}).classed("update",true);enter(svgPaths,g);exit(svgPaths,g);util.applyTransition(svgPaths,g).style("opacity",1);svgPaths.each(function(e){var domEdge=d3.select(this);var edge=g.edge(e);edge.elem=this;if(edge.id){domEdge.attr("id",edge.id)}util.applyClass(domEdge,edge["class"],(domEdge.classed("update")?"update ":"")+"edgePath")});svgPaths.selectAll("path.path").each(function(e){var edge=g.edge(e);edge.arrowheadId=_.uniqueId("arrowhead");var domEdge=d3.select(this).attr("marker-end",function(){return"url(#"+edge.arrowheadId+")"}).style("fill","none");util.applyTransition(domEdge,g).attr("d",function(e){return calcPoints(g,e)});util.applyStyle(domEdge,edge.style)});svgPaths.selectAll("defs *").remove();svgPaths.selectAll("defs").each(function(e){var edge=g.edge(e),arrowhead=arrows[edge.arrowhead];arrowhead(d3.select(this),edge.arrowheadId,edge,"arrowhead")});return svgPaths}function calcPoints(g,e){var edge=g.edge(e),tail=g.node(e.v),head=g.node(e.w),points=edge.points.slice(1,edge.points.length-1);points.unshift(intersectNode(tail,points[0]));points.push(intersectNode(head,points[points.length-1]));return createLine(edge,points)}function createLine(edge,points){var line=d3.svg.line().x(function(d){return d.x}).y(function(d){return d.y});if(_.has(edge,"lineInterpolate")){line.interpolate(edge.lineInterpolate)}if(_.has(edge,"lineTension")){line.tension(Number(edge.lineTension))}return line(points)}function getCoords(elem){var bbox=elem.getBBox(),matrix=elem.getTransformToElement(elem.ownerSVGElement).translate(bbox.width/2,bbox.height/2);return{x:matrix.e,y:matrix.f}}function enter(svgPaths,g){var svgPathsEnter=svgPaths.enter().append("g").attr("class","edgePath").style("opacity",0);svgPathsEnter.append("path").attr("class","path").attr("d",function(e){var edge=g.edge(e),sourceElem=g.node(e.v).elem,points=_.range(edge.points.length).map(function(){return getCoords(sourceElem)});return createLine(edge,points)});svgPathsEnter.append("defs")}function exit(svgPaths,g){var svgPathExit=svgPaths.exit();util.applyTransition(svgPathExit,g).style("opacity",0).remove();util.applyTransition(svgPathExit.select("path.path"),g).attr("d",function(e){var source=g.node(e.v);if(source){var points=_.range(this.pathSegList.length).map(function(){return source});return createLine({},points)}else{return d3.select(this).attr("d")}})}},{"./d3":7,"./intersect/intersect-node":14,"./lodash":20,"./util":25}],6:[function(require,module,exports){"use strict";var _=require("./lodash"),addLabel=require("./label/add-label"),util=require("./util"),d3=require("./d3");module.exports=createNodes;function createNodes(selection,g,shapes){var simpleNodes=g.nodes().filter(function(v){return!util.isSubgraph(g,v)});var svgNodes=selection.selectAll("g.node").data(simpleNodes,function(v){return v}).classed("update",true);svgNodes.selectAll("*").remove();svgNodes.enter().append("g").attr("class",function(v){return"node_"+v}).attr("name",function(v){return g.node(v).label}).classed("node",true).style("opacity",0);svgNodes.each(function(v){var node=g.node(v),thisGroup=d3.select(this),labelGroup=thisGroup.append("g").attr("class","label"),labelDom=addLabel(labelGroup,node),shape=shapes[node.shape],bbox=_.pick(labelDom.node().getBBox(),"width","height");node.elem=this;if(node.id){thisGroup.attr("id",node.id)}if(node.labelId){labelGroup.attr("id",node.labelId)}util.applyClass(thisGroup,node["class"],(thisGroup.classed("update")?"update ":"")+"node");if(_.has(node,"width")){bbox.width=node.width}if(_.has(node,"height")){bbox.height=node.height}bbox.width+=node.paddingLeft+node.paddingRight;bbox.height+=node.paddingTop+node.paddingBottom;labelGroup.attr("transform","translate("+(node.paddingLeft-node.paddingRight)/2+","+(node.paddingTop-node.paddingBottom)/2+")");var shapeSvg=shape(d3.select(this),bbox,node);util.applyStyle(shapeSvg,node.style);var requiredWidth=0,requiredHeight=0;var nextNode=g.node(g.parent(v));while(nextNode){var tempGroup=thisGroup.append("g");var tempLabel=addLabel(tempGroup,nextNode);var tempBBox=tempLabel.node().getBBox();tempBBox.width-=50;requiredWidth=Math.max(requiredWidth,tempBBox.width);requiredHeight=Math.max(requiredHeight,tempBBox.height);tempLabel.remove();nextNode=g.node(g.parent(nextNode.label))}var shapeBBox=shapeSvg.node().getBBox();shapeBBox.width=Math.max(shapeBBox.width,requiredWidth);shapeBBox.height=Math.max(shapeBBox.height,requiredHeight);node.width=shapeBBox.width;node.height=shapeBBox.height});util.applyTransition(svgNodes.exit(),g).style("opacity",0).remove();return svgNodes}},{"./d3":7,"./label/add-label":18,"./lodash":20,"./util":25}],7:[function(require,module,exports){module.exports=window.d3},{}],8:[function(require,module,exports){var dagre;if(require){try{dagre=require("dagre")}catch(e){}}if(!dagre){dagre=window.dagre}module.exports=dagre},{dagre:27}],9:[function(require,module,exports){var graphlib;if(require){try{graphlib=require("graphlib")}catch(e){}}if(!graphlib){graphlib=window.graphlib}module.exports=graphlib},{graphlib:57}],10:[function(require,module,exports){module.exports={node:require("./intersect-node"),circle:require("./intersect-circle"),ellipse:require("./intersect-ellipse"),polygon:require("./intersect-polygon"),rect:require("./intersect-rect")}},{"./intersect-circle":11,"./intersect-ellipse":12,"./intersect-node":14,"./intersect-polygon":15,"./intersect-rect":16}],11:[function(require,module,exports){var intersectEllipse=require("./intersect-ellipse");module.exports=intersectCircle;function intersectCircle(node,rx,point){return intersectEllipse(node,rx,rx,point)}},{"./intersect-ellipse":12}],12:[function(require,module,exports){module.exports=intersectEllipse;function intersectEllipse(node,rx,ry,point){var cx=node.x;var cy=node.y;var px=cx-point.x;var py=cy-point.y;var det=Math.sqrt(rx*rx*py*py+ry*ry*px*px);var dx=Math.abs(rx*ry*px/det);if(point.x<cx){dx=-dx}var dy=Math.abs(rx*ry*py/det);if(point.y<cy){dy=-dy}return{x:cx+dx,y:cy+dy}}},{}],13:[function(require,module,exports){module.exports=intersectLine;function intersectLine(p1,p2,q1,q2){var a1,a2,b1,b2,c1,c2;var r1,r2,r3,r4;var denom,offset,num;var x,y;a1=p2.y-p1.y;b1=p1.x-p2.x;c1=p2.x*p1.y-p1.x*p2.y;r3=a1*q1.x+b1*q1.y+c1;r4=a1*q2.x+b1*q2.y+c1;if(r3!==0&&r4!==0&&sameSign(r3,r4)){return}a2=q2.y-q1.y;b2=q1.x-q2.x;c2=q2.x*q1.y-q1.x*q2.y;r1=a2*p1.x+b2*p1.yy+c2;r2=a2*p2.x+b2*p2.y+c2;if(r1!==0&&r2!==0&&sameSign(r1,r2)){return}denom=a1*b2-a2*b1;if(denom===0){return}offset=Math.abs(denom/2);num=b1*c2-b2*c1;x=num<0?(num-offset)/denom:(num+offset)/denom;num=a2*c1-a1*c2;y=num<0?(num-offset)/denom:(num+offset)/denom;return{x:x,y:y}}function sameSign(r1,r2){return r1*r2>0}},{}],14:[function(require,module,exports){module.exports=intersectNode;function intersectNode(node,point){return node.intersect(point)}},{}],15:[function(require,module,exports){var intersectLine=require("./intersect-line");module.exports=intersectPolygon;function intersectPolygon(node,polyPoints,point){var x1=node.x;var y1=node.y;var intersections=[];var minX=Number.POSITIVE_INFINITY,minY=Number.POSITIVE_INFINITY;polyPoints.forEach(function(entry){minX=Math.min(minX,entry.x);minY=Math.min(minY,entry.y)});var left=x1-node.width/2-minX;var top=y1-node.height/2-minY;for(var i=0;i<polyPoints.length;i++){var p1=polyPoints[i];var p2=polyPoints[i<polyPoints.length-1?i+1:0];var intersect=intersectLine(node,point,{x:left+p1.x,y:top+p1.y},{x:left+p2.x,y:top+p2.y});if(intersect){intersections.push(intersect)}}if(!intersections.length){console.log("NO INTERSECTION FOUND, RETURN NODE CENTER",node);return node}if(intersections.length>1){intersections.sort(function(p,q){var pdx=p.x-point.x,pdy=p.y-point.y,distp=Math.sqrt(pdx*pdx+pdy*pdy),qdx=q.x-point.x,qdy=q.y-point.y,distq=Math.sqrt(qdx*qdx+qdy*qdy);return distp<distq?-1:distp===distq?0:1})}return intersections[0]}},{"./intersect-line":13}],16:[function(require,module,exports){module.exports=intersectRect;function intersectRect(node,point){var x=node.x;var y=node.y;var dx=point.x-x;var dy=point.y-y;var w=node.width/2;var h=node.height/2;var sx,sy;if(Math.abs(dy)*w>Math.abs(dx)*h){if(dy<0){h=-h}sx=dy===0?0:h*dx/dy;sy=h}else{if(dx<0){w=-w}sx=w;sy=dx===0?0:w*dy/dx}return{x:x+sx,y:y+sy}}},{}],17:[function(require,module,exports){var util=require("../util");module.exports=addHtmlLabel;function addHtmlLabel(root,node){var fo=root.append("foreignObject").attr("width","100000");var div=fo.append("xhtml:div");var label=node.label;switch(typeof label){case"function":div.insert(label);break;case"object":div.insert(function(){return label});break;default:div.html(label)}util.applyStyle(div,node.labelStyle);div.style("display","inline-block");div.style("white-space","nowrap");var w,h;div.each(function(){w=this.clientWidth;h=this.clientHeight});fo.attr("width",w).attr("height",h);return fo}},{"../util":25}],18:[function(require,module,exports){var addTextLabel=require("./add-text-label"),addHtmlLabel=require("./add-html-label");module.exports=addLabel;function addLabel(root,node){var label=node.label;var labelSvg=root.append("g");if(typeof label!=="string"||node.labelType==="html"){addHtmlLabel(labelSvg,node)}else{addTextLabel(labelSvg,node)}var labelBBox=labelSvg.node().getBBox();labelSvg.attr("transform","translate("+-labelBBox.width/2+","+-labelBBox.height/2+")");return labelSvg}},{"./add-html-label":17,"./add-text-label":19}],19:[function(require,module,exports){var util=require("../util");module.exports=addTextLabel;function addTextLabel(root,node){var domNode=root.append("text");var lines=processEscapeSequences(node.label).split("\n");for(var i=0;i<lines.length;i++){domNode.append("tspan").attr("xml:space","preserve").attr("dy","1em").attr("x","1").text(lines[i])}util.applyStyle(domNode,node.labelStyle);return domNode}function processEscapeSequences(text){var newText="",escaped=false,ch;for(var i=0;i<text.length;++i){ch=text[i];if(escaped){switch(ch){case"n":newText+="\n";break;default:newText+=ch}escaped=false}else if(ch==="\\"){escaped=true}else{newText+=ch}}return newText}},{"../util":25}],20:[function(require,module,exports){var lodash;if(require){try{lodash=require("lodash")}catch(e){}}if(!lodash){lodash=window._}module.exports=lodash},{lodash:77}],21:[function(require,module,exports){"use strict";var util=require("./util"),d3=require("./d3"),_=require("./lodash");module.exports=positionEdgeLabels;function positionEdgeLabels(selection,g){var created=selection.filter(function(){return!d3.select(this).classed("update")});function translate(e){var edge=g.edge(e);return _.has(edge,"x")?"translate("+edge.x+","+edge.y+")":""}created.attr("transform",translate);util.applyTransition(selection,g).style("opacity",1).attr("transform",translate)}},{"./d3":7,"./lodash":20,"./util":25}],22:[function(require,module,exports){"use strict";var util=require("./util"),d3=require("./d3");module.exports=positionNodes;function positionNodes(selection,g){var created=selection.filter(function(){return!d3.select(this).classed("update")});function translate(v){var node=g.node(v);return"translate("+node.x+","+node.y+")"}created.attr("transform",translate);util.applyTransition(selection,g).style("opacity",1).attr("transform",translate)}},{"./d3":7,"./util":25}],23:[function(require,module,exports){var _=require("./lodash"),layout=require("./dagre").layout;module.exports=render;function render(){var createNodes=require("./create-nodes"),createClusters=require("./create-clusters"),createEdgeLabels=require("./create-edge-labels"),createEdgePaths=require("./create-edge-paths"),positionNodes=require("./position-nodes"),positionEdgeLabels=require("./position-edge-labels"),shapes=require("./shapes"),arrows=require("./arrows");var fn=function(svg,g){preProcessGraph(g);var outputGroup=createOrSelectGroup(svg,"output"),clustersGroup=createOrSelectGroup(outputGroup,"clusters"),edgePathsGroup=createOrSelectGroup(outputGroup,"edgePaths"),edgeLabels=createEdgeLabels(createOrSelectGroup(outputGroup,"edgeLabels"),g),nodes=createNodes(createOrSelectGroup(outputGroup,"nodes"),g,shapes);layout(g);positionNodes(nodes,g);positionEdgeLabels(edgeLabels,g);createEdgePaths(edgePathsGroup,g,arrows);createClusters(clustersGroup,g);postProcessGraph(g)};fn.createNodes=function(value){if(!arguments.length)return createNodes;createNodes=value;return fn};fn.createClusters=function(value){if(!arguments.length)return createClusters;createClusters=value;return fn};fn.createEdgeLabels=function(value){if(!arguments.length)return createEdgeLabels;createEdgeLabels=value;return fn};fn.createEdgePaths=function(value){if(!arguments.length)return createEdgePaths;createEdgePaths=value;return fn};fn.shapes=function(value){if(!arguments.length)return shapes;shapes=value;return fn};fn.arrows=function(value){if(!arguments.length)return arrows;arrows=value;return fn};return fn}var NODE_DEFAULT_ATTRS={paddingLeft:0,paddingRight:0,paddingTop:0,paddingBottom:0,rx:0,ry:0,shape:"rect"};var EDGE_DEFAULT_ATTRS={arrowhead:"normal",lineInterpolate:"linear"};function preProcessGraph(g){g.nodes().forEach(function(v){var node=g.node(v);if(!_.has(node,"label")){node.label=v}if(_.has(node,"paddingX")){_.defaults(node,{paddingLeft:node.paddingX,paddingRight:node.paddingX})}if(_.has(node,"paddingY")){_.defaults(node,{paddingTop:node.paddingY,paddingBottom:node.paddingY})}if(_.has(node,"padding")){_.defaults(node,{paddingLeft:node.padding,paddingRight:node.padding,paddingTop:node.padding,paddingBottom:node.padding})}if(_.has(node,"paddingLeft")){_.defaults(node,{paddingLeft:node.paddingLeft})}if(_.has(node,"paddingRight")){_.defaults(node,{paddingRight:node.paddingRight})}if(_.has(node,"paddingTop")){_.defaults(node,{paddingTop:node.paddingTop})}if(_.has(node,"paddingBottom")){_.defaults(node,{paddingBottom:node.paddingBottom})}_.defaults(node,NODE_DEFAULT_ATTRS);_.each(["paddingLeft","paddingRight","paddingTop","paddingBottom"],function(k){node[k]=Number(node[k])});if(_.has(node,"width")){node._prevWidth=node.width}if(_.has(node,"height")){node._prevHeight=node.height}});g.edges().forEach(function(e){var edge=g.edge(e);if(!_.has(edge,"label")){edge.label=""}_.defaults(edge,EDGE_DEFAULT_ATTRS)})}function postProcessGraph(g){_.each(g.nodes(),function(v){var node=g.node(v);if(_.has(node,"_prevWidth")){node.width=node._prevWidth}else{delete node.width}if(_.has(node,"_prevHeight")){node.height=node._prevHeight}else{delete node.height}delete node._prevWidth;delete node._prevHeight})}function createOrSelectGroup(root,name){var selection=root.select("g."+name);if(selection.empty()){selection=root.append("g").attr("class",name)}return selection}},{"./arrows":2,"./create-clusters":3,"./create-edge-labels":4,"./create-edge-paths":5,"./create-nodes":6,"./dagre":8,"./lodash":20,"./position-edge-labels":21,"./position-nodes":22,"./shapes":24}],24:[function(require,module,exports){"use strict";var intersectRect=require("./intersect/intersect-rect"),intersectEllipse=require("./intersect/intersect-ellipse"),intersectCircle=require("./intersect/intersect-circle"),intersectPolygon=require("./intersect/intersect-polygon");module.exports={rect:rect,ellipse:ellipse,circle:circle,diamond:diamond};function rect(parent,bbox,node){var shapeSvg=parent.insert("rect",":first-child").attr("rx",node.rx).attr("ry",node.ry).attr("x",-bbox.width/2).attr("y",-bbox.height/2).attr("width",bbox.width).attr("height",bbox.height);node.intersect=function(point){return intersectRect(node,point)};return shapeSvg}function ellipse(parent,bbox,node){var rx=bbox.width/2,ry=bbox.height/2,shapeSvg=parent.insert("ellipse",":first-child").attr("x",-bbox.width/2).attr("y",-bbox.height/2).attr("rx",rx).attr("ry",ry);node.intersect=function(point){return intersectEllipse(node,rx,ry,point)};return shapeSvg}function circle(parent,bbox,node){var r=Math.max(bbox.width,bbox.height)/2,shapeSvg=parent.insert("circle",":first-child").attr("x",-bbox.width/2).attr("y",-bbox.height/2).attr("r",r);node.intersect=function(point){return intersectCircle(node,r,point)};return shapeSvg}function diamond(parent,bbox,node){var w=bbox.width*Math.SQRT2/2,h=bbox.height*Math.SQRT2/2,points=[{x:0,y:-h},{x:-w,y:0},{x:0,y:h},{x:w,y:0}],shapeSvg=parent.insert("polygon",":first-child").attr("points",points.map(function(p){return p.x+","+p.y}).join(" "));node.intersect=function(p){return intersectPolygon(node,points,p)};return shapeSvg}},{"./intersect/intersect-circle":11,"./intersect/intersect-ellipse":12,"./intersect/intersect-polygon":15,"./intersect/intersect-rect":16}],25:[function(require,module,exports){var _=require("./lodash");module.exports={isSubgraph:isSubgraph,getMaxChildPaddingTop:getMaxChildPaddingTop,orderByRank:orderByRank,edgeToId:edgeToId,applyStyle:applyStyle,applyClass:applyClass,applyTransition:applyTransition};function isSubgraph(g,v){return!!g.children(v).length}function getMaxChildPaddingTop(g,v){var maxPadding=0;var children=g.children(v);for(var i=0;i<children.length;i++){var child=g.node(children[i]);if(child.paddingTop&&child.paddingTop>maxPadding){maxPadding=child.paddingTop}}return maxPadding}function getRank(g,v){var maxRank=0;var children=g.children(v);for(var i=0;i<children.length;i++){var thisRank=getRank(g,children[i])+1;if(thisRank>maxRank){maxRank=thisRank}}return maxRank}function orderByRank(g,nodes){return nodes.sort(function(x,y){return getRank(g,x)-getRank(g,y)})}function edgeToId(e){return escapeId(e.v)+":"+escapeId(e.w)+":"+escapeId(e.name)}var ID_DELIM=/:/g;function escapeId(str){return str?String(str).replace(ID_DELIM,"\\:"):""}function applyStyle(dom,styleFn){if(styleFn){dom.attr("style",styleFn)}}function applyClass(dom,classFn,otherClasses){if(classFn){dom.attr("class",classFn).attr("class",otherClasses+" "+dom.attr("class"))}}function applyTransition(selection,g){var graph=g.graph();if(_.isPlainObject(graph)){var transition=graph.transition;if(_.isFunction(transition)){return transition(selection)}}return selection}},{"./lodash":20}],26:[function(require,module,exports){module.exports="0.4.4-pre"},{}],27:[function(require,module,exports){module.exports={graphlib:require("./lib/graphlib"),layout:require("./lib/layout"),debug:require("./lib/debug"),util:{time:require("./lib/util").time,notime:require("./lib/util").notime},version:require("./lib/version")}},{"./lib/debug":32,"./lib/graphlib":33,"./lib/layout":35,"./lib/util":55,"./lib/version":56}],28:[function(require,module,exports){"use strict";var _=require("./lodash"),greedyFAS=require("./greedy-fas");module.exports={run:run,undo:undo};function run(g){var fas=g.graph().acyclicer==="greedy"?greedyFAS(g,weightFn(g)):dfsFAS(g);_.each(fas,function(e){var label=g.edge(e);g.removeEdge(e);label.forwardName=e.name;label.reversed=true;g.setEdge(e.w,e.v,label,_.uniqueId("rev"))});function weightFn(g){return function(e){return g.edge(e).weight}}}function dfsFAS(g){var fas=[],stack={},visited={};function dfs(v){if(_.has(visited,v)){return}visited[v]=true;stack[v]=true;_.each(g.outEdges(v),function(e){if(_.has(stack,e.w)){fas.push(e)}else{dfs(e.w)}});delete stack[v]}_.each(g.nodes(),dfs);return fas}function undo(g){_.each(g.edges(),function(e){var label=g.edge(e);if(label.reversed){g.removeEdge(e);var forwardName=label.forwardName;delete label.reversed;delete label.forwardName;g.setEdge(e.w,e.v,label,forwardName)}})}},{"./greedy-fas":34,"./lodash":36}],29:[function(require,module,exports){var _=require("./lodash"),util=require("./util");module.exports=addBorderSegments;function addBorderSegments(g){function dfs(v){var children=g.children(v),node=g.node(v);if(children.length){_.each(children,dfs)}if(_.has(node,"minRank")){node.borderLeft=[];node.borderRight=[];for(var rank=node.minRank,maxRank=node.maxRank+1;rank<maxRank;++rank){addBorderNode(g,"borderLeft","_bl",v,node,rank);addBorderNode(g,"borderRight","_br",v,node,rank)}}}_.each(g.children(),dfs)}function addBorderNode(g,prop,prefix,sg,sgNode,rank){var label={width:0,height:0,rank:rank},prev=sgNode[prop][rank-1],curr=util.addDummyNode(g,"border",label,prefix);sgNode[prop][rank]=curr;g.setParent(curr,sg);if(prev){g.setEdge(prev,curr,{weight:1})}}},{"./lodash":36,"./util":55}],30:[function(require,module,exports){"use strict";var _=require("./lodash");module.exports={adjust:adjust,undo:undo};function adjust(g){var rankDir=g.graph().rankdir.toLowerCase();if(rankDir==="lr"||rankDir==="rl"){swapWidthHeight(g)}}function undo(g){var rankDir=g.graph().rankdir.toLowerCase();if(rankDir==="bt"||rankDir==="rl"){reverseY(g)}if(rankDir==="lr"||rankDir==="rl"){swapXY(g);swapWidthHeight(g)}}function swapWidthHeight(g){_.each(g.nodes(),function(v){swapWidthHeightOne(g.node(v))});_.each(g.edges(),function(e){swapWidthHeightOne(g.edge(e))})}function swapWidthHeightOne(attrs){var w=attrs.width;attrs.width=attrs.height;attrs.height=w}function reverseY(g){_.each(g.nodes(),function(v){reverseYOne(g.node(v))});_.each(g.edges(),function(e){var edge=g.edge(e);_.each(edge.points,reverseYOne);if(_.has(edge,"y")){reverseYOne(edge)}})}function reverseYOne(attrs){attrs.y=-attrs.y}function swapXY(g){_.each(g.nodes(),function(v){swapXYOne(g.node(v))});_.each(g.edges(),function(e){var edge=g.edge(e);_.each(edge.points,swapXYOne);if(_.has(edge,"x")){swapXYOne(edge)}})}function swapXYOne(attrs){var x=attrs.x;attrs.x=attrs.y;attrs.y=x}},{"./lodash":36}],31:[function(require,module,exports){module.exports=List;function List(){var sentinel={};sentinel._next=sentinel._prev=sentinel;this._sentinel=sentinel}List.prototype.dequeue=function(){var sentinel=this._sentinel,entry=sentinel._prev;if(entry!==sentinel){unlink(entry);return entry}};List.prototype.enqueue=function(entry){var sentinel=this._sentinel;if(entry._prev&&entry._next){unlink(entry)}entry._next=sentinel._next;sentinel._next._prev=entry;sentinel._next=entry;entry._prev=sentinel};List.prototype.toString=function(){var strs=[],sentinel=this._sentinel,curr=sentinel._prev;while(curr!==sentinel){strs.push(JSON.stringify(curr,filterOutLinks));curr=curr._prev}return"["+strs.join(", ")+"]"};function unlink(entry){entry._prev._next=entry._next;entry._next._prev=entry._prev;delete entry._next;delete entry._prev}function filterOutLinks(k,v){if(k!=="_next"&&k!=="_prev"){return v}}},{}],32:[function(require,module,exports){var _=require("./lodash"),util=require("./util"),Graph=require("./graphlib").Graph;module.exports={debugOrdering:debugOrdering};function debugOrdering(g){var layerMatrix=util.buildLayerMatrix(g);var h=new Graph({compound:true,multigraph:true}).setGraph({});_.each(g.nodes(),function(v){h.setNode(v,{label:v});h.setParent(v,"layer"+g.node(v).rank)});_.each(g.edges(),function(e){h.setEdge(e.v,e.w,{},e.name)});_.each(layerMatrix,function(layer,i){var layerV="layer"+i;h.setNode(layerV,{rank:"same"});_.reduce(layer,function(u,v){h.setEdge(u,v,{style:"invis"});return v})});return h}},{"./graphlib":33,"./lodash":36,"./util":55}],33:[function(require,module,exports){module.exports=require(9)},{"/Users/andrew/Documents/dev/dagre-d3/lib/graphlib.js":9,graphlib:57}],34:[function(require,module,exports){var _=require("./lodash"),Graph=require("./graphlib").Graph,List=require("./data/list");module.exports=greedyFAS;var DEFAULT_WEIGHT_FN=_.constant(1);function greedyFAS(g,weightFn){if(g.nodeCount()<=1){return[]}var state=buildState(g,weightFn||DEFAULT_WEIGHT_FN);var results=doGreedyFAS(state.graph,state.buckets,state.zeroIdx);return _.flatten(_.map(results,function(e){return g.outEdges(e.v,e.w)}),true)}function doGreedyFAS(g,buckets,zeroIdx){var results=[],sources=buckets[buckets.length-1],sinks=buckets[0];var entry;while(g.nodeCount()){while(entry=sinks.dequeue()){removeNode(g,buckets,zeroIdx,entry)}while(entry=sources.dequeue()){removeNode(g,buckets,zeroIdx,entry)}if(g.nodeCount()){for(var i=buckets.length-2;i>0;--i){entry=buckets[i].dequeue();if(entry){results=results.concat(removeNode(g,buckets,zeroIdx,entry,true));break}}}}return results}function removeNode(g,buckets,zeroIdx,entry,collectPredecessors){var results=collectPredecessors?[]:undefined;_.each(g.inEdges(entry.v),function(edge){var weight=g.edge(edge),uEntry=g.node(edge.v);if(collectPredecessors){results.push({v:edge.v,w:edge.w})}uEntry.out-=weight;assignBucket(buckets,zeroIdx,uEntry)});_.each(g.outEdges(entry.v),function(edge){var weight=g.edge(edge),w=edge.w,wEntry=g.node(w);wEntry["in"]-=weight;assignBucket(buckets,zeroIdx,wEntry)});g.removeNode(entry.v);return results}function buildState(g,weightFn){var fasGraph=new Graph,maxIn=0,maxOut=0;_.each(g.nodes(),function(v){fasGraph.setNode(v,{v:v,"in":0,out:0})});_.each(g.edges(),function(e){var prevWeight=fasGraph.edge(e.v,e.w)||0,weight=weightFn(e),edgeWeight=prevWeight+weight;fasGraph.setEdge(e.v,e.w,edgeWeight);maxOut=Math.max(maxOut,fasGraph.node(e.v).out+=weight);maxIn=Math.max(maxIn,fasGraph.node(e.w)["in"]+=weight)});var buckets=_.range(maxOut+maxIn+3).map(function(){return new List});var zeroIdx=maxIn+1;_.each(fasGraph.nodes(),function(v){assignBucket(buckets,zeroIdx,fasGraph.node(v))});return{graph:fasGraph,buckets:buckets,zeroIdx:zeroIdx}}function assignBucket(buckets,zeroIdx,entry){if(!entry.out){buckets[0].enqueue(entry)}else if(!entry["in"]){buckets[buckets.length-1].enqueue(entry)}else{buckets[entry.out-entry["in"]+zeroIdx].enqueue(entry)}}},{"./data/list":31,"./graphlib":33,"./lodash":36}],35:[function(require,module,exports){"use strict";var _=require("./lodash"),acyclic=require("./acyclic"),normalize=require("./normalize"),rank=require("./rank"),normalizeRanks=require("./util").normalizeRanks,parentDummyChains=require("./parent-dummy-chains"),removeEmptyRanks=require("./util").removeEmptyRanks,nestingGraph=require("./nesting-graph"),addBorderSegments=require("./add-border-segments"),coordinateSystem=require("./coordinate-system"),order=require("./order"),position=require("./position"),util=require("./util"),Graph=require("./graphlib").Graph;module.exports=layout;function layout(g,opts){var time=opts&&opts.debugTiming?util.time:util.notime;time("layout",function(){var layoutGraph=time("  buildLayoutGraph",function(){return buildLayoutGraph(g)});time("  runLayout",function(){runLayout(layoutGraph,time)});time("  updateInputGraph",function(){updateInputGraph(g,layoutGraph)})})}function runLayout(g,time){time("    makeSpaceForEdgeLabels",function(){makeSpaceForEdgeLabels(g)});time("    removeSelfEdges",function(){removeSelfEdges(g)});time("    acyclic",function(){acyclic.run(g)});time("    nestingGraph.run",function(){nestingGraph.run(g)});time("    rank",function(){rank(util.asNonCompoundGraph(g))});time("    injectEdgeLabelProxies",function(){injectEdgeLabelProxies(g)});time("    removeEmptyRanks",function(){removeEmptyRanks(g)});time("    nestingGraph.cleanup",function(){nestingGraph.cleanup(g)});time("    normalizeRanks",function(){normalizeRanks(g)});time("    assignRankMinMax",function(){assignRankMinMax(g)});time("    removeEdgeLabelProxies",function(){removeEdgeLabelProxies(g)});time("    normalize.run",function(){normalize.run(g)});time("    parentDummyChains",function(){
+module.exports={graphlib:require("./lib/graphlib"),dagre:require("./lib/dagre"),intersect:require("./lib/intersect"),render:require("./lib/render"),util:require("./lib/util"),version:require("./lib/version")}},{"./lib/dagre":8,"./lib/graphlib":9,"./lib/intersect":10,"./lib/render":23,"./lib/util":25,"./lib/version":26}],2:[function(require,module,exports){var util=require("./util");module.exports={"default":normal,normal:normal,vee:vee,undirected:undirected};function normal(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 0 L 10 5 L 0 10 z").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}function vee(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 0 L 10 5 L 0 10 L 4 5 z").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}function undirected(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 5 L 10 5").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}},{"./util":25}],3:[function(require,module,exports){var _=require("./lodash"),addLabel=require("./label/add-label"),util=require("./util");module.exports=createClusters;function createClusters(selection,g){var clusters=g.nodes().filter(function(v){return util.isSubgraph(g,v)}),svgClusters=selection.selectAll("g.cluster").data(clusters,function(v){return v});var makeClusterIdentifier=function(v){return"cluster_"+v.replace(/^cluster/,"")};svgClusters.enter().append("g").attr("class",makeClusterIdentifier).attr("name",function(v){return g.node(v).label}).classed("cluster",true).style("opacity",0).append("rect");var sortedClusters=util.orderByRank(g,svgClusters.data());for(var i=0;i<sortedClusters.length;i++){var v=sortedClusters[i];var node=g.node(v);if(node.label){var thisGroup=selection.select("g.cluster."+makeClusterIdentifier(v));labelGroup=thisGroup.append("g").attr("class","label"),labelDom=addLabel(labelGroup,node),bbox=_.pick(labelDom.node().getBBox(),"width","height");node.paddingTop+=bbox.height;node.paddingTop+=util.getMaxChildPaddingTop(g,v)}}util.applyTransition(svgClusters.exit(),g).style("opacity",0).remove();util.applyTransition(svgClusters,g).style("opacity",1);util.applyTransition(svgClusters.selectAll("rect"),g).attr("width",function(v){var node=g.node(v);return node.width+node.paddingLeft+node.paddingRight}).attr("height",function(v){var node=g.node(v);return node.height+node.paddingTop+node.paddingBottom}).attr("x",function(v){var node=g.node(v);return node.x-node.width/2-node.paddingLeft}).attr("y",function(v){var node=g.node(v);return node.y-node.height/2-node.paddingTop});svgClusters.each(function(){var cluster=d3.select(this),label=cluster.select("g.label"),rect=cluster.select("rect"),bbox=label.node().getBBox(),labelW=bbox.width,labelH=bbox.height;var num=function(x){return parseFloat(x.toString().replace(/px$/,""))};var labelX=num(rect.attr("x"))+num(rect.attr("width"))-labelH/2+labelW/2;var labelY=num(rect.attr("y"))+labelH;label.attr("text-anchor","end").attr("transform","translate("+labelX+","+labelY+")")})}},{"./label/add-label":18,"./lodash":20,"./util":25}],4:[function(require,module,exports){"use strict";var _=require("./lodash"),addLabel=require("./label/add-label"),util=require("./util"),d3=require("./d3");module.exports=createEdgeLabels;function createEdgeLabels(selection,g){var svgEdgeLabels=selection.selectAll("g.edgeLabel").data(g.edges(),function(e){return util.edgeToId(e)}).classed("update",true);svgEdgeLabels.selectAll("*").remove();svgEdgeLabels.enter().append("g").classed("edgeLabel",true).style("opacity",0);svgEdgeLabels.each(function(e){var edge=g.edge(e),label=addLabel(d3.select(this),g.edge(e),0,0).classed("label",true),bbox=label.node().getBBox();if(edge.labelId){label.attr("id",edge.labelId)}if(!_.has(edge,"width")){edge.width=bbox.width}if(!_.has(edge,"height")){edge.height=bbox.height}});util.applyTransition(svgEdgeLabels.exit(),g).style("opacity",0).remove();return svgEdgeLabels}},{"./d3":7,"./label/add-label":18,"./lodash":20,"./util":25}],5:[function(require,module,exports){"use strict";var _=require("./lodash"),intersectNode=require("./intersect/intersect-node"),util=require("./util"),d3=require("./d3");module.exports=createEdgePaths;function createEdgePaths(selection,g,arrows){var svgPaths=selection.selectAll("g.edgePath").data(g.edges(),function(e){return util.edgeToId(e)}).classed("update",true);enter(svgPaths,g);exit(svgPaths,g);util.applyTransition(svgPaths,g).style("opacity",1);svgPaths.each(function(e){var domEdge=d3.select(this);var edge=g.edge(e);edge.elem=this;if(edge.id){domEdge.attr("id",edge.id)}util.applyClass(domEdge,edge["class"],(domEdge.classed("update")?"update ":"")+"edgePath")});svgPaths.selectAll("path.path").each(function(e){var edge=g.edge(e);edge.arrowheadId=_.uniqueId("arrowhead");var domEdge=d3.select(this).attr("marker-end",function(){return"url(#"+edge.arrowheadId+")"}).style("fill","none");util.applyTransition(domEdge,g).attr("d",function(e){return calcPoints(g,e)});util.applyStyle(domEdge,edge.style)});svgPaths.selectAll("defs *").remove();svgPaths.selectAll("defs").each(function(e){var edge=g.edge(e),arrowhead=arrows[edge.arrowhead];arrowhead(d3.select(this),edge.arrowheadId,edge,"arrowhead")});return svgPaths}function calcPoints(g,e){var edge=g.edge(e),tail=g.node(e.v),head=g.node(e.w),points=edge.points.slice(1,edge.points.length-1);points.unshift(intersectNode(tail,points[0]));points.push(intersectNode(head,points[points.length-1]));return createLine(edge,points)}function createLine(edge,points){var line=d3.svg.line().x(function(d){return d.x}).y(function(d){return d.y});if(_.has(edge,"lineInterpolate")){line.interpolate(edge.lineInterpolate)}if(_.has(edge,"lineTension")){line.tension(Number(edge.lineTension))}return line(points)}function getCoords(elem){var bbox=elem.getBBox(),matrix=elem.getTransformToElement(elem.ownerSVGElement).translate(bbox.width/2,bbox.height/2);return{x:matrix.e,y:matrix.f}}function enter(svgPaths,g){var svgPathsEnter=svgPaths.enter().append("g").attr("class","edgePath").style("opacity",0);svgPathsEnter.append("path").attr("class","path").attr("d",function(e){var edge=g.edge(e),sourceElem=g.node(e.v).elem,points=_.range(edge.points.length).map(function(){return getCoords(sourceElem)});return createLine(edge,points)});svgPathsEnter.append("defs")}function exit(svgPaths,g){var svgPathExit=svgPaths.exit();util.applyTransition(svgPathExit,g).style("opacity",0).remove();util.applyTransition(svgPathExit.select("path.path"),g).attr("d",function(e){var source=g.node(e.v);if(source){var points=_.range(this.pathSegList.length).map(function(){return source});return createLine({},points)}else{return d3.select(this).attr("d")}})}},{"./d3":7,"./intersect/intersect-node":14,"./lodash":20,"./util":25}],6:[function(require,module,exports){"use strict";var _=require("./lodash"),addLabel=require("./label/add-label"),util=require("./util"),d3=require("./d3");module.exports=createNodes;function createNodes(selection,g,shapes){var simpleNodes=g.nodes().filter(function(v){return!util.isSubgraph(g,v)});var svgNodes=selection.selectAll("g.node").data(simpleNodes,function(v){return v}).classed("update",true);svgNodes.selectAll("*").remove();svgNodes.enter().append("g").attr("class",function(v){return"node_"+v}).attr("name",function(v){return g.node(v).label}).classed("node",true).style("opacity",0);svgNodes.each(function(v){var node=g.node(v),thisGroup=d3.select(this),labelGroup=thisGroup.append("g").attr("class","label"),labelDom=addLabel(labelGroup,node),shape=shapes[node.shape],bbox=_.pick(labelDom.node().getBBox(),"width","height");node.elem=this;if(node.id){thisGroup.attr("id",node.id)}if(node.labelId){labelGroup.attr("id",node.labelId)}util.applyClass(thisGroup,node["class"],(thisGroup.classed("update")?"update ":"")+"node");if(_.has(node,"width")){bbox.width=node.width}if(_.has(node,"height")){bbox.height=node.height}bbox.width+=node.paddingLeft+node.paddingRight;bbox.height+=node.paddingTop+node.paddingBottom;labelGroup.attr("transform","translate("+(node.paddingLeft-node.paddingRight)/2+","+(node.paddingTop-node.paddingBottom)/2+")");var shapeSvg=shape(d3.select(this),bbox,node);util.applyStyle(shapeSvg,node.style);var requiredWidth=0,requiredHeight=0;var nextNode=g.node(g.parent(v));while(nextNode){var tempGroup=thisGroup.append("g");var tempLabel=addLabel(tempGroup,nextNode);var tempBBox=tempLabel.node().getBBox();tempBBox.width-=50;requiredWidth=Math.max(requiredWidth,tempBBox.width);requiredHeight=Math.max(requiredHeight,tempBBox.height);tempLabel.remove();nextNode=g.node(g.parent(nextNode.label))}var shapeBBox=shapeSvg.node().getBBox();shapeBBox.width=Math.max(shapeBBox.width,requiredWidth);shapeBBox.height=Math.max(shapeBBox.height,requiredHeight);node.width=shapeBBox.width;node.height=shapeBBox.height});util.applyTransition(svgNodes.exit(),g).style("opacity",0).remove();return svgNodes}},{"./d3":7,"./label/add-label":18,"./lodash":20,"./util":25}],7:[function(require,module,exports){module.exports=window.d3},{}],8:[function(require,module,exports){var dagre;if(require){try{dagre=require("dagre")}catch(e){}}if(!dagre){dagre=window.dagre}module.exports=dagre},{dagre:27}],9:[function(require,module,exports){var graphlib;if(require){try{graphlib=require("graphlib")}catch(e){}}if(!graphlib){graphlib=window.graphlib}module.exports=graphlib},{graphlib:57}],10:[function(require,module,exports){module.exports={node:require("./intersect-node"),circle:require("./intersect-circle"),ellipse:require("./intersect-ellipse"),polygon:require("./intersect-polygon"),rect:require("./intersect-rect")}},{"./intersect-circle":11,"./intersect-ellipse":12,"./intersect-node":14,"./intersect-polygon":15,"./intersect-rect":16}],11:[function(require,module,exports){var intersectEllipse=require("./intersect-ellipse");module.exports=intersectCircle;function intersectCircle(node,rx,point){return intersectEllipse(node,rx,rx,point)}},{"./intersect-ellipse":12}],12:[function(require,module,exports){module.exports=intersectEllipse;function intersectEllipse(node,rx,ry,point){var cx=node.x;var cy=node.y;var px=cx-point.x;var py=cy-point.y;var det=Math.sqrt(rx*rx*py*py+ry*ry*px*px);var dx=Math.abs(rx*ry*px/det);if(point.x<cx){dx=-dx}var dy=Math.abs(rx*ry*py/det);if(point.y<cy){dy=-dy}return{x:cx+dx,y:cy+dy}}},{}],13:[function(require,module,exports){module.exports=intersectLine;function intersectLine(p1,p2,q1,q2){var a1,a2,b1,b2,c1,c2;var r1,r2,r3,r4;var denom,offset,num;var x,y;a1=p2.y-p1.y;b1=p1.x-p2.x;c1=p2.x*p1.y-p1.x*p2.y;r3=a1*q1.x+b1*q1.y+c1;r4=a1*q2.x+b1*q2.y+c1;if(r3!==0&&r4!==0&&sameSign(r3,r4)){return}a2=q2.y-q1.y;b2=q1.x-q2.x;c2=q2.x*q1.y-q1.x*q2.y;r1=a2*p1.x+b2*p1.yy+c2;r2=a2*p2.x+b2*p2.y+c2;if(r1!==0&&r2!==0&&sameSign(r1,r2)){return}denom=a1*b2-a2*b1;if(denom===0){return}offset=Math.abs(denom/2);num=b1*c2-b2*c1;x=num<0?(num-offset)/denom:(num+offset)/denom;num=a2*c1-a1*c2;y=num<0?(num-offset)/denom:(num+offset)/denom;return{x:x,y:y}}function sameSign(r1,r2){return r1*r2>0}},{}],14:[function(require,module,exports){module.exports=intersectNode;function intersectNode(node,point){return node.intersect(point)}},{}],15:[function(require,module,exports){var intersectLine=require("./intersect-line");module.exports=intersectPolygon;function intersectPolygon(node,polyPoints,point){var x1=node.x;var y1=node.y;var intersections=[];var minX=Number.POSITIVE_INFINITY,minY=Number.POSITIVE_INFINITY;polyPoints.forEach(function(entry){minX=Math.min(minX,entry.x);minY=Math.min(minY,entry.y)});var left=x1-node.width/2-minX;var top=y1-node.height/2-minY;for(var i=0;i<polyPoints.length;i++){var p1=polyPoints[i];var p2=polyPoints[i<polyPoints.length-1?i+1:0];var intersect=intersectLine(node,point,{x:left+p1.x,y:top+p1.y},{x:left+p2.x,y:top+p2.y});if(intersect){intersections.push(intersect)}}if(!intersections.length){console.log("NO INTERSECTION FOUND, RETURN NODE CENTER",node);return node}if(intersections.length>1){intersections.sort(function(p,q){var pdx=p.x-point.x,pdy=p.y-point.y,distp=Math.sqrt(pdx*pdx+pdy*pdy),qdx=q.x-point.x,qdy=q.y-point.y,distq=Math.sqrt(qdx*qdx+qdy*qdy);return distp<distq?-1:distp===distq?0:1})}return intersections[0]}},{"./intersect-line":13}],16:[function(require,module,exports){module.exports=intersectRect;function intersectRect(node,point){var x=node.x;var y=node.y;var dx=point.x-x;var dy=point.y-y;var w=node.width/2;var h=node.height/2;var sx,sy;if(Math.abs(dy)*w>Math.abs(dx)*h){if(dy<0){h=-h}sx=dy===0?0:h*dx/dy;sy=h}else{if(dx<0){w=-w}sx=w;sy=dx===0?0:w*dy/dx}return{x:x+sx,y:y+sy}}},{}],17:[function(require,module,exports){var util=require("../util");module.exports=addHtmlLabel;function addHtmlLabel(root,node){var fo=root.append("foreignObject").attr("width","100000");var div=fo.append("xhtml:div");var label=node.label;switch(typeof label){case"function":div.insert(label);break;case"object":div.insert(function(){return label});break;default:div.html(label)}util.applyStyle(div,node.labelStyle);div.style("display","inline-block");div.style("white-space","nowrap");var w,h;div.each(function(){w=this.clientWidth;h=this.clientHeight});fo.attr("width",w).attr("height",h);return fo}},{"../util":25}],18:[function(require,module,exports){var addTextLabel=require("./add-text-label"),addHtmlLabel=require("./add-html-label");module.exports=addLabel;function addLabel(root,node){var label=node.label;var labelSvg=root.append("g");if(typeof label!=="string"||node.labelType==="html"){addHtmlLabel(labelSvg,node)}else{addTextLabel(labelSvg,node)}var labelBBox=labelSvg.node().getBBox();labelSvg.attr("transform","translate("+-labelBBox.width/2+","+-labelBBox.height/2+")");return labelSvg}},{"./add-html-label":17,"./add-text-label":19}],19:[function(require,module,exports){var util=require("../util");module.exports=addTextLabel;function addTextLabel(root,node){var domNode=root.append("text");var lines=processEscapeSequences(node.label).split("\n");for(var i=0;i<lines.length;i++){domNode.append("tspan").attr("xml:space","preserve").attr("dy","1em").attr("x","1").text(lines[i])}util.applyStyle(domNode,node.labelStyle);return domNode}function processEscapeSequences(text){var newText="",escaped=false,ch;for(var i=0;i<text.length;++i){ch=text[i];if(escaped){switch(ch){case"n":newText+="\n";break;default:newText+=ch}escaped=false}else if(ch==="\\"){escaped=true}else{newText+=ch}}return newText}},{"../util":25}],20:[function(require,module,exports){var lodash;if(require){try{lodash=require("lodash")}catch(e){}}if(!lodash){lodash=window._}module.exports=lodash},{lodash:77}],21:[function(require,module,exports){"use strict";var util=require("./util"),d3=require("./d3"),_=require("./lodash");module.exports=positionEdgeLabels;function positionEdgeLabels(selection,g){var created=selection.filter(function(){return!d3.select(this).classed("update")});function translate(e){var edge=g.edge(e);return _.has(edge,"x")?"translate("+edge.x+","+edge.y+")":""}created.attr("transform",translate);util.applyTransition(selection,g).style("opacity",1).attr("transform",translate)}},{"./d3":7,"./lodash":20,"./util":25}],22:[function(require,module,exports){"use strict";var util=require("./util"),d3=require("./d3");module.exports=positionNodes;function positionNodes(selection,g){var created=selection.filter(function(){return!d3.select(this).classed("update")});function translate(v){var node=g.node(v);return"translate("+node.x+","+node.y+")"}created.attr("transform",translate);util.applyTransition(selection,g).style("opacity",1).attr("transform",translate)}},{"./d3":7,"./util":25}],23:[function(require,module,exports){var _=require("./lodash"),layout=require("./dagre").layout;module.exports=render;function render(){var createNodes=require("./create-nodes"),createClusters=require("./create-clusters"),createEdgeLabels=require("./create-edge-labels"),createEdgePaths=require("./create-edge-paths"),positionNodes=require("./position-nodes"),positionEdgeLabels=require("./position-edge-labels"),shapes=require("./shapes"),arrows=require("./arrows");var fn=function(svg,g){preProcessGraph(g);var outputGroup=createOrSelectGroup(svg,"output"),clustersGroup=createOrSelectGroup(outputGroup,"clusters"),edgePathsGroup=createOrSelectGroup(outputGroup,"edgePaths"),edgeLabels=createEdgeLabels(createOrSelectGroup(outputGroup,"edgeLabels"),g),nodes=createNodes(createOrSelectGroup(outputGroup,"nodes"),g,shapes);layout(g);positionNodes(nodes,g);positionEdgeLabels(edgeLabels,g);createEdgePaths(edgePathsGroup,g,arrows);createClusters(clustersGroup,g);postProcessGraph(g)};fn.createNodes=function(value){if(!arguments.length)return createNodes;createNodes=value;return fn};fn.createClusters=function(value){if(!arguments.length)return createClusters;createClusters=value;return fn};fn.createEdgeLabels=function(value){if(!arguments.length)return createEdgeLabels;createEdgeLabels=value;return fn};fn.createEdgePaths=function(value){if(!arguments.length)return createEdgePaths;createEdgePaths=value;return fn};fn.shapes=function(value){if(!arguments.length)return shapes;shapes=value;return fn};fn.arrows=function(value){if(!arguments.length)return arrows;arrows=value;return fn};return fn}var NODE_DEFAULT_ATTRS={paddingLeft:0,paddingRight:0,paddingTop:0,paddingBottom:0,rx:0,ry:0,shape:"rect"};var EDGE_DEFAULT_ATTRS={arrowhead:"normal",lineInterpolate:"linear"};function preProcessGraph(g){g.nodes().forEach(function(v){var node=g.node(v);if(!_.has(node,"label")){node.label=v}if(_.has(node,"paddingX")){_.defaults(node,{paddingLeft:node.paddingX,paddingRight:node.paddingX})}if(_.has(node,"paddingY")){_.defaults(node,{paddingTop:node.paddingY,paddingBottom:node.paddingY})}if(_.has(node,"padding")){_.defaults(node,{paddingLeft:node.padding,paddingRight:node.padding,paddingTop:node.padding,paddingBottom:node.padding})}if(_.has(node,"paddingLeft")){_.defaults(node,{paddingLeft:node.paddingLeft})}if(_.has(node,"paddingRight")){_.defaults(node,{paddingRight:node.paddingRight})}if(_.has(node,"paddingTop")){_.defaults(node,{paddingTop:node.paddingTop})}if(_.has(node,"paddingBottom")){_.defaults(node,{paddingBottom:node.paddingBottom})}_.defaults(node,NODE_DEFAULT_ATTRS);_.each(["paddingLeft","paddingRight","paddingTop","paddingBottom"],function(k){node[k]=Number(node[k])});if(_.has(node,"width")){node._prevWidth=node.width}if(_.has(node,"height")){node._prevHeight=node.height}});g.edges().forEach(function(e){var edge=g.edge(e);if(!_.has(edge,"label")){edge.label=""}_.defaults(edge,EDGE_DEFAULT_ATTRS)})}function postProcessGraph(g){_.each(g.nodes(),function(v){var node=g.node(v);if(_.has(node,"_prevWidth")){node.width=node._prevWidth}else{delete node.width}if(_.has(node,"_prevHeight")){node.height=node._prevHeight}else{delete node.height}delete node._prevWidth;delete node._prevHeight})}function createOrSelectGroup(root,name){var selection=root.select("g."+name);if(selection.empty()){selection=root.append("g").attr("class",name)}return selection}},{"./arrows":2,"./create-clusters":3,"./create-edge-labels":4,"./create-edge-paths":5,"./create-nodes":6,"./dagre":8,"./lodash":20,"./position-edge-labels":21,"./position-nodes":22,"./shapes":24}],24:[function(require,module,exports){"use strict";var intersectRect=require("./intersect/intersect-rect"),intersectEllipse=require("./intersect/intersect-ellipse"),intersectCircle=require("./intersect/intersect-circle"),intersectPolygon=require("./intersect/intersect-polygon");module.exports={rect:rect,ellipse:ellipse,circle:circle,diamond:diamond};function rect(parent,bbox,node){var shapeSvg=parent.insert("rect",":first-child").attr("rx",node.rx).attr("ry",node.ry).attr("x",-bbox.width/2).attr("y",-bbox.height/2).attr("width",bbox.width).attr("height",bbox.height);node.intersect=function(point){return intersectRect(node,point)};return shapeSvg}function ellipse(parent,bbox,node){var rx=bbox.width/2,ry=bbox.height/2,shapeSvg=parent.insert("ellipse",":first-child").attr("x",-bbox.width/2).attr("y",-bbox.height/2).attr("rx",rx).attr("ry",ry);node.intersect=function(point){return intersectEllipse(node,rx,ry,point)};return shapeSvg}function circle(parent,bbox,node){var r=Math.max(bbox.width,bbox.height)/2,shapeSvg=parent.insert("circle",":first-child").attr("x",-bbox.width/2).attr("y",-bbox.height/2).attr("r",r);node.intersect=function(point){return intersectCircle(node,r,point)};return shapeSvg}function diamond(parent,bbox,node){var w=bbox.width*Math.SQRT2/2,h=bbox.height*Math.SQRT2/2,points=[{x:0,y:-h},{x:-w,y:0},{x:0,y:h},{x:w,y:0}],shapeSvg=parent.insert("polygon",":first-child").attr("points",points.map(function(p){return p.x+","+p.y}).join(" "));node.intersect=function(p){return intersectPolygon(node,points,p)};return shapeSvg}},{"./intersect/intersect-circle":11,"./intersect/intersect-ellipse":12,"./intersect/intersect-polygon":15,"./intersect/intersect-rect":16}],25:[function(require,module,exports){var _=require("./lodash");module.exports={isSubgraph:isSubgraph,getMaxChildPaddingTop:getMaxChildPaddingTop,orderByRank:orderByRank,edgeToId:edgeToId,applyStyle:applyStyle,applyClass:applyClass,applyTransition:applyTransition};function isSubgraph(g,v){return!!g.children(v).length}function getMaxChildPaddingTop(g,v){var maxPadding=0;var children=g.children(v);for(var i=0;i<children.length;i++){var child=g.node(children[i]);if(child.paddingTop&&child.paddingTop>maxPadding){maxPadding=child.paddingTop}}return maxPadding}function getRank(g,v){var maxRank=0;var children=g.children(v);for(var i=0;i<children.length;i++){var thisRank=getRank(g,children[i])+1;if(thisRank>maxRank){maxRank=thisRank}}return maxRank}function orderByRank(g,nodes){return nodes.sort(function(x,y){return getRank(g,x)-getRank(g,y)})}function edgeToId(e){return escapeId(e.v)+":"+escapeId(e.w)+":"+escapeId(e.name)}var ID_DELIM=/:/g;function escapeId(str){return str?String(str).replace(ID_DELIM,"\\:"):""}function applyStyle(dom,styleFn){if(styleFn){dom.attr("style",styleFn)}}function applyClass(dom,classFn,otherClasses){if(classFn){dom.attr("class",classFn).attr("class",otherClasses+" "+dom.attr("class"))}}function applyTransition(selection,g){var graph=g.graph();if(_.isPlainObject(graph)){var transition=graph.transition;if(_.isFunction(transition)){return transition(selection)}}return selection}},{"./lodash":20}],26:[function(require,module,exports){module.exports="0.4.4-pre"},{}],27:[function(require,module,exports){module.exports={graphlib:require("./lib/graphlib"),layout:require("./lib/layout"),debug:require("./lib/debug"),util:{time:require("./lib/util").time,notime:require("./lib/util").notime},version:require("./lib/version")}},{"./lib/debug":32,"./lib/graphlib":33,"./lib/layout":35,"./lib/util":55,"./lib/version":56}],28:[function(require,module,exports){"use strict";var _=require("./lodash"),greedyFAS=require("./greedy-fas");module.exports={run:run,undo:undo};function run(g){var fas=g.graph().acyclicer==="greedy"?greedyFAS(g,weightFn(g)):dfsFAS(g);_.each(fas,function(e){var label=g.edge(e);g.removeEdge(e);label.forwardName=e.name;label.reversed=true;g.setEdge(e.w,e.v,label,_.uniqueId("rev"))});function weightFn(g){return function(e){return g.edge(e).weight}}}function dfsFAS(g){var fas=[],stack={},visited={};function dfs(v){if(_.has(visited,v)){return}visited[v]=true;stack[v]=true;_.each(g.outEdges(v),function(e){if(_.has(stack,e.w)){fas.push(e)}else{dfs(e.w)}});delete stack[v]}_.each(g.nodes(),dfs);return fas}function undo(g){_.each(g.edges(),function(e){var label=g.edge(e);if(label.reversed){g.removeEdge(e);var forwardName=label.forwardName;delete label.reversed;delete label.forwardName;g.setEdge(e.w,e.v,label,forwardName)}})}},{"./greedy-fas":34,"./lodash":36}],29:[function(require,module,exports){var _=require("./lodash"),util=require("./util");module.exports=addBorderSegments;function addBorderSegments(g){function dfs(v){var children=g.children(v),node=g.node(v);if(children.length){_.each(children,dfs)}if(_.has(node,"minRank")){node.borderLeft=[];node.borderRight=[];for(var rank=node.minRank,maxRank=node.maxRank+1;rank<maxRank;++rank){addBorderNode(g,"borderLeft","_bl",v,node,rank);addBorderNode(g,"borderRight","_br",v,node,rank)}}}_.each(g.children(),dfs)}function addBorderNode(g,prop,prefix,sg,sgNode,rank){var label={width:0,height:0,rank:rank},prev=sgNode[prop][rank-1],curr=util.addDummyNode(g,"border",label,prefix);sgNode[prop][rank]=curr;g.setParent(curr,sg);if(prev){g.setEdge(prev,curr,{weight:1})}}},{"./lodash":36,"./util":55}],30:[function(require,module,exports){"use strict";var _=require("./lodash");module.exports={adjust:adjust,undo:undo};function adjust(g){var rankDir=g.graph().rankdir.toLowerCase();if(rankDir==="lr"||rankDir==="rl"){swapWidthHeight(g)}}function undo(g){var rankDir=g.graph().rankdir.toLowerCase();if(rankDir==="bt"||rankDir==="rl"){reverseY(g)}if(rankDir==="lr"||rankDir==="rl"){swapXY(g);swapWidthHeight(g)}}function swapWidthHeight(g){_.each(g.nodes(),function(v){swapWidthHeightOne(g.node(v))});_.each(g.edges(),function(e){swapWidthHeightOne(g.edge(e))})}function swapWidthHeightOne(attrs){var w=attrs.width;attrs.width=attrs.height;attrs.height=w}function reverseY(g){_.each(g.nodes(),function(v){reverseYOne(g.node(v))});_.each(g.edges(),function(e){var edge=g.edge(e);_.each(edge.points,reverseYOne);if(_.has(edge,"y")){reverseYOne(edge)}})}function reverseYOne(attrs){attrs.y=-attrs.y}function swapXY(g){_.each(g.nodes(),function(v){swapXYOne(g.node(v))});_.each(g.edges(),function(e){var edge=g.edge(e);_.each(edge.points,swapXYOne);if(_.has(edge,"x")){swapXYOne(edge)}})}function swapXYOne(attrs){var x=attrs.x;attrs.x=attrs.y;attrs.y=x}},{"./lodash":36}],31:[function(require,module,exports){module.exports=List;function List(){var sentinel={};sentinel._next=sentinel._prev=sentinel;this._sentinel=sentinel}List.prototype.dequeue=function(){var sentinel=this._sentinel,entry=sentinel._prev;if(entry!==sentinel){unlink(entry);return entry}};List.prototype.enqueue=function(entry){var sentinel=this._sentinel;if(entry._prev&&entry._next){unlink(entry)}entry._next=sentinel._next;sentinel._next._prev=entry;sentinel._next=entry;entry._prev=sentinel};List.prototype.toString=function(){var strs=[],sentinel=this._sentinel,curr=sentinel._prev;while(curr!==sentinel){strs.push(JSON.stringify(curr,filterOutLinks));curr=curr._prev}return"["+strs.join(", ")+"]"};function unlink(entry){entry._prev._next=entry._next;entry._next._prev=entry._prev;delete entry._next;delete entry._prev}function filterOutLinks(k,v){if(k!=="_next"&&k!=="_prev"){return v}}},{}],32:[function(require,module,exports){var _=require("./lodash"),util=require("./util"),Graph=require("./graphlib").Graph;module.exports={debugOrdering:debugOrdering};function debugOrdering(g){var layerMatrix=util.buildLayerMatrix(g);var h=new Graph({compound:true,multigraph:true}).setGraph({});_.each(g.nodes(),function(v){h.setNode(v,{label:v});h.setParent(v,"layer"+g.node(v).rank)});_.each(g.edges(),function(e){h.setEdge(e.v,e.w,{},e.name)});_.each(layerMatrix,function(layer,i){var layerV="layer"+i;h.setNode(layerV,{rank:"same"});_.reduce(layer,function(u,v){h.setEdge(u,v,{style:"invis"});return v})});return h}},{"./graphlib":33,"./lodash":36,"./util":55}],33:[function(require,module,exports){module.exports=require(9)},{"/Users/andrew/Documents/dev/dagre-d3/lib/graphlib.js":9,graphlib:57}],34:[function(require,module,exports){var _=require("./lodash"),Graph=require("./graphlib").Graph,List=require("./data/list");module.exports=greedyFAS;var DEFAULT_WEIGHT_FN=_.constant(1);function greedyFAS(g,weightFn){if(g.nodeCount()<=1){return[]}var state=buildState(g,weightFn||DEFAULT_WEIGHT_FN);var results=doGreedyFAS(state.graph,state.buckets,state.zeroIdx);return _.flatten(_.map(results,function(e){return g.outEdges(e.v,e.w)}),true)}function doGreedyFAS(g,buckets,zeroIdx){var results=[],sources=buckets[buckets.length-1],sinks=buckets[0];var entry;while(g.nodeCount()){while(entry=sinks.dequeue()){removeNode(g,buckets,zeroIdx,entry)}while(entry=sources.dequeue()){removeNode(g,buckets,zeroIdx,entry)}if(g.nodeCount()){for(var i=buckets.length-2;i>0;--i){entry=buckets[i].dequeue();if(entry){results=results.concat(removeNode(g,buckets,zeroIdx,entry,true));break}}}}return results}function removeNode(g,buckets,zeroIdx,entry,collectPredecessors){var results=collectPredecessors?[]:undefined;_.each(g.inEdges(entry.v),function(edge){var weight=g.edge(edge),uEntry=g.node(edge.v);if(collectPredecessors){results.push({v:edge.v,w:edge.w})}uEntry.out-=weight;assignBucket(buckets,zeroIdx,uEntry)});_.each(g.outEdges(entry.v),function(edge){var weight=g.edge(edge),w=edge.w,wEntry=g.node(w);wEntry["in"]-=weight;assignBucket(buckets,zeroIdx,wEntry)});g.removeNode(entry.v);return results}function buildState(g,weightFn){var fasGraph=new Graph,maxIn=0,maxOut=0;_.each(g.nodes(),function(v){fasGraph.setNode(v,{v:v,"in":0,out:0})});_.each(g.edges(),function(e){var prevWeight=fasGraph.edge(e.v,e.w)||0,weight=weightFn(e),edgeWeight=prevWeight+weight;fasGraph.setEdge(e.v,e.w,edgeWeight);maxOut=Math.max(maxOut,fasGraph.node(e.v).out+=weight);maxIn=Math.max(maxIn,fasGraph.node(e.w)["in"]+=weight)});var buckets=_.range(maxOut+maxIn+3).map(function(){return new List});var zeroIdx=maxIn+1;_.each(fasGraph.nodes(),function(v){assignBucket(buckets,zeroIdx,fasGraph.node(v))});return{graph:fasGraph,buckets:buckets,zeroIdx:zeroIdx}}function assignBucket(buckets,zeroIdx,entry){if(!entry.out){buckets[0].enqueue(entry)}else if(!entry["in"]){buckets[buckets.length-1].enqueue(entry)}else{buckets[entry.out-entry["in"]+zeroIdx].enqueue(entry)}}},{"./data/list":31,"./graphlib":33,"./lodash":36}],35:[function(require,module,exports){"use strict";var _=require("./lodash"),acyclic=require("./acyclic"),normalize=require("./normalize"),rank=require("./rank"),normalizeRanks=require("./util").normalizeRanks,parentDummyChains=require("./parent-dummy-chains"),removeEmptyRanks=require("./util").removeEmptyRanks,nestingGraph=require("./nesting-graph"),addBorderSegments=require("./add-border-segments"),coordinateSystem=require("./coordinate-system"),order=require("./order"),position=require("./position"),util=require("./util"),Graph=require("./graphlib").Graph;module.exports=layout;function layout(g,opts){var time=opts&&opts.debugTiming?util.time:util.notime;time("layout",function(){var layoutGraph=time("  buildLayoutGraph",function(){return buildLayoutGraph(g)});time("  runLayout",function(){runLayout(layoutGraph,time)});time("  updateInputGraph",function(){updateInputGraph(g,layoutGraph)})})}function runLayout(g,time){time("    makeSpaceForEdgeLabels",function(){makeSpaceForEdgeLabels(g)});time("    removeSelfEdges",function(){removeSelfEdges(g)});time("    acyclic",function(){acyclic.run(g)});time("    nestingGraph.run",function(){nestingGraph.run(g)});time("    rank",function(){rank(util.asNonCompoundGraph(g))});time("    injectEdgeLabelProxies",function(){injectEdgeLabelProxies(g)});time("    removeEmptyRanks",function(){removeEmptyRanks(g)});time("    nestingGraph.cleanup",function(){nestingGraph.cleanup(g)});time("    normalizeRanks",function(){normalizeRanks(g)});time("    assignRankMinMax",function(){assignRankMinMax(g)});time("    removeEdgeLabelProxies",function(){removeEdgeLabelProxies(g)});time("    normalize.run",function(){normalize.run(g)});time("    parentDummyChains",function(){
 parentDummyChains(g)});time("    addBorderSegments",function(){addBorderSegments(g)});time("    order",function(){order(g)});time("    insertSelfEdges",function(){insertSelfEdges(g)});time("    adjustCoordinateSystem",function(){coordinateSystem.adjust(g)});time("    position",function(){position(g)});time("    positionSelfEdges",function(){positionSelfEdges(g)});time("    removeBorderNodes",function(){removeBorderNodes(g)});time("    normalize.undo",function(){normalize.undo(g)});time("    fixupEdgeLabelCoords",function(){fixupEdgeLabelCoords(g)});time("    undoCoordinateSystem",function(){coordinateSystem.undo(g)});time("    translateGraph",function(){translateGraph(g)});time("    assignNodeIntersects",function(){assignNodeIntersects(g)});time("    reversePoints",function(){reversePointsForReversedEdges(g)});time("    acyclic.undo",function(){acyclic.undo(g)})}function updateInputGraph(inputGraph,layoutGraph){_.each(inputGraph.nodes(),function(v){var inputLabel=inputGraph.node(v),layoutLabel=layoutGraph.node(v);if(inputLabel){inputLabel.x=layoutLabel.x;inputLabel.y=layoutLabel.y;if(layoutGraph.children(v).length){inputLabel.width=layoutLabel.width;inputLabel.height=layoutLabel.height}}});_.each(inputGraph.edges(),function(e){var inputLabel=inputGraph.edge(e),layoutLabel=layoutGraph.edge(e);inputLabel.points=layoutLabel.points;if(_.has(layoutLabel,"x")){inputLabel.x=layoutLabel.x;inputLabel.y=layoutLabel.y}});inputGraph.graph().width=layoutGraph.graph().width;inputGraph.graph().height=layoutGraph.graph().height}var graphNumAttrs=["nodesep","edgesep","ranksep","marginx","marginy"],graphDefaults={ranksep:50,edgesep:20,nodesep:50,rankdir:"tb"},graphAttrs=["acyclicer","ranker","rankdir","align"],nodeNumAttrs=["width","height"],nodeDefaults={width:0,height:0},edgeNumAttrs=["minlen","weight","width","height","labeloffset"],edgeDefaults={minlen:1,weight:1,width:0,height:0,labeloffset:10,labelpos:"r"},edgeAttrs=["labelpos"];function buildLayoutGraph(inputGraph){var g=new Graph({multigraph:true,compound:true}),graph=canonicalize(inputGraph.graph());g.setGraph(_.merge({},graphDefaults,selectNumberAttrs(graph,graphNumAttrs),_.pick(graph,graphAttrs)));_.each(inputGraph.nodes(),function(v){var node=canonicalize(inputGraph.node(v));g.setNode(v,_.defaults(selectNumberAttrs(node,nodeNumAttrs),nodeDefaults));g.setParent(v,inputGraph.parent(v))});_.each(inputGraph.edges(),function(e){var edge=canonicalize(inputGraph.edge(e));g.setEdge(e,_.merge({},edgeDefaults,selectNumberAttrs(edge,edgeNumAttrs),_.pick(edge,edgeAttrs)))});return g}function makeSpaceForEdgeLabels(g){var graph=g.graph();graph.ranksep/=2;_.each(g.edges(),function(e){var edge=g.edge(e);edge.minlen*=2;if(edge.labelpos.toLowerCase()!=="c"){if(graph.rankdir==="TB"||graph.rankdir==="BT"){edge.width+=edge.labeloffset}else{edge.height+=edge.labeloffset}}})}function injectEdgeLabelProxies(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.width&&edge.height){var v=g.node(e.v),w=g.node(e.w),label={rank:(w.rank-v.rank)/2+v.rank,e:e};util.addDummyNode(g,"edge-proxy",label,"_ep")}})}function assignRankMinMax(g){var maxRank=0;_.each(g.nodes(),function(v){var node=g.node(v);if(node.borderTop){node.minRank=g.node(node.borderTop).rank;node.maxRank=g.node(node.borderBottom).rank;maxRank=_.max(maxRank,node.maxRank)}});g.graph().maxRank=maxRank}function removeEdgeLabelProxies(g){_.each(g.nodes(),function(v){var node=g.node(v);if(node.dummy==="edge-proxy"){g.edge(node.e).labelRank=node.rank;g.removeNode(v)}})}function translateGraph(g){var minX=Number.POSITIVE_INFINITY,maxX=0,minY=Number.POSITIVE_INFINITY,maxY=0,graphLabel=g.graph(),marginX=graphLabel.marginx||0,marginY=graphLabel.marginy||0;function getExtremes(attrs){var x=attrs.x,y=attrs.y,w=attrs.width,h=attrs.height;minX=Math.min(minX,x-w/2);maxX=Math.max(maxX,x+w/2);minY=Math.min(minY,y-h/2);maxY=Math.max(maxY,y+h/2)}_.each(g.nodes(),function(v){getExtremes(g.node(v))});_.each(g.edges(),function(e){var edge=g.edge(e);if(_.has(edge,"x")){getExtremes(edge)}});minX-=marginX;minY-=marginY;_.each(g.nodes(),function(v){var node=g.node(v);node.x-=minX;node.y-=minY});_.each(g.edges(),function(e){var edge=g.edge(e);_.each(edge.points,function(p){p.x-=minX;p.y-=minY});if(_.has(edge,"x")){edge.x-=minX}if(_.has(edge,"y")){edge.y-=minY}});graphLabel.width=maxX-minX+marginX;graphLabel.height=maxY-minY+marginY}function assignNodeIntersects(g){_.each(g.edges(),function(e){var edge=g.edge(e),nodeV=g.node(e.v),nodeW=g.node(e.w),p1,p2;if(!edge.points){edge.points=[];p1=nodeW;p2=nodeV}else{p1=edge.points[0];p2=edge.points[edge.points.length-1]}edge.points.unshift(util.intersectRect(nodeV,p1));edge.points.push(util.intersectRect(nodeW,p2))})}function fixupEdgeLabelCoords(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(_.has(edge,"x")){if(edge.labelpos==="l"||edge.labelpos==="r"){edge.width-=edge.labeloffset}switch(edge.labelpos){case"l":edge.x-=edge.width/2+edge.labeloffset;break;case"r":edge.x+=edge.width/2+edge.labeloffset;break}}})}function reversePointsForReversedEdges(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.reversed){edge.points.reverse()}})}function removeBorderNodes(g){_.each(g.nodes(),function(v){if(g.children(v).length){var node=g.node(v),t=g.node(node.borderTop),b=g.node(node.borderBottom),l=g.node(_.last(node.borderLeft)),r=g.node(_.last(node.borderRight));node.width=Math.abs(r.x-l.x);node.height=Math.abs(b.y-t.y);node.x=l.x+node.width/2;node.y=t.y+node.height/2}});_.each(g.nodes(),function(v){if(g.node(v).dummy==="border"){g.removeNode(v)}})}function removeSelfEdges(g){_.each(g.edges(),function(e){if(e.v===e.w){var node=g.node(e.v);if(!node.selfEdges){node.selfEdges=[]}node.selfEdges.push({e:e,label:g.edge(e)});g.removeEdge(e)}})}function insertSelfEdges(g){var layers=util.buildLayerMatrix(g);_.each(layers,function(layer){var orderShift=0;_.each(layer,function(v,i){var node=g.node(v);node.order=i+orderShift;_.each(node.selfEdges,function(selfEdge){util.addDummyNode(g,"selfedge",{width:selfEdge.label.width,height:selfEdge.label.height,rank:node.rank,order:i+ ++orderShift,e:selfEdge.e,label:selfEdge.label},"_se")});delete node.selfEdges})})}function positionSelfEdges(g){_.each(g.nodes(),function(v){var node=g.node(v);if(node.dummy==="selfedge"){var selfNode=g.node(node.e.v),x=selfNode.x+selfNode.width/2,y=selfNode.y,dx=node.x-x,dy=selfNode.height/2;g.setEdge(node.e,node.label);g.removeNode(v);node.label.points=[{x:x+2*dx/3,y:y-dy},{x:x+5*dx/6,y:y-dy},{x:x+dx,y:y},{x:x+5*dx/6,y:y+dy},{x:x+2*dx/3,y:y+dy}];node.label.x=node.x;node.label.y=node.y}})}function selectNumberAttrs(obj,attrs){return _.mapValues(_.pick(obj,attrs),Number)}function canonicalize(attrs){var newAttrs={};_.each(attrs,function(v,k){newAttrs[k.toLowerCase()]=v});return newAttrs}},{"./acyclic":28,"./add-border-segments":29,"./coordinate-system":30,"./graphlib":33,"./lodash":36,"./nesting-graph":37,"./normalize":38,"./order":43,"./parent-dummy-chains":48,"./position":50,"./rank":52,"./util":55}],36:[function(require,module,exports){module.exports=require(20)},{"/Users/andrew/Documents/dev/dagre-d3/lib/lodash.js":20,lodash:77}],37:[function(require,module,exports){var _=require("./lodash"),util=require("./util");module.exports={run:run,cleanup:cleanup};function run(g){var root=util.addDummyNode(g,"root",{},"_root"),depths=treeDepths(g),height=_.max(depths)-1,nodeSep=2*height+1;g.graph().nestingRoot=root;_.each(g.edges(),function(e){g.edge(e).minlen*=nodeSep});var weight=sumWeights(g)+1;_.each(g.children(),function(child){dfs(g,root,nodeSep,weight,height,depths,child)});g.graph().nodeRankFactor=nodeSep}function dfs(g,root,nodeSep,weight,height,depths,v){var children=g.children(v);if(!children.length){if(v!==root){g.setEdge(root,v,{weight:0,minlen:nodeSep})}return}var top=util.addBorderNode(g,"_bt"),bottom=util.addBorderNode(g,"_bb"),label=g.node(v);g.setParent(top,v);label.borderTop=top;g.setParent(bottom,v);label.borderBottom=bottom;_.each(children,function(child){dfs(g,root,nodeSep,weight,height,depths,child);var childNode=g.node(child),childTop=childNode.borderTop?childNode.borderTop:child,childBottom=childNode.borderBottom?childNode.borderBottom:child,thisWeight=childNode.borderTop?weight:2*weight,minlen=childTop!==childBottom?1:height-depths[v]+1;g.setEdge(top,childTop,{weight:thisWeight,minlen:minlen,nestingEdge:true});g.setEdge(childBottom,bottom,{weight:thisWeight,minlen:minlen,nestingEdge:true})});if(!g.parent(v)){g.setEdge(root,top,{weight:0,minlen:height+depths[v]})}}function treeDepths(g){var depths={};function dfs(v,depth){var children=g.children(v);if(children&&children.length){_.each(children,function(child){dfs(child,depth+1)})}depths[v]=depth}_.each(g.children(),function(v){dfs(v,1)});return depths}function sumWeights(g){return _.reduce(g.edges(),function(acc,e){return acc+g.edge(e).weight},0)}function cleanup(g){var graphLabel=g.graph();g.removeNode(graphLabel.nestingRoot);delete graphLabel.nestingRoot;_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.nestingEdge){g.removeEdge(e)}})}},{"./lodash":36,"./util":55}],38:[function(require,module,exports){"use strict";var _=require("./lodash"),util=require("./util");module.exports={run:run,undo:undo};function run(g){g.graph().dummyChains=[];_.each(g.edges(),function(edge){normalizeEdge(g,edge)})}function normalizeEdge(g,e){var v=e.v,vRank=g.node(v).rank,w=e.w,wRank=g.node(w).rank,name=e.name,edgeLabel=g.edge(e),labelRank=edgeLabel.labelRank;if(wRank===vRank+1)return;g.removeEdge(e);var dummy,attrs,i;for(i=0,++vRank;vRank<wRank;++i,++vRank){edgeLabel.points=[];attrs={width:0,height:0,edgeLabel:edgeLabel,edgeObj:e,rank:vRank};dummy=util.addDummyNode(g,"edge",attrs,"_d");if(vRank===labelRank){attrs.width=edgeLabel.width;attrs.height=edgeLabel.height;attrs.dummy="edge-label";attrs.labelpos=edgeLabel.labelpos}g.setEdge(v,dummy,{weight:edgeLabel.weight},name);if(i===0){g.graph().dummyChains.push(dummy)}v=dummy}g.setEdge(v,w,{weight:edgeLabel.weight},name)}function undo(g){_.each(g.graph().dummyChains,function(v){var node=g.node(v),origLabel=node.edgeLabel,w;g.setEdge(node.edgeObj,origLabel);while(node.dummy){w=g.successors(v)[0];g.removeNode(v);origLabel.points.push({x:node.x,y:node.y});if(node.dummy==="edge-label"){origLabel.x=node.x;origLabel.y=node.y;origLabel.width=node.width;origLabel.height=node.height}v=w;node=g.node(v)}})}},{"./lodash":36,"./util":55}],39:[function(require,module,exports){var _=require("../lodash");module.exports=addSubgraphConstraints;function addSubgraphConstraints(g,cg,vs){var prev={},rootPrev;_.each(vs,function(v){var child=g.parent(v),parent,prevChild;while(child){parent=g.parent(child);if(parent){prevChild=prev[parent];prev[parent]=child}else{prevChild=rootPrev;rootPrev=child}if(prevChild&&prevChild!==child){cg.setEdge(prevChild,child);return}child=parent}})}},{"../lodash":36}],40:[function(require,module,exports){var _=require("../lodash");module.exports=barycenter;function barycenter(g,movable){return _.map(movable,function(v){var inV=g.inEdges(v);if(!inV.length){return{v:v}}else{var result=_.reduce(inV,function(acc,e){var edge=g.edge(e),nodeU=g.node(e.v);return{sum:acc.sum+edge.weight*nodeU.order,weight:acc.weight+edge.weight}},{sum:0,weight:0});return{v:v,barycenter:result.sum/result.weight,weight:result.weight}}})}},{"../lodash":36}],41:[function(require,module,exports){var _=require("../lodash"),Graph=require("../graphlib").Graph;module.exports=buildLayerGraph;function buildLayerGraph(g,rank,relationship){var root=createRootNode(g),result=new Graph({compound:true}).setGraph({root:root}).setDefaultNodeLabel(function(v){return g.node(v)});_.each(g.nodes(),function(v){var node=g.node(v),parent=g.parent(v);if(node.rank===rank||node.minRank<=rank&&rank<=node.maxRank){result.setNode(v);result.setParent(v,parent||root);_.each(g[relationship](v),function(e){var u=e.v===v?e.w:e.v,edge=result.edge(u,v),weight=!_.isUndefined(edge)?edge.weight:0;result.setEdge(u,v,{weight:g.edge(e).weight+weight})});if(_.has(node,"minRank")){result.setNode(v,{borderLeft:node.borderLeft[rank],borderRight:node.borderRight[rank]})}}});return result}function createRootNode(g){var v;while(g.hasNode(v=_.uniqueId("_root")));return v}},{"../graphlib":33,"../lodash":36}],42:[function(require,module,exports){"use strict";var _=require("../lodash");module.exports=crossCount;function crossCount(g,layering){var cc=0;for(var i=1;i<layering.length;++i){cc+=twoLayerCrossCount(g,layering[i-1],layering[i])}return cc}function twoLayerCrossCount(g,northLayer,southLayer){var southPos=_.zipObject(southLayer,_.map(southLayer,function(v,i){return i}));var southEntries=_.flatten(_.map(northLayer,function(v){return _.chain(g.outEdges(v)).map(function(e){return{pos:southPos[e.w],weight:g.edge(e).weight}}).sortBy("pos").value()}),true);var firstIndex=1;while(firstIndex<southLayer.length)firstIndex<<=1;var treeSize=2*firstIndex-1;firstIndex-=1;var tree=_.map(new Array(treeSize),function(){return 0});var cc=0;_.each(southEntries.forEach(function(entry){var index=entry.pos+firstIndex;tree[index]+=entry.weight;var weightSum=0;while(index>0){if(index%2){weightSum+=tree[index+1]}index=index-1>>1;tree[index]+=entry.weight}cc+=entry.weight*weightSum}));return cc}},{"../lodash":36}],43:[function(require,module,exports){"use strict";var _=require("../lodash"),initOrder=require("./init-order"),crossCount=require("./cross-count"),sortSubgraph=require("./sort-subgraph"),buildLayerGraph=require("./build-layer-graph"),addSubgraphConstraints=require("./add-subgraph-constraints"),Graph=require("../graphlib").Graph,util=require("../util");module.exports=order;function order(g){var maxRank=util.maxRank(g),downLayerGraphs=buildLayerGraphs(g,_.range(1,maxRank+1),"inEdges"),upLayerGraphs=buildLayerGraphs(g,_.range(maxRank-1,-1,-1),"outEdges");var layering=initOrder(g);assignOrder(g,layering);var bestCC=Number.POSITIVE_INFINITY,best;for(var i=0,lastBest=0;lastBest<4;++i,++lastBest){sweepLayerGraphs(i%2?downLayerGraphs:upLayerGraphs,i%4>=2);layering=util.buildLayerMatrix(g);var cc=crossCount(g,layering);if(cc<bestCC){lastBest=0;best=_.cloneDeep(layering);bestCC=cc}}assignOrder(g,best)}function buildLayerGraphs(g,ranks,relationship){return _.map(ranks,function(rank){return buildLayerGraph(g,rank,relationship)})}function sweepLayerGraphs(layerGraphs,biasRight){var cg=new Graph;_.each(layerGraphs,function(lg){var root=lg.graph().root;var sorted=sortSubgraph(lg,root,cg,biasRight);_.each(sorted.vs,function(v,i){lg.node(v).order=i});addSubgraphConstraints(lg,cg,sorted.vs)})}function assignOrder(g,layering){_.each(layering,function(layer){_.each(layer,function(v,i){g.node(v).order=i})})}},{"../graphlib":33,"../lodash":36,"../util":55,"./add-subgraph-constraints":39,"./build-layer-graph":41,"./cross-count":42,"./init-order":44,"./sort-subgraph":46}],44:[function(require,module,exports){"use strict";var _=require("../lodash");module.exports=initOrder;function initOrder(g){var visited={},simpleNodes=_.filter(g.nodes(),function(v){return!g.children(v).length}),maxRank=_.max(_.map(simpleNodes,function(v){return g.node(v).rank})),layers=_.map(_.range(maxRank+1),function(){return[]});function dfs(v){if(_.has(visited,v))return;visited[v]=true;var node=g.node(v);layers[node.rank].push(v);_.each(g.successors(v),dfs)}var orderedVs=_.sortBy(simpleNodes,function(v){return g.node(v).rank});_.each(orderedVs,dfs);return layers}},{"../lodash":36}],45:[function(require,module,exports){"use strict";var _=require("../lodash");module.exports=resolveConflicts;function resolveConflicts(entries,cg){var mappedEntries={};_.each(entries,function(entry,i){var tmp=mappedEntries[entry.v]={indegree:0,"in":[],out:[],vs:[entry.v],i:i};if(!_.isUndefined(entry.barycenter)){tmp.barycenter=entry.barycenter;tmp.weight=entry.weight}});_.each(cg.edges(),function(e){var entryV=mappedEntries[e.v],entryW=mappedEntries[e.w];if(!_.isUndefined(entryV)&&!_.isUndefined(entryW)){entryW.indegree++;entryV.out.push(mappedEntries[e.w])}});var sourceSet=_.filter(mappedEntries,function(entry){return!entry.indegree});return doResolveConflicts(sourceSet)}function doResolveConflicts(sourceSet){var entries=[];function handleIn(vEntry){return function(uEntry){if(uEntry.merged){return}if(_.isUndefined(uEntry.barycenter)||_.isUndefined(vEntry.barycenter)||uEntry.barycenter>=vEntry.barycenter){mergeEntries(vEntry,uEntry)}}}function handleOut(vEntry){return function(wEntry){wEntry["in"].push(vEntry);if(--wEntry.indegree===0){sourceSet.push(wEntry)}}}while(sourceSet.length){var entry=sourceSet.pop();entries.push(entry);_.each(entry["in"].reverse(),handleIn(entry));_.each(entry.out,handleOut(entry))}return _.chain(entries).filter(function(entry){return!entry.merged}).map(function(entry){return _.pick(entry,["vs","i","barycenter","weight"])}).value()}function mergeEntries(target,source){var sum=0,weight=0;if(target.weight){sum+=target.barycenter*target.weight;weight+=target.weight}if(source.weight){sum+=source.barycenter*source.weight;weight+=source.weight}target.vs=source.vs.concat(target.vs);target.barycenter=sum/weight;target.weight=weight;target.i=Math.min(source.i,target.i);source.merged=true}},{"../lodash":36}],46:[function(require,module,exports){var _=require("../lodash"),barycenter=require("./barycenter"),resolveConflicts=require("./resolve-conflicts"),sort=require("./sort");module.exports=sortSubgraph;function sortSubgraph(g,v,cg,biasRight){var movable=g.children(v),node=g.node(v),bl=node?node.borderLeft:undefined,br=node?node.borderRight:undefined,subgraphs={};if(bl){movable=_.filter(movable,function(w){return w!==bl&&w!==br})}var barycenters=barycenter(g,movable);_.each(barycenters,function(entry){if(g.children(entry.v).length){var subgraphResult=sortSubgraph(g,entry.v,cg,biasRight);subgraphs[entry.v]=subgraphResult;if(_.has(subgraphResult,"barycenter")){mergeBarycenters(entry,subgraphResult)}}});var entries=resolveConflicts(barycenters,cg);expandSubgraphs(entries,subgraphs);var result=sort(entries,biasRight);if(bl){result.vs=_.flatten([bl,result.vs,br],true);if(g.predecessors(bl).length){var blPred=g.node(g.predecessors(bl)[0]),brPred=g.node(g.predecessors(br)[0]);if(!_.has(result,"barycenter")){result.barycenter=0;result.weight=0}result.barycenter=(result.barycenter*result.weight+blPred.order+brPred.order)/(result.weight+2);result.weight+=2}}return result}function expandSubgraphs(entries,subgraphs){_.each(entries,function(entry){entry.vs=_.flatten(entry.vs.map(function(v){if(subgraphs[v]){return subgraphs[v].vs}return v}),true)})}function mergeBarycenters(target,other){if(!_.isUndefined(target.barycenter)){target.barycenter=(target.barycenter*target.weight+other.barycenter*other.weight)/(target.weight+other.weight);target.weight+=other.weight}else{target.barycenter=other.barycenter;target.weight=other.weight}}},{"../lodash":36,"./barycenter":40,"./resolve-conflicts":45,"./sort":47}],47:[function(require,module,exports){var _=require("../lodash"),util=require("../util");module.exports=sort;function sort(entries,biasRight){var parts=util.partition(entries,function(entry){return _.has(entry,"barycenter")});var sortable=parts.lhs,unsortable=_.sortBy(parts.rhs,function(entry){return-entry.i}),vs=[],sum=0,weight=0,vsIndex=0;sortable.sort(compareWithBias(!!biasRight));vsIndex=consumeUnsortable(vs,unsortable,vsIndex);_.each(sortable,function(entry){vsIndex+=entry.vs.length;vs.push(entry.vs);sum+=entry.barycenter*entry.weight;weight+=entry.weight;vsIndex=consumeUnsortable(vs,unsortable,vsIndex)});var result={vs:_.flatten(vs,true)};if(weight){result.barycenter=sum/weight;result.weight=weight}return result}function consumeUnsortable(vs,unsortable,index){var last;while(unsortable.length&&(last=_.last(unsortable)).i<=index){unsortable.pop();vs.push(last.vs);index++}return index}function compareWithBias(bias){return function(entryV,entryW){if(entryV.barycenter<entryW.barycenter){return-1}else if(entryV.barycenter>entryW.barycenter){return 1}return!bias?entryV.i-entryW.i:entryW.i-entryV.i}}},{"../lodash":36,"../util":55}],48:[function(require,module,exports){var _=require("./lodash");module.exports=parentDummyChains;function parentDummyChains(g){var postorderNums=postorder(g);_.each(g.graph().dummyChains,function(v){var node=g.node(v),edgeObj=node.edgeObj,pathData=findPath(g,postorderNums,edgeObj.v,edgeObj.w),path=pathData.path,lca=pathData.lca,pathIdx=0,pathV=path[pathIdx],ascending=true;while(v!==edgeObj.w){node=g.node(v);if(ascending){while((pathV=path[pathIdx])!==lca&&g.node(pathV).maxRank<node.rank){pathIdx++}if(pathV===lca){ascending=false}}if(!ascending){while(pathIdx<path.length-1&&g.node(pathV=path[pathIdx+1]).minRank<=node.rank){pathIdx++}pathV=path[pathIdx]}g.setParent(v,pathV);v=g.successors(v)[0]}})}function findPath(g,postorderNums,v,w){var vPath=[],wPath=[],low=Math.min(postorderNums[v].low,postorderNums[w].low),lim=Math.max(postorderNums[v].lim,postorderNums[w].lim),parent,lca;parent=v;do{parent=g.parent(parent);vPath.push(parent)}while(parent&&(postorderNums[parent].low>low||lim>postorderNums[parent].lim));lca=parent;parent=w;while((parent=g.parent(parent))!==lca){wPath.push(parent)}return{path:vPath.concat(wPath.reverse()),lca:lca}}function postorder(g){var result={},lim=0;function dfs(v){var low=lim;_.each(g.children(v),dfs);result[v]={low:low,lim:lim++}}_.each(g.children(),dfs);return result}},{"./lodash":36}],49:[function(require,module,exports){"use strict";var _=require("../lodash"),Graph=require("../graphlib").Graph,util=require("../util");module.exports={positionX:positionX,findType1Conflicts:findType1Conflicts,findType2Conflicts:findType2Conflicts,addConflict:addConflict,hasConflict:hasConflict,verticalAlignment:verticalAlignment,horizontalCompaction:horizontalCompaction,alignCoordinates:alignCoordinates,findSmallestWidthAlignment:findSmallestWidthAlignment,balance:balance};function findType1Conflicts(g,layering){var conflicts={};function visitLayer(prevLayer,layer){var k0=0,scanPos=0,prevLayerLength=prevLayer.length,lastNode=_.last(layer);_.each(layer,function(v,i){var w=findOtherInnerSegmentNode(g,v),k1=w?g.node(w).order:prevLayerLength;if(w||v===lastNode){_.each(layer.slice(scanPos,i+1),function(scanNode){_.each(g.predecessors(scanNode),function(u){var uLabel=g.node(u),uPos=uLabel.order;if((uPos<k0||k1<uPos)&&!(uLabel.dummy&&g.node(scanNode).dummy)){addConflict(conflicts,u,scanNode)}})});scanPos=i+1;k0=k1}});return layer}_.reduce(layering,visitLayer);return conflicts}function findType2Conflicts(g,layering){var conflicts={};function scan(south,southPos,southEnd,prevNorthBorder,nextNorthBorder){var v;_.each(_.range(southPos,southEnd),function(i){v=south[i];if(g.node(v).dummy){_.each(g.predecessors(v),function(u){var uNode=g.node(u);if(uNode.dummy&&(uNode.order<prevNorthBorder||uNode.order>nextNorthBorder)){addConflict(conflicts,u,v)}})}})}function visitLayer(north,south){var prevNorthPos=-1,nextNorthPos,southPos=0;_.each(south,function(v,southLookahead){if(g.node(v).dummy==="border"){var predecessors=g.predecessors(v);if(predecessors.length){nextNorthPos=g.node(predecessors[0]).order;scan(south,southPos,southLookahead,prevNorthPos,nextNorthPos);southPos=southLookahead;prevNorthPos=nextNorthPos}}scan(south,southPos,south.length,nextNorthPos,north.length)});return south}_.reduce(layering,visitLayer);return conflicts}function findOtherInnerSegmentNode(g,v){if(g.node(v).dummy){return _.find(g.predecessors(v),function(u){return g.node(u).dummy})}}function addConflict(conflicts,v,w){if(v>w){var tmp=v;v=w;w=tmp}var conflictsV=conflicts[v];if(!conflictsV){conflicts[v]=conflictsV={}}conflictsV[w]=true}function hasConflict(conflicts,v,w){if(v>w){var tmp=v;v=w;w=tmp}return _.has(conflicts[v],w)}function verticalAlignment(g,layering,conflicts,neighborFn){var root={},align={},pos={};_.each(layering,function(layer){_.each(layer,function(v,order){root[v]=v;align[v]=v;pos[v]=order})});_.each(layering,function(layer){var prevIdx=-1;_.each(layer,function(v){var ws=neighborFn(v);if(ws.length){ws=_.sortBy(ws,function(w){return pos[w]});var mp=(ws.length-1)/2;for(var i=Math.floor(mp),il=Math.ceil(mp);i<=il;++i){var w=ws[i];if(align[v]===v&&prevIdx<pos[w]&&!hasConflict(conflicts,v,w)){align[w]=v;align[v]=root[v]=root[w];prevIdx=pos[w]}}}})});return{root:root,align:align}}function horizontalCompaction(g,layering,root,align,reverseSep){var xs={},blockG=buildBlockGraph(g,layering,root,reverseSep);var visited={};function pass1(v){if(!_.has(visited,v)){visited[v]=true;xs[v]=_.reduce(blockG.inEdges(v),function(max,e){pass1(e.v);return Math.max(max,xs[e.v]+blockG.edge(e))},0)}}_.each(blockG.nodes(),pass1);function pass2(v){if(visited[v]!==2){visited[v]++;var min=_.reduce(blockG.outEdges(v),function(min,e){pass2(e.w);return Math.min(min,xs[e.w]-blockG.edge(e))},Number.POSITIVE_INFINITY);if(min!==Number.POSITIVE_INFINITY){xs[v]=Math.max(xs[v],min)}}}_.each(blockG.nodes(),pass2);_.each(align,function(v){xs[v]=xs[root[v]]});return xs}function buildBlockGraph(g,layering,root,reverseSep){var blockGraph=new Graph,graphLabel=g.graph(),sepFn=sep(graphLabel.nodesep,graphLabel.edgesep,reverseSep);_.each(layering,function(layer){var u;_.each(layer,function(v){var vRoot=root[v];blockGraph.setNode(vRoot);if(u){var uRoot=root[u],prevMax=blockGraph.edge(uRoot,vRoot);blockGraph.setEdge(uRoot,vRoot,Math.max(sepFn(g,v,u),prevMax||0))}u=v})});return blockGraph}function findSmallestWidthAlignment(g,xss){return _.min(xss,function(xs){var min=_.min(xs,function(x,v){return x-width(g,v)/2}),max=_.max(xs,function(x,v){return x+width(g,v)/2});return max-min})}function alignCoordinates(xss,alignTo){var alignToMin=_.min(alignTo),alignToMax=_.max(alignTo);_.each(["u","d"],function(vert){_.each(["l","r"],function(horiz){var alignment=vert+horiz,xs=xss[alignment],delta;if(xs===alignTo)return;delta=horiz==="l"?alignToMin-_.min(xs):alignToMax-_.max(xs);if(delta){xss[alignment]=_.mapValues(xs,function(x){return x+delta})}})})}function balance(xss,align){return _.mapValues(xss.ul,function(ignore,v){if(align){return xss[align.toLowerCase()][v]}else{var xs=_.sortBy(_.pluck(xss,v));return(xs[1]+xs[2])/2}})}function positionX(g){var layering=util.buildLayerMatrix(g),conflicts=_.merge(findType1Conflicts(g,layering),findType2Conflicts(g,layering));var xss={},adjustedLayering;_.each(["u","d"],function(vert){adjustedLayering=vert==="u"?layering:_.values(layering).reverse();_.each(["l","r"],function(horiz){if(horiz==="r"){adjustedLayering=_.map(adjustedLayering,function(inner){return _.values(inner).reverse()})}var neighborFn=_.bind(vert==="u"?g.predecessors:g.successors,g);var align=verticalAlignment(g,adjustedLayering,conflicts,neighborFn);var xs=horizontalCompaction(g,adjustedLayering,align.root,align.align,horiz==="r");if(horiz==="r"){xs=_.mapValues(xs,function(x){return-x})}xss[vert+horiz]=xs})});var smallestWidth=findSmallestWidthAlignment(g,xss);alignCoordinates(xss,smallestWidth);return balance(xss,g.graph().align)}function sep(nodeSep,edgeSep,reverseSep){return function(g,v,w){var vLabel=g.node(v),wLabel=g.node(w),sum=0,delta;sum+=vLabel.width/2;if(_.has(vLabel,"labelpos")){switch(vLabel.labelpos.toLowerCase()){case"l":delta=-vLabel.width/2;break;case"r":delta=vLabel.width/2;break}}if(delta){sum+=reverseSep?delta:-delta}delta=0;sum+=(vLabel.dummy?edgeSep:nodeSep)/2;sum+=(wLabel.dummy?edgeSep:nodeSep)/2;sum+=wLabel.width/2;if(_.has(wLabel,"labelpos")){switch(wLabel.labelpos.toLowerCase()){case"l":delta=wLabel.width/2;break;case"r":delta=-wLabel.width/2;break}}if(delta){sum+=reverseSep?delta:-delta}delta=0;return sum}}function width(g,v){return g.node(v).width}},{"../graphlib":33,"../lodash":36,"../util":55}],50:[function(require,module,exports){"use strict";var _=require("../lodash"),util=require("../util"),positionX=require("./bk").positionX;module.exports=position;function position(g){g=util.asNonCompoundGraph(g);positionY(g);_.each(positionX(g),function(x,v){g.node(v).x=x})}function positionY(g){var layering=util.buildLayerMatrix(g),rankSep=g.graph().ranksep,prevY=0;_.each(layering,function(layer){var maxHeight=_.max(_.map(layer,function(v){return g.node(v).height}));_.each(layer,function(v){g.node(v).y=prevY+maxHeight/2});prevY+=maxHeight+rankSep})}},{"../lodash":36,"../util":55,"./bk":49}],51:[function(require,module,exports){"use strict";var _=require("../lodash"),Graph=require("../graphlib").Graph,slack=require("./util").slack;module.exports=feasibleTree;function feasibleTree(g){var t=new Graph({directed:false});var start=g.nodes()[0],size=g.nodeCount();t.setNode(start,{});var edge,delta;while(tightTree(t,g)<size){edge=findMinSlackEdge(t,g);delta=t.hasNode(edge.v)?slack(g,edge):-slack(g,edge);shiftRanks(t,g,delta)}return t}function tightTree(t,g){function dfs(v){_.each(g.nodeEdges(v),function(e){var edgeV=e.v,w=v===edgeV?e.w:edgeV;if(!t.hasNode(w)&&!slack(g,e)){t.setNode(w,{});t.setEdge(v,w,{});dfs(w)}})}_.each(t.nodes(),dfs);return t.nodeCount()}function findMinSlackEdge(t,g){return _.min(g.edges(),function(e){if(t.hasNode(e.v)!==t.hasNode(e.w)){return slack(g,e)}})}function shiftRanks(t,g,delta){_.each(t.nodes(),function(v){g.node(v).rank+=delta})}},{"../graphlib":33,"../lodash":36,"./util":54}],52:[function(require,module,exports){"use strict";var rankUtil=require("./util"),longestPath=rankUtil.longestPath,feasibleTree=require("./feasible-tree"),networkSimplex=require("./network-simplex");module.exports=rank;function rank(g){switch(g.graph().ranker){case"network-simplex":networkSimplexRanker(g);break;case"tight-tree":tightTreeRanker(g);break;case"longest-path":longestPathRanker(g);break;default:networkSimplexRanker(g)}}var longestPathRanker=longestPath;function tightTreeRanker(g){longestPath(g);feasibleTree(g)}function networkSimplexRanker(g){networkSimplex(g)}},{"./feasible-tree":51,"./network-simplex":53,"./util":54}],53:[function(require,module,exports){"use strict";var _=require("../lodash"),feasibleTree=require("./feasible-tree"),slack=require("./util").slack,initRank=require("./util").longestPath,preorder=require("../graphlib").alg.preorder,postorder=require("../graphlib").alg.postorder,simplify=require("../util").simplify;module.exports=networkSimplex;networkSimplex.initLowLimValues=initLowLimValues;networkSimplex.initCutValues=initCutValues;networkSimplex.calcCutValue=calcCutValue;networkSimplex.leaveEdge=leaveEdge;networkSimplex.enterEdge=enterEdge;networkSimplex.exchangeEdges=exchangeEdges;function networkSimplex(g){g=simplify(g);initRank(g);var t=feasibleTree(g);initLowLimValues(t);initCutValues(t,g);var e,f;while(e=leaveEdge(t)){f=enterEdge(t,g,e);exchangeEdges(t,g,e,f)}}function initCutValues(t,g){var vs=postorder(t,t.nodes());vs=vs.slice(0,vs.length-1);_.each(vs,function(v){assignCutValue(t,g,v)})}function assignCutValue(t,g,child){var childLab=t.node(child),parent=childLab.parent;t.edge(child,parent).cutvalue=calcCutValue(t,g,child)}function calcCutValue(t,g,child){var childLab=t.node(child),parent=childLab.parent,childIsTail=true,graphEdge=g.edge(child,parent),cutValue=0;if(!graphEdge){childIsTail=false;graphEdge=g.edge(parent,child)}cutValue=graphEdge.weight;_.each(g.nodeEdges(child),function(e){var isOutEdge=e.v===child,other=isOutEdge?e.w:e.v;if(other!==parent){var pointsToHead=isOutEdge===childIsTail,otherWeight=g.edge(e).weight;cutValue+=pointsToHead?otherWeight:-otherWeight;if(isTreeEdge(t,child,other)){var otherCutValue=t.edge(child,other).cutvalue;cutValue+=pointsToHead?-otherCutValue:otherCutValue}}});return cutValue}function initLowLimValues(tree,root){if(arguments.length<2){root=tree.nodes()[0]}dfsAssignLowLim(tree,{},1,root)}function dfsAssignLowLim(tree,visited,nextLim,v,parent){var low=nextLim,label=tree.node(v);visited[v]=true;_.each(tree.neighbors(v),function(w){if(!_.has(visited,w)){nextLim=dfsAssignLowLim(tree,visited,nextLim,w,v)}});label.low=low;label.lim=nextLim++;if(parent){label.parent=parent}else{delete label.parent}return nextLim}function leaveEdge(tree){return _.find(tree.edges(),function(e){return tree.edge(e).cutvalue<0})}function enterEdge(t,g,edge){var v=edge.v,w=edge.w;
 
 if(!g.hasEdge(v,w)){v=edge.w;w=edge.v}var vLabel=t.node(v),wLabel=t.node(w),tailLabel=vLabel,flip=false;if(vLabel.lim>wLabel.lim){tailLabel=wLabel;flip=true}var candidates=_.filter(g.edges(),function(edge){return flip===isDescendant(t,t.node(edge.v),tailLabel)&&flip!==isDescendant(t,t.node(edge.w),tailLabel)});return _.min(candidates,function(edge){return slack(g,edge)})}function exchangeEdges(t,g,e,f){var v=e.v,w=e.w;t.removeEdge(v,w);t.setEdge(f.v,f.w,{});initLowLimValues(t);initCutValues(t,g);updateRanks(t,g)}function updateRanks(t,g){var root=_.find(t.nodes(),function(v){return!g.node(v).parent}),vs=preorder(t,root);vs=vs.slice(1);_.each(vs,function(v){var parent=t.node(v).parent,edge=g.edge(v,parent),flipped=false;if(!edge){edge=g.edge(parent,v);flipped=true}g.node(v).rank=g.node(parent).rank+(flipped?edge.minlen:-edge.minlen)})}function isTreeEdge(tree,u,v){return tree.hasEdge(u,v)}function isDescendant(tree,vLabel,rootLabel){return rootLabel.low<=vLabel.lim&&vLabel.lim<=rootLabel.lim}},{"../graphlib":33,"../lodash":36,"../util":55,"./feasible-tree":51,"./util":54}],54:[function(require,module,exports){"use strict";var _=require("../lodash");module.exports={longestPath:longestPath,slack:slack};function longestPath(g){var visited={};function dfs(v){var label=g.node(v);if(_.has(visited,v)){return label.rank}visited[v]=true;var rank=_.min(_.map(g.outEdges(v),function(e){return dfs(e.w)-g.edge(e).minlen}));if(rank===Number.POSITIVE_INFINITY){rank=0}return label.rank=rank}_.each(g.sources(),dfs)}function slack(g,e){return g.node(e.w).rank-g.node(e.v).rank-g.edge(e).minlen}},{"../lodash":36}],55:[function(require,module,exports){"use strict";var _=require("./lodash"),Graph=require("./graphlib").Graph;module.exports={addDummyNode:addDummyNode,simplify:simplify,asNonCompoundGraph:asNonCompoundGraph,successorWeights:successorWeights,predecessorWeights:predecessorWeights,intersectRect:intersectRect,buildLayerMatrix:buildLayerMatrix,normalizeRanks:normalizeRanks,removeEmptyRanks:removeEmptyRanks,addBorderNode:addBorderNode,maxRank:maxRank,partition:partition,time:time,notime:notime};function addDummyNode(g,type,attrs,name){var v;do{v=_.uniqueId(name)}while(g.hasNode(v));attrs.dummy=type;g.setNode(v,attrs);return v}function simplify(g){var simplified=(new Graph).setGraph(g.graph());_.each(g.nodes(),function(v){simplified.setNode(v,g.node(v))});_.each(g.edges(),function(e){var simpleLabel=simplified.edge(e.v,e.w)||{weight:0,minlen:1},label=g.edge(e);simplified.setEdge(e.v,e.w,{weight:simpleLabel.weight+label.weight,minlen:Math.max(simpleLabel.minlen,label.minlen)})});return simplified}function asNonCompoundGraph(g){var simplified=new Graph({multigraph:g.isMultigraph()}).setGraph(g.graph());_.each(g.nodes(),function(v){if(!g.children(v).length){simplified.setNode(v,g.node(v))}});_.each(g.edges(),function(e){simplified.setEdge(e,g.edge(e))});return simplified}function successorWeights(g){var weightMap=_.map(g.nodes(),function(v){var sucs={};_.each(g.outEdges(v),function(e){sucs[e.w]=(sucs[e.w]||0)+g.edge(e).weight});return sucs});return _.zipObject(g.nodes(),weightMap)}function predecessorWeights(g){var weightMap=_.map(g.nodes(),function(v){var preds={};_.each(g.inEdges(v),function(e){preds[e.v]=(preds[e.v]||0)+g.edge(e).weight});return preds});return _.zipObject(g.nodes(),weightMap)}function intersectRect(rect,point){var x=rect.x;var y=rect.y;var dx=point.x-x;var dy=point.y-y;var w=rect.width/2;var h=rect.height/2;if(!dx&&!dy){throw new Error("Not possible to find intersection inside of the rectangle")}var sx,sy;if(Math.abs(dy)*w>Math.abs(dx)*h){if(dy<0){h=-h}sx=h*dx/dy;sy=h}else{if(dx<0){w=-w}sx=w;sy=w*dy/dx}return{x:x+sx,y:y+sy}}function buildLayerMatrix(g){var layering=_.map(_.range(maxRank(g)+1),function(){return[]});_.each(g.nodes(),function(v){var node=g.node(v),rank=node.rank;if(!_.isUndefined(rank)){layering[rank][node.order]=v}});return layering}function normalizeRanks(g){var min=_.min(_.map(g.nodes(),function(v){return g.node(v).rank}));_.each(g.nodes(),function(v){var node=g.node(v);if(_.has(node,"rank")){node.rank-=min}})}function removeEmptyRanks(g){var offset=_.min(_.map(g.nodes(),function(v){return g.node(v).rank}));var layers=[];_.each(g.nodes(),function(v){var rank=g.node(v).rank-offset;if(!_.has(layers,rank)){layers[rank]=[]}layers[rank].push(v)});var delta=0,nodeRankFactor=g.graph().nodeRankFactor;_.each(layers,function(vs,i){if(_.isUndefined(vs)&&i%nodeRankFactor!==0){--delta}else if(delta){_.each(vs,function(v){g.node(v).rank+=delta})}})}function addBorderNode(g,prefix,rank,order){var node={width:0,height:0};if(arguments.length>=4){node.rank=rank;node.order=order}return addDummyNode(g,"border",node,prefix)}function maxRank(g){return _.max(_.map(g.nodes(),function(v){var rank=g.node(v).rank;if(!_.isUndefined(rank)){return rank}}))}function partition(collection,fn){var result={lhs:[],rhs:[]};_.each(collection,function(value){if(fn(value)){result.lhs.push(value)}else{result.rhs.push(value)}});return result}function time(name,fn){var start=_.now();try{return fn()}finally{console.log(name+" time: "+(_.now()-start)+"ms")}}function notime(name,fn){return fn()}},{"./graphlib":33,"./lodash":36}],56:[function(require,module,exports){module.exports="0.7.1"},{}],57:[function(require,module,exports){var lib=require("./lib");module.exports={Graph:lib.Graph,json:require("./lib/json"),alg:require("./lib/alg"),version:lib.version}},{"./lib":73,"./lib/alg":64,"./lib/json":74}],58:[function(require,module,exports){var _=require("../lodash");module.exports=components;function components(g){var visited={},cmpts=[],cmpt;function dfs(v){if(_.has(visited,v))return;visited[v]=true;cmpt.push(v);_.each(g.successors(v),dfs);_.each(g.predecessors(v),dfs)}_.each(g.nodes(),function(v){cmpt=[];dfs(v);if(cmpt.length){cmpts.push(cmpt)}});return cmpts}},{"../lodash":75}],59:[function(require,module,exports){var _=require("../lodash");module.exports=dfs;function dfs(g,vs,order){if(!_.isArray(vs)){vs=[vs]}var acc=[],visited={};_.each(vs,function(v){if(!g.hasNode(v)){throw new Error("Graph does not have node: "+v)}doDfs(g,v,order==="post",visited,acc)});return acc}function doDfs(g,v,postorder,visited,acc){if(!_.has(visited,v)){visited[v]=true;if(!postorder){acc.push(v)}_.each(g.neighbors(v),function(w){doDfs(g,w,postorder,visited,acc)});if(postorder){acc.push(v)}}}},{"../lodash":75}],60:[function(require,module,exports){var dijkstra=require("./dijkstra"),_=require("../lodash");module.exports=dijkstraAll;function dijkstraAll(g,weightFunc,edgeFunc){return _.transform(g.nodes(),function(acc,v){acc[v]=dijkstra(g,v,weightFunc,edgeFunc)},{})}},{"../lodash":75,"./dijkstra":61}],61:[function(require,module,exports){var _=require("../lodash"),PriorityQueue=require("../data/priority-queue");module.exports=dijkstra;var DEFAULT_WEIGHT_FUNC=_.constant(1);function dijkstra(g,source,weightFn,edgeFn){return runDijkstra(g,String(source),weightFn||DEFAULT_WEIGHT_FUNC,edgeFn||function(v){return g.outEdges(v)})}function runDijkstra(g,source,weightFn,edgeFn){var results={},pq=new PriorityQueue,v,vEntry;var updateNeighbors=function(edge){var w=edge.v!==v?edge.v:edge.w,wEntry=results[w],weight=weightFn(edge),distance=vEntry.distance+weight;if(weight<0){throw new Error("dijkstra does not allow negative edge weights. "+"Bad edge: "+edge+" Weight: "+weight)}if(distance<wEntry.distance){wEntry.distance=distance;wEntry.predecessor=v;pq.decrease(w,distance)}};g.nodes().forEach(function(v){var distance=v===source?0:Number.POSITIVE_INFINITY;results[v]={distance:distance};pq.add(v,distance)});while(pq.size()>0){v=pq.removeMin();vEntry=results[v];if(vEntry.distance===Number.POSITIVE_INFINITY){break}edgeFn(v).forEach(updateNeighbors)}return results}},{"../data/priority-queue":71,"../lodash":75}],62:[function(require,module,exports){var _=require("../lodash"),tarjan=require("./tarjan");module.exports=findCycles;function findCycles(g){return _.filter(tarjan(g),function(cmpt){return cmpt.length>1})}},{"../lodash":75,"./tarjan":69}],63:[function(require,module,exports){var _=require("../lodash");module.exports=floydWarshall;var DEFAULT_WEIGHT_FUNC=_.constant(1);function floydWarshall(g,weightFn,edgeFn){return runFloydWarshall(g,weightFn||DEFAULT_WEIGHT_FUNC,edgeFn||function(v){return g.outEdges(v)})}function runFloydWarshall(g,weightFn,edgeFn){var results={},nodes=g.nodes();nodes.forEach(function(v){results[v]={};results[v][v]={distance:0};nodes.forEach(function(w){if(v!==w){results[v][w]={distance:Number.POSITIVE_INFINITY}}});edgeFn(v).forEach(function(edge){var w=edge.v===v?edge.w:edge.v,d=weightFn(edge);results[v][w]={distance:d,predecessor:v}})});nodes.forEach(function(k){var rowK=results[k];nodes.forEach(function(i){var rowI=results[i];nodes.forEach(function(j){var ik=rowI[k];var kj=rowK[j];var ij=rowI[j];var altDistance=ik.distance+kj.distance;if(altDistance<ij.distance){ij.distance=altDistance;ij.predecessor=kj.predecessor}})})});return results}},{"../lodash":75}],64:[function(require,module,exports){module.exports={components:require("./components"),dijkstra:require("./dijkstra"),dijkstraAll:require("./dijkstra-all"),findCycles:require("./find-cycles"),floydWarshall:require("./floyd-warshall"),isAcyclic:require("./is-acyclic"),postorder:require("./postorder"),preorder:require("./preorder"),prim:require("./prim"),tarjan:require("./tarjan"),topsort:require("./topsort")}},{"./components":58,"./dijkstra":61,"./dijkstra-all":60,"./find-cycles":62,"./floyd-warshall":63,"./is-acyclic":65,"./postorder":66,"./preorder":67,"./prim":68,"./tarjan":69,"./topsort":70}],65:[function(require,module,exports){var topsort=require("./topsort");module.exports=isAcyclic;function isAcyclic(g){try{topsort(g)}catch(e){if(e instanceof topsort.CycleException){return false}throw e}return true}},{"./topsort":70}],66:[function(require,module,exports){var dfs=require("./dfs");module.exports=postorder;function postorder(g,vs){return dfs(g,vs,"post")}},{"./dfs":59}],67:[function(require,module,exports){var dfs=require("./dfs");module.exports=preorder;function preorder(g,vs){return dfs(g,vs,"pre")}},{"./dfs":59}],68:[function(require,module,exports){var _=require("../lodash"),Graph=require("../graph"),PriorityQueue=require("../data/priority-queue");module.exports=prim;function prim(g,weightFunc){var result=new Graph,parents={},pq=new PriorityQueue,v;function updateNeighbors(edge){var w=edge.v===v?edge.w:edge.v,pri=pq.priority(w);if(pri!==undefined){var edgeWeight=weightFunc(edge);if(edgeWeight<pri){parents[w]=v;pq.decrease(w,edgeWeight)}}}if(g.nodeCount()===0){return result}_.each(g.nodes(),function(v){pq.add(v,Number.POSITIVE_INFINITY);result.setNode(v)});pq.decrease(g.nodes()[0],0);var init=false;while(pq.size()>0){v=pq.removeMin();if(_.has(parents,v)){result.setEdge(v,parents[v])}else if(init){throw new Error("Input graph is not connected: "+g)}else{init=true}g.nodeEdges(v).forEach(updateNeighbors)}return result}},{"../data/priority-queue":71,"../graph":72,"../lodash":75}],69:[function(require,module,exports){var _=require("../lodash");module.exports=tarjan;function tarjan(g){var index=0,stack=[],visited={},results=[];function dfs(v){var entry=visited[v]={onStack:true,lowlink:index,index:index++};stack.push(v);g.successors(v).forEach(function(w){if(!_.has(visited,w)){dfs(w);entry.lowlink=Math.min(entry.lowlink,visited[w].lowlink)}else if(visited[w].onStack){entry.lowlink=Math.min(entry.lowlink,visited[w].index)}});if(entry.lowlink===entry.index){var cmpt=[],w;do{w=stack.pop();visited[w].onStack=false;cmpt.push(w)}while(v!==w);results.push(cmpt)}}g.nodes().forEach(function(v){if(!_.has(visited,v)){dfs(v)}});return results}},{"../lodash":75}],70:[function(require,module,exports){var _=require("../lodash");module.exports=topsort;topsort.CycleException=CycleException;function topsort(g){var visited={},stack={},results=[];function visit(node){if(_.has(stack,node)){throw new CycleException}if(!_.has(visited,node)){stack[node]=true;visited[node]=true;_.each(g.predecessors(node),visit);delete stack[node];results.push(node)}}_.each(g.sinks(),visit);if(_.size(visited)!==g.nodeCount()){throw new CycleException}return results}function CycleException(){}},{"../lodash":75}],71:[function(require,module,exports){var _=require("../lodash");module.exports=PriorityQueue;function PriorityQueue(){this._arr=[];this._keyIndices={}}PriorityQueue.prototype.size=function(){return this._arr.length};PriorityQueue.prototype.keys=function(){return this._arr.map(function(x){return x.key})};PriorityQueue.prototype.has=function(key){return _.has(this._keyIndices,key)};PriorityQueue.prototype.priority=function(key){var index=this._keyIndices[key];if(index!==undefined){return this._arr[index].priority}};PriorityQueue.prototype.min=function(){if(this.size()===0){throw new Error("Queue underflow")}return this._arr[0].key};PriorityQueue.prototype.add=function(key,priority){var keyIndices=this._keyIndices;key=String(key);if(!_.has(keyIndices,key)){var arr=this._arr;var index=arr.length;keyIndices[key]=index;arr.push({key:key,priority:priority});this._decrease(index);return true}return false};PriorityQueue.prototype.removeMin=function(){this._swap(0,this._arr.length-1);var min=this._arr.pop();delete this._keyIndices[min.key];this._heapify(0);return min.key};PriorityQueue.prototype.decrease=function(key,priority){var index=this._keyIndices[key];if(priority>this._arr[index].priority){throw new Error("New priority is greater than current priority. "+"Key: "+key+" Old: "+this._arr[index].priority+" New: "+priority)}this._arr[index].priority=priority;this._decrease(index)};PriorityQueue.prototype._heapify=function(i){var arr=this._arr;var l=2*i,r=l+1,largest=i;if(l<arr.length){largest=arr[l].priority<arr[largest].priority?l:largest;if(r<arr.length){largest=arr[r].priority<arr[largest].priority?r:largest}if(largest!==i){this._swap(i,largest);this._heapify(largest)}}};PriorityQueue.prototype._decrease=function(index){var arr=this._arr;var priority=arr[index].priority;var parent;while(index!==0){parent=index>>1;if(arr[parent].priority<priority){break}this._swap(index,parent);index=parent}};PriorityQueue.prototype._swap=function(i,j){var arr=this._arr;var keyIndices=this._keyIndices;var origArrI=arr[i];var origArrJ=arr[j];arr[i]=origArrJ;arr[j]=origArrI;keyIndices[origArrJ.key]=i;keyIndices[origArrI.key]=j}},{"../lodash":75}],72:[function(require,module,exports){"use strict";var _=require("./lodash");module.exports=Graph;var DEFAULT_EDGE_NAME="\x00",GRAPH_NODE="\x00",EDGE_KEY_DELIM="";function Graph(opts){this._isDirected=_.has(opts,"directed")?opts.directed:true;this._isMultigraph=_.has(opts,"multigraph")?opts.multigraph:false;this._isCompound=_.has(opts,"compound")?opts.compound:false;this._label=undefined;this._defaultNodeLabelFn=_.constant(undefined);this._defaultEdgeLabelFn=_.constant(undefined);this._nodes={};if(this._isCompound){this._parent={};this._children={};this._children[GRAPH_NODE]={}}this._in={};this._preds={};this._out={};this._sucs={};this._edgeObjs={};this._edgeLabels={}}Graph.prototype._nodeCount=0;Graph.prototype._edgeCount=0;Graph.prototype.isDirected=function(){return this._isDirected};Graph.prototype.isMultigraph=function(){return this._isMultigraph};Graph.prototype.isCompound=function(){return this._isCompound};Graph.prototype.setGraph=function(label){this._label=label;return this};Graph.prototype.graph=function(){return this._label};Graph.prototype.setDefaultNodeLabel=function(newDefault){if(!_.isFunction(newDefault)){newDefault=_.constant(newDefault)}this._defaultNodeLabelFn=newDefault;return this};Graph.prototype.nodeCount=function(){return this._nodeCount};Graph.prototype.nodes=function(){return _.keys(this._nodes)};Graph.prototype.sources=function(){return _.filter(this.nodes(),function(v){return _.isEmpty(this._in[v])},this)};Graph.prototype.sinks=function(){return _.filter(this.nodes(),function(v){return _.isEmpty(this._out[v])},this)};Graph.prototype.setNodes=function(vs,value){var args=arguments;_.each(vs,function(v){if(args.length>1){this.setNode(v,value)}else{this.setNode(v)}},this);return this};Graph.prototype.setNode=function(v,value){if(_.has(this._nodes,v)){if(arguments.length>1){this._nodes[v]=value}return this}this._nodes[v]=arguments.length>1?value:this._defaultNodeLabelFn(v);if(this._isCompound){this._parent[v]=GRAPH_NODE;this._children[v]={};this._children[GRAPH_NODE][v]=true}this._in[v]={};this._preds[v]={};this._out[v]={};this._sucs[v]={};++this._nodeCount;return this};Graph.prototype.node=function(v){return this._nodes[v]};Graph.prototype.hasNode=function(v){return _.has(this._nodes,v)};Graph.prototype.removeNode=function(v){var self=this;if(_.has(this._nodes,v)){var removeEdge=function(e){self.removeEdge(self._edgeObjs[e])};delete this._nodes[v];if(this._isCompound){this._removeFromParentsChildList(v);delete this._parent[v];_.each(this.children(v),function(child){this.setParent(child)},this);delete this._children[v]}_.each(_.keys(this._in[v]),removeEdge);delete this._in[v];delete this._preds[v];_.each(_.keys(this._out[v]),removeEdge);delete this._out[v];delete this._sucs[v];--this._nodeCount}return this};Graph.prototype.setParent=function(v,parent){if(!this._isCompound){throw new Error("Cannot set parent in a non-compound graph")}if(_.isUndefined(parent)){parent=GRAPH_NODE}else{for(var ancestor=parent;!_.isUndefined(ancestor);ancestor=this.parent(ancestor)){if(ancestor===v){throw new Error("Setting "+parent+" as parent of "+v+" would create create a cycle")}}this.setNode(parent)}this.setNode(v);this._removeFromParentsChildList(v);this._parent[v]=parent;this._children[parent][v]=true;return this};Graph.prototype._removeFromParentsChildList=function(v){delete this._children[this._parent[v]][v]};Graph.prototype.parent=function(v){if(this._isCompound){var parent=this._parent[v];if(parent!==GRAPH_NODE){return parent}}};Graph.prototype.children=function(v){if(_.isUndefined(v)){v=GRAPH_NODE}if(this._isCompound){var children=this._children[v];if(children){return _.keys(children)}}else if(v===GRAPH_NODE){return this.nodes()}else if(this.hasNode(v)){return[]}};Graph.prototype.predecessors=function(v){var predsV=this._preds[v];if(predsV){return _.keys(predsV)}};Graph.prototype.successors=function(v){var sucsV=this._sucs[v];if(sucsV){return _.keys(sucsV)}};Graph.prototype.neighbors=function(v){var preds=this.predecessors(v);if(preds){return _.union(preds,this.successors(v))}};Graph.prototype.setDefaultEdgeLabel=function(newDefault){if(!_.isFunction(newDefault)){newDefault=_.constant(newDefault)}this._defaultEdgeLabelFn=newDefault;return this};Graph.prototype.edgeCount=function(){return this._edgeCount};Graph.prototype.edges=function(){return _.values(this._edgeObjs)};Graph.prototype.setPath=function(vs,value){var self=this,args=arguments;_.reduce(vs,function(v,w){if(args.length>1){self.setEdge(v,w,value)}else{self.setEdge(v,w)}return w});return this};Graph.prototype.setEdge=function(){var v,w,name,value,valueSpecified=false;if(_.isPlainObject(arguments[0])){v=arguments[0].v;w=arguments[0].w;name=arguments[0].name;if(arguments.length===2){value=arguments[1];valueSpecified=true}}else{v=arguments[0];w=arguments[1];name=arguments[3];if(arguments.length>2){value=arguments[2];valueSpecified=true}}v=""+v;w=""+w;if(!_.isUndefined(name)){name=""+name}var e=edgeArgsToId(this._isDirected,v,w,name);if(_.has(this._edgeLabels,e)){if(valueSpecified){this._edgeLabels[e]=value}return this}if(!_.isUndefined(name)&&!this._isMultigraph){throw new Error("Cannot set a named edge when isMultigraph = false")}this.setNode(v);this.setNode(w);this._edgeLabels[e]=valueSpecified?value:this._defaultEdgeLabelFn(v,w,name);var edgeObj=edgeArgsToObj(this._isDirected,v,w,name);v=edgeObj.v;w=edgeObj.w;Object.freeze(edgeObj);this._edgeObjs[e]=edgeObj;incrementOrInitEntry(this._preds[w],v);incrementOrInitEntry(this._sucs[v],w);this._in[w][e]=edgeObj;this._out[v][e]=edgeObj;this._edgeCount++;return this};Graph.prototype.edge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name);return this._edgeLabels[e]};Graph.prototype.hasEdge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name);return _.has(this._edgeLabels,e)};Graph.prototype.removeEdge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name),edge=this._edgeObjs[e];if(edge){v=edge.v;w=edge.w;delete this._edgeLabels[e];delete this._edgeObjs[e];decrementOrRemoveEntry(this._preds[w],v);decrementOrRemoveEntry(this._sucs[v],w);delete this._in[w][e];delete this._out[v][e];this._edgeCount--}return this};Graph.prototype.inEdges=function(v,u){var inV=this._in[v];if(inV){var edges=_.values(inV);if(!u){return edges}return _.filter(edges,function(edge){return edge.v===u})}};Graph.prototype.outEdges=function(v,w){var outV=this._out[v];if(outV){var edges=_.values(outV);if(!w){return edges}return _.filter(edges,function(edge){return edge.w===w})}};Graph.prototype.nodeEdges=function(v,w){var inEdges=this.inEdges(v,w);if(inEdges){return inEdges.concat(this.outEdges(v,w))}};function incrementOrInitEntry(map,k){if(_.has(map,k)){map[k]++}else{map[k]=1}}function decrementOrRemoveEntry(map,k){if(!--map[k]){delete map[k]}}function edgeArgsToId(isDirected,v,w,name){if(!isDirected&&v>w){var tmp=v;v=w;w=tmp}return v+EDGE_KEY_DELIM+w+EDGE_KEY_DELIM+(_.isUndefined(name)?DEFAULT_EDGE_NAME:name)}function edgeArgsToObj(isDirected,v,w,name){if(!isDirected&&v>w){var tmp=v;v=w;w=tmp}var edgeObj={v:v,w:w};if(name){edgeObj.name=name}return edgeObj}function edgeObjToId(isDirected,edgeObj){return edgeArgsToId(isDirected,edgeObj.v,edgeObj.w,edgeObj.name)}},{"./lodash":75}],73:[function(require,module,exports){module.exports={Graph:require("./graph"),version:require("./version")}},{"./graph":72,"./version":76}],74:[function(require,module,exports){var _=require("./lodash"),Graph=require("./graph");module.exports={write:write,read:read};function write(g){var json={options:{directed:g.isDirected(),multigraph:g.isMultigraph(),compound:g.isCompound()},nodes:writeNodes(g),edges:writeEdges(g)};if(!_.isUndefined(g.graph())){json.value=_.clone(g.graph())}return json}function writeNodes(g){return _.map(g.nodes(),function(v){var nodeValue=g.node(v),parent=g.parent(v),node={v:v};if(!_.isUndefined(nodeValue)){node.value=nodeValue}if(!_.isUndefined(parent)){node.parent=parent}return node})}function writeEdges(g){return _.map(g.edges(),function(e){var edgeValue=g.edge(e),edge={v:e.v,w:e.w};if(!_.isUndefined(e.name)){edge.name=e.name}if(!_.isUndefined(edgeValue)){edge.value=edgeValue}return edge})}function read(json){var g=new Graph(json.options).setGraph(json.value);_.each(json.nodes,function(entry){g.setNode(entry.v,entry.value);if(entry.parent){g.setParent(entry.v,entry.parent)}});_.each(json.edges,function(entry){g.setEdge({v:entry.v,w:entry.w,name:entry.name},entry.value)});return g}},{"./graph":72,"./lodash":75}],75:[function(require,module,exports){module.exports=require(20)},{"/Users/andrew/Documents/dev/dagre-d3/lib/lodash.js":20,lodash:77}],76:[function(require,module,exports){module.exports="1.0.1"},{}],77:[function(require,module,exports){(function(global){(function(){var undefined;var arrayPool=[],objectPool=[];var idCounter=0;var keyPrefix=+new Date+"";var largeArraySize=75;var maxPoolSize=40;var whitespace=" 	\f \ufeff"+"\n\r\u2028\u2029"+" ᠎             　";var reEmptyStringLeading=/\b__p \+= '';/g,reEmptyStringMiddle=/\b(__p \+=) '' \+/g,reEmptyStringTrailing=/(__e\(.*?\)|\b__t\)) \+\n'';/g;var reEsTemplate=/\$\{([^\\}]*(?:\\.[^\\}]*)*)\}/g;var reFlags=/\w*$/;var reFuncName=/^\s*function[ \n\r\t]+\w/;var reInterpolate=/<%=([\s\S]+?)%>/g;var reLeadingSpacesAndZeros=RegExp("^["+whitespace+"]*0+(?=.$)");var reNoMatch=/($^)/;var reThis=/\bthis\b/;var reUnescapedString=/['\n\r\t\u2028\u2029\\]/g;var contextProps=["Array","Boolean","Date","Function","Math","Number","Object","RegExp","String","_","attachEvent","clearTimeout","isFinite","isNaN","parseInt","setTimeout"];var templateCounter=0;var argsClass="[object Arguments]",arrayClass="[object Array]",boolClass="[object Boolean]",dateClass="[object Date]",funcClass="[object Function]",numberClass="[object Number]",objectClass="[object Object]",regexpClass="[object RegExp]",stringClass="[object String]";var cloneableClasses={};cloneableClasses[funcClass]=false;cloneableClasses[argsClass]=cloneableClasses[arrayClass]=cloneableClasses[boolClass]=cloneableClasses[dateClass]=cloneableClasses[numberClass]=cloneableClasses[objectClass]=cloneableClasses[regexpClass]=cloneableClasses[stringClass]=true;var debounceOptions={leading:false,maxWait:0,trailing:false};var descriptor={configurable:false,enumerable:false,value:null,writable:false};var objectTypes={"boolean":false,"function":true,object:true,number:false,string:false,undefined:false};var stringEscapes={"\\":"\\","'":"'","\n":"n","\r":"r","	":"t","\u2028":"u2028","\u2029":"u2029"};var root=objectTypes[typeof window]&&window||this;var freeExports=objectTypes[typeof exports]&&exports&&!exports.nodeType&&exports;var freeModule=objectTypes[typeof module]&&module&&!module.nodeType&&module;var moduleExports=freeModule&&freeModule.exports===freeExports&&freeExports;var freeGlobal=objectTypes[typeof global]&&global;if(freeGlobal&&(freeGlobal.global===freeGlobal||freeGlobal.window===freeGlobal)){root=freeGlobal}function baseIndexOf(array,value,fromIndex){var index=(fromIndex||0)-1,length=array?array.length:0;while(++index<length){if(array[index]===value){return index}}return-1}function cacheIndexOf(cache,value){var type=typeof value;cache=cache.cache;if(type=="boolean"||value==null){return cache[value]?0:-1}if(type!="number"&&type!="string"){type="object"}var key=type=="number"?value:keyPrefix+value;cache=(cache=cache[type])&&cache[key];return type=="object"?cache&&baseIndexOf(cache,value)>-1?0:-1:cache?0:-1}function cachePush(value){var cache=this.cache,type=typeof value;if(type=="boolean"||value==null){cache[value]=true}else{if(type!="number"&&type!="string"){type="object"}var key=type=="number"?value:keyPrefix+value,typeCache=cache[type]||(cache[type]={});if(type=="object"){(typeCache[key]||(typeCache[key]=[])).push(value)}else{typeCache[key]=true}}}function charAtCallback(value){return value.charCodeAt(0)}function compareAscending(a,b){var ac=a.criteria,bc=b.criteria,index=-1,length=ac.length;while(++index<length){var value=ac[index],other=bc[index];if(value!==other){if(value>other||typeof value=="undefined"){return 1}if(value<other||typeof other=="undefined"){return-1}}}return a.index-b.index}function createCache(array){var index=-1,length=array.length,first=array[0],mid=array[length/2|0],last=array[length-1];if(first&&typeof first=="object"&&mid&&typeof mid=="object"&&last&&typeof last=="object"){return false}var cache=getObject();cache["false"]=cache["null"]=cache["true"]=cache["undefined"]=false;var result=getObject();result.array=array;result.cache=cache;result.push=cachePush;while(++index<length){result.push(array[index])}return result}function escapeStringChar(match){return"\\"+stringEscapes[match]}function getArray(){return arrayPool.pop()||[]}function getObject(){return objectPool.pop()||{array:null,cache:null,criteria:null,"false":false,index:0,"null":false,number:null,object:null,push:null,string:null,"true":false,undefined:false,value:null}}function releaseArray(array){array.length=0;if(arrayPool.length<maxPoolSize){arrayPool.push(array)}}function releaseObject(object){var cache=object.cache;if(cache){releaseObject(cache)}object.array=object.cache=object.criteria=object.object=object.number=object.string=object.value=null;if(objectPool.length<maxPoolSize){objectPool.push(object)}}function slice(array,start,end){start||(start=0);if(typeof end=="undefined"){end=array?array.length:0}var index=-1,length=end-start||0,result=Array(length<0?0:length);while(++index<length){result[index]=array[start+index]}return result}function runInContext(context){context=context?_.defaults(root.Object(),context,_.pick(root,contextProps)):root;var Array=context.Array,Boolean=context.Boolean,Date=context.Date,Function=context.Function,Math=context.Math,Number=context.Number,Object=context.Object,RegExp=context.RegExp,String=context.String,TypeError=context.TypeError;var arrayRef=[];var objectProto=Object.prototype;var oldDash=context._;var toString=objectProto.toString;var reNative=RegExp("^"+String(toString).replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/toString| for [^\]]+/g,".*?")+"$");var ceil=Math.ceil,clearTimeout=context.clearTimeout,floor=Math.floor,fnToString=Function.prototype.toString,getPrototypeOf=isNative(getPrototypeOf=Object.getPrototypeOf)&&getPrototypeOf,hasOwnProperty=objectProto.hasOwnProperty,push=arrayRef.push,setTimeout=context.setTimeout,splice=arrayRef.splice,unshift=arrayRef.unshift;var defineProperty=function(){try{var o={},func=isNative(func=Object.defineProperty)&&func,result=func(o,o,o)&&func}catch(e){}return result}();var nativeCreate=isNative(nativeCreate=Object.create)&&nativeCreate,nativeIsArray=isNative(nativeIsArray=Array.isArray)&&nativeIsArray,nativeIsFinite=context.isFinite,nativeIsNaN=context.isNaN,nativeKeys=isNative(nativeKeys=Object.keys)&&nativeKeys,nativeMax=Math.max,nativeMin=Math.min,nativeParseInt=context.parseInt,nativeRandom=Math.random;var ctorByClass={};ctorByClass[arrayClass]=Array;ctorByClass[boolClass]=Boolean;ctorByClass[dateClass]=Date;ctorByClass[funcClass]=Function;ctorByClass[objectClass]=Object;ctorByClass[numberClass]=Number;ctorByClass[regexpClass]=RegExp;ctorByClass[stringClass]=String;function lodash(value){return value&&typeof value=="object"&&!isArray(value)&&hasOwnProperty.call(value,"__wrapped__")?value:new lodashWrapper(value)}function lodashWrapper(value,chainAll){this.__chain__=!!chainAll;this.__wrapped__=value}lodashWrapper.prototype=lodash.prototype;var support=lodash.support={};support.funcDecomp=!isNative(context.WinRTError)&&reThis.test(runInContext);support.funcNames=typeof Function.name=="string";lodash.templateSettings={escape:/<%-([\s\S]+?)%>/g,evaluate:/<%([\s\S]+?)%>/g,interpolate:reInterpolate,variable:"",imports:{_:lodash}};function baseBind(bindData){var func=bindData[0],partialArgs=bindData[2],thisArg=bindData[4];function bound(){if(partialArgs){var args=slice(partialArgs);push.apply(args,arguments)}if(this instanceof bound){var thisBinding=baseCreate(func.prototype),result=func.apply(thisBinding,args||arguments);return isObject(result)?result:thisBinding}return func.apply(thisArg,args||arguments)}setBindData(bound,bindData);return bound}function baseClone(value,isDeep,callback,stackA,stackB){if(callback){var result=callback(value);if(typeof result!="undefined"){return result}}var isObj=isObject(value);if(isObj){var className=toString.call(value);if(!cloneableClasses[className]){return value}var ctor=ctorByClass[className];switch(className){case boolClass:case dateClass:return new ctor(+value);case numberClass:case stringClass:return new ctor(value);case regexpClass:result=ctor(value.source,reFlags.exec(value));result.lastIndex=value.lastIndex;return result}}else{return value}var isArr=isArray(value);if(isDeep){var initedStack=!stackA;stackA||(stackA=getArray());stackB||(stackB=getArray());var length=stackA.length;while(length--){if(stackA[length]==value){return stackB[length]}}result=isArr?ctor(value.length):{}}else{result=isArr?slice(value):assign({},value)}if(isArr){if(hasOwnProperty.call(value,"index")){result.index=value.index}if(hasOwnProperty.call(value,"input")){result.input=value.input}}if(!isDeep){return result}stackA.push(value);stackB.push(result);(isArr?forEach:forOwn)(value,function(objValue,key){result[key]=baseClone(objValue,isDeep,callback,stackA,stackB)});if(initedStack){releaseArray(stackA);releaseArray(stackB)}return result}function baseCreate(prototype,properties){return isObject(prototype)?nativeCreate(prototype):{};
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index af276e7b8d40c..f78fbaf33f656 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -678,7 +678,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    *
    * Note: Return statements are NOT allowed in the given body.
    */
-  private def withScope[U](body: => U): U = RDDOperationScope.withScope[U](this)(body)
+  private[spark] def withScope[U](body: => U): U = RDDOperationScope.withScope[U](this)(body)
 
   // Methods for creating RDDs
 
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala
index 2725826f421f4..6b09dfafc889c 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala
@@ -24,7 +24,7 @@ import com.fasterxml.jackson.annotation.JsonInclude.Include
 import com.fasterxml.jackson.databind.ObjectMapper
 import com.fasterxml.jackson.module.scala.DefaultScalaModule
 
-import org.apache.spark.SparkContext
+import org.apache.spark.{Logging, SparkContext}
 
 /**
  * A general, named code block representing an operation that instantiates RDDs.
@@ -43,9 +43,8 @@ import org.apache.spark.SparkContext
 @JsonPropertyOrder(Array("id", "name", "parent"))
 private[spark] class RDDOperationScope(
     val name: String,
-    val parent: Option[RDDOperationScope] = None) {
-
-  val id: Int = RDDOperationScope.nextScopeId()
+    val parent: Option[RDDOperationScope] = None,
+    val id: String = RDDOperationScope.nextScopeId().toString) {
 
   def toJson: String = {
     RDDOperationScope.jsonMapper.writeValueAsString(this)
@@ -75,7 +74,7 @@ private[spark] class RDDOperationScope(
  * A collection of utility methods to construct a hierarchical representation of RDD scopes.
  * An RDD scope tracks the series of operations that created a given RDD.
  */
-private[spark] object RDDOperationScope {
+private[spark] object RDDOperationScope extends Logging {
   private val jsonMapper = new ObjectMapper().registerModule(DefaultScalaModule)
   private val scopeCounter = new AtomicInteger(0)
 
@@ -88,14 +87,25 @@ private[spark] object RDDOperationScope {
 
   /**
    * Execute the given body such that all RDDs created in this body will have the same scope.
-   * The name of the scope will be the name of the method that immediately encloses this one.
+   * The name of the scope will be the first method name in the stack trace that is not the
+   * same as this method's.
    *
    * Note: Return statements are NOT allowed in body.
    */
   private[spark] def withScope[T](
       sc: SparkContext,
       allowNesting: Boolean = false)(body: => T): T = {
-    val callerMethodName = Thread.currentThread.getStackTrace()(3).getMethodName
+    val stackTrace = Thread.currentThread.getStackTrace().tail // ignore "Thread#getStackTrace"
+    val ourMethodName = stackTrace(1).getMethodName // i.e. withScope
+    // Climb upwards to find the first method that's called something different
+    val callerMethodName = stackTrace
+      .find(_.getMethodName != ourMethodName)
+      .map(_.getMethodName)
+      .getOrElse {
+        // Log a warning just in case, but this should almost certainly never happen
+        logWarning("No valid method name for this RDD operation scope!")
+        "N/A"
+      }
     withScope[T](sc, callerMethodName, allowNesting, ignoreParent = false)(body)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
index 33a7303be711c..d6a5085db1efb 100644
--- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
+++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
@@ -116,8 +116,8 @@ private[ui] object RDDOperationGraph extends Logging {
         // which may be nested inside of other clusters
         val rddScopes = rdd.scope.map { scope => scope.getAllScopes }.getOrElse(Seq.empty)
         val rddClusters = rddScopes.map { scope =>
-          val clusterId = scope.name + "_" + scope.id
-          val clusterName = scope.name
+          val clusterId = scope.id
+          val clusterName = scope.name.replaceAll("\\n", "\\\\n")
           clusters.getOrElseUpdate(clusterId, new RDDOperationCluster(clusterId, clusterName))
         }
         // Build the cluster hierarchy for this RDD
@@ -177,7 +177,7 @@ private[ui] object RDDOperationGraph extends Logging {
 
   /** Return the dot representation of a node in an RDDOperationGraph. */
   private def makeDotNode(node: RDDOperationNode): String = {
-    s"""${node.id} [label="${node.name} (${node.id})"]"""
+    s"""${node.id} [label="${node.name} [${node.id}]"]"""
   }
 
   /** Return the dot representation of a subgraph in an RDDOperationGraph. */
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDOperationScopeSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDOperationScopeSuite.scala
index db465a6a9eb55..4434ed858c60c 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDOperationScopeSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDOperationScopeSuite.scala
@@ -22,13 +22,13 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
 import org.apache.spark.{TaskContext, Partition, SparkContext}
 
 /**
- *
+ * Tests whether scopes are passed from the RDD operation to the RDDs correctly.
  */
 class RDDOperationScopeSuite extends FunSuite with BeforeAndAfter {
   private var sc: SparkContext = null
   private val scope1 = new RDDOperationScope("scope1")
-  private val scope2 = new RDDOperationScope("scope2", parent = Some(scope1))
-  private val scope3 = new RDDOperationScope("scope3", parent = Some(scope2))
+  private val scope2 = new RDDOperationScope("scope2", Some(scope1))
+  private val scope3 = new RDDOperationScope("scope3", Some(scope2))
 
   before {
     sc = new SparkContext("local", "test")
@@ -48,9 +48,9 @@ class RDDOperationScopeSuite extends FunSuite with BeforeAndAfter {
     val scope1Json = scope1.toJson
     val scope2Json = scope2.toJson
     val scope3Json = scope3.toJson
-    assert(scope1Json === s"""{"id":${scope1.id},"name":"scope1"}""")
-    assert(scope2Json === s"""{"id":${scope2.id},"name":"scope2","parent":$scope1Json}""")
-    assert(scope3Json === s"""{"id":${scope3.id},"name":"scope3","parent":$scope2Json}""")
+    assert(scope1Json === s"""{"id":"${scope1.id}","name":"scope1"}""")
+    assert(scope2Json === s"""{"id":"${scope2.id}","name":"scope2","parent":$scope1Json}""")
+    assert(scope3Json === s"""{"id":"${scope3.id}","name":"scope3","parent":$scope2Json}""")
     assert(RDDOperationScope.fromJson(scope1Json) === scope1)
     assert(RDDOperationScope.fromJson(scope2Json) === scope2)
     assert(RDDOperationScope.fromJson(scope3Json) === scope3)
diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
index 6715aede7928a..060c2f23eded8 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
@@ -65,6 +65,9 @@ class DirectKafkaInputDStream[
   val maxRetries = context.sparkContext.getConf.getInt(
     "spark.streaming.kafka.maxRetries", 1)
 
+  // Keep this consistent with how other streams are named (e.g. "Flume polling stream [2]")
+  private[streaming] override def name: String = s"Kafka direct stream [$id]"
+
   protected[streaming] override val checkpointData =
     new DirectKafkaInputDStreamCheckpointData
 
diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
index d7cf500577c2a..8be2707528d93 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
@@ -189,7 +189,7 @@ object KafkaUtils {
       sc: SparkContext,
       kafkaParams: Map[String, String],
       offsetRanges: Array[OffsetRange]
-    ): RDD[(K, V)] = {
+    ): RDD[(K, V)] = sc.withScope {
     val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
     val leaders = leadersForRanges(kafkaParams, offsetRanges)
     new KafkaRDD[K, V, KD, VD, (K, V)](sc, kafkaParams, offsetRanges, leaders, messageHandler)
@@ -224,7 +224,7 @@ object KafkaUtils {
       offsetRanges: Array[OffsetRange],
       leaders: Map[TopicAndPartition, Broker],
       messageHandler: MessageAndMetadata[K, V] => R
-    ): RDD[R] = {
+    ): RDD[R] = sc.withScope {
     val leaderMap = if (leaders.isEmpty) {
       leadersForRanges(kafkaParams, offsetRanges)
     } else {
@@ -233,7 +233,8 @@ object KafkaUtils {
         case (tp: TopicAndPartition, Broker(host, port)) => (tp, (host, port))
       }.toMap
     }
-    new KafkaRDD[K, V, KD, VD, R](sc, kafkaParams, offsetRanges, leaderMap, messageHandler)
+    val cleanedHandler = sc.clean(messageHandler)
+    new KafkaRDD[K, V, KD, VD, R](sc, kafkaParams, offsetRanges, leaderMap, cleanedHandler)
   }
 
   /**
@@ -256,7 +257,7 @@ object KafkaUtils {
       valueDecoderClass: Class[VD],
       kafkaParams: JMap[String, String],
       offsetRanges: Array[OffsetRange]
-    ): JavaPairRDD[K, V] = {
+    ): JavaPairRDD[K, V] = jsc.sc.withScope {
     implicit val keyCmt: ClassTag[K] = ClassTag(keyClass)
     implicit val valueCmt: ClassTag[V] = ClassTag(valueClass)
     implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass)
@@ -294,7 +295,7 @@ object KafkaUtils {
       offsetRanges: Array[OffsetRange],
       leaders: JMap[TopicAndPartition, Broker],
       messageHandler: JFunction[MessageAndMetadata[K, V], R]
-    ): JavaRDD[R] = {
+    ): JavaRDD[R] = jsc.sc.withScope {
     implicit val keyCmt: ClassTag[K] = ClassTag(keyClass)
     implicit val valueCmt: ClassTag[V] = ClassTag(valueClass)
     implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass)
@@ -348,8 +349,9 @@ object KafkaUtils {
       fromOffsets: Map[TopicAndPartition, Long],
       messageHandler: MessageAndMetadata[K, V] => R
   ): InputDStream[R] = {
+    val cleanedHandler = ssc.sc.clean(messageHandler)
     new DirectKafkaInputDStream[K, V, KD, VD, R](
-      ssc, kafkaParams, fromOffsets, messageHandler)
+      ssc, kafkaParams, fromOffsets, cleanedHandler)
   }
 
   /**
@@ -469,11 +471,12 @@ object KafkaUtils {
     implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass)
     implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass)
     implicit val recordCmt: ClassTag[R] = ClassTag(recordClass)
+    val cleanedHandler = jssc.sparkContext.clean(messageHandler.call _)
     createDirectStream[K, V, KD, VD, R](
       jssc.ssc,
       Map(kafkaParams.toSeq: _*),
       Map(fromOffsets.mapValues { _.longValue() }.toSeq: _*),
-      messageHandler.call _
+      cleanedHandler
     )
   }
 
diff --git a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala
index 3c0ef94cb0fab..40f5f18547236 100644
--- a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala
+++ b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala
@@ -35,7 +35,6 @@ import org.eclipse.paho.client.mqttv3.MqttMessage
 import org.eclipse.paho.client.mqttv3.MqttTopic
 import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence
 
-import org.apache.spark.Logging
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.StreamingContext
 import org.apache.spark.streaming.dstream._
@@ -57,6 +56,8 @@ class MQTTInputDStream(
     storageLevel: StorageLevel
   ) extends ReceiverInputDStream[String](ssc_) {
 
+  private[streaming] override def name: String = s"MQTT stream [$id]"
+
   def getReceiver(): Receiver[String] = {
     new MQTTReceiver(brokerUrl, topic, storageLevel)
   }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 1d2ecdd341813..7f181bcecd4bf 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -34,7 +34,7 @@ import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
 import org.apache.spark._
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.input.FixedLengthBinaryInputFormat
-import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.{RDD, RDDOperationScope}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.StreamingContextState._
 import org.apache.spark.streaming.dstream._
@@ -241,15 +241,34 @@ class StreamingContext private[streaming] (
 
   private[streaming] def getNewInputStreamId() = nextInputStreamId.getAndIncrement()
 
+  /**
+   * Execute a block of code in a scope such that all new DStreams created in this body will
+   * be part of the same scope. For more detail, see the comments in `doCompute`.
+   *
+   * Note: Return statements are NOT allowed in the given body.
+   */
+  private[streaming] def withScope[U](body: => U): U = sparkContext.withScope(body)
+
+  /**
+   * Execute a block of code in a scope such that all new DStreams created in this body will
+   * be part of the same scope. For more detail, see the comments in `doCompute`.
+   *
+   * Note: Return statements are NOT allowed in the given body.
+   */
+  private[streaming] def withNamedScope[U](name: String)(body: => U): U = {
+    RDDOperationScope.withScope(sc, name, allowNesting = false, ignoreParent = false)(body)
+  }
+
   /**
    * Create an input stream with any arbitrary user implemented receiver.
    * Find more details at: http://spark.apache.org/docs/latest/streaming-custom-receivers.html
    * @param receiver Custom implementation of Receiver
    */
   @deprecated("Use receiverStream", "1.0.0")
-  def networkStream[T: ClassTag](
-    receiver: Receiver[T]): ReceiverInputDStream[T] = {
-    receiverStream(receiver)
+  def networkStream[T: ClassTag](receiver: Receiver[T]): ReceiverInputDStream[T] = {
+    withNamedScope("network stream") {
+      receiverStream(receiver)
+    }
   }
 
   /**
@@ -257,9 +276,10 @@ class StreamingContext private[streaming] (
    * Find more details at: http://spark.apache.org/docs/latest/streaming-custom-receivers.html
    * @param receiver Custom implementation of Receiver
    */
-  def receiverStream[T: ClassTag](
-    receiver: Receiver[T]): ReceiverInputDStream[T] = {
-    new PluggableInputDStream[T](this, receiver)
+  def receiverStream[T: ClassTag](receiver: Receiver[T]): ReceiverInputDStream[T] = {
+    withNamedScope("receiver stream") {
+      new PluggableInputDStream[T](this, receiver)
+    }
   }
 
   /**
@@ -279,7 +299,7 @@ class StreamingContext private[streaming] (
       name: String,
       storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
       supervisorStrategy: SupervisorStrategy = ActorSupervisorStrategy.defaultStrategy
-    ): ReceiverInputDStream[T] = {
+    ): ReceiverInputDStream[T] = withNamedScope("actor stream") {
     receiverStream(new ActorReceiver[T](props, name, storageLevel, supervisorStrategy))
   }
 
@@ -296,7 +316,7 @@ class StreamingContext private[streaming] (
       hostname: String,
       port: Int,
       storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
-    ): ReceiverInputDStream[String] = {
+    ): ReceiverInputDStream[String] = withNamedScope("socket text stream") {
     socketStream[String](hostname, port, SocketReceiver.bytesToLines, storageLevel)
   }
 
@@ -334,7 +354,7 @@ class StreamingContext private[streaming] (
       hostname: String,
       port: Int,
       storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
-    ): ReceiverInputDStream[T] = {
+    ): ReceiverInputDStream[T] = withNamedScope("raw socket stream") {
     new RawInputDStream[T](this, hostname, port, storageLevel)
   }
 
@@ -408,7 +428,7 @@ class StreamingContext private[streaming] (
    * file system. File names starting with . are ignored.
    * @param directory HDFS directory to monitor for new file
    */
-  def textFileStream(directory: String): DStream[String] = {
+  def textFileStream(directory: String): DStream[String] = withNamedScope("text file stream") {
     fileStream[LongWritable, Text, TextInputFormat](directory).map(_._2.toString)
   }
 
@@ -430,7 +450,7 @@ class StreamingContext private[streaming] (
   @Experimental
   def binaryRecordsStream(
       directory: String,
-      recordLength: Int): DStream[Array[Byte]] = {
+      recordLength: Int): DStream[Array[Byte]] = withNamedScope("binary records stream") {
     val conf = sc_.hadoopConfiguration
     conf.setInt(FixedLengthBinaryInputFormat.RECORD_LENGTH_PROPERTY, recordLength)
     val br = fileStream[LongWritable, BytesWritable, FixedLengthBinaryInputFormat](
@@ -477,7 +497,7 @@ class StreamingContext private[streaming] (
   /**
    * Create a unified DStream from multiple DStreams of the same type and same slide duration.
    */
-  def union[T: ClassTag](streams: Seq[DStream[T]]): DStream[T] = {
+  def union[T: ClassTag](streams: Seq[DStream[T]]): DStream[T] = withScope {
     new UnionDStream[T](streams.toArray)
   }
 
@@ -488,7 +508,7 @@ class StreamingContext private[streaming] (
   def transform[T: ClassTag](
       dstreams: Seq[DStream[_]],
       transformFunc: (Seq[RDD[_]], Time) => RDD[T]
-    ): DStream[T] = {
+    ): DStream[T] = withScope {
     new TransformedDStream[T](dstreams, sparkContext.clean(transformFunc))
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
index 64de7526a6a34..5977481e1f081 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
@@ -25,12 +25,13 @@ import scala.language.implicitConversions
 import scala.reflect.ClassTag
 import scala.util.matching.Regex
 
-import org.apache.spark.{Logging, SparkException}
-import org.apache.spark.rdd.{BlockRDD, PairRDDFunctions, RDD}
+import org.apache.spark.{Logging, SparkContext, SparkException}
+import org.apache.spark.rdd.{BlockRDD, PairRDDFunctions, RDD, RDDOperationScope}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming._
 import org.apache.spark.streaming.StreamingContext.rddToFileName
 import org.apache.spark.streaming.scheduler.Job
+import org.apache.spark.streaming.ui.UIUtils
 import org.apache.spark.util.{CallSite, MetadataCleaner, Utils}
 
 /**
@@ -73,7 +74,7 @@ abstract class DStream[T: ClassTag] (
   def dependencies: List[DStream[_]]
 
   /** Method that generates a RDD for the given time */
-  def compute (validTime: Time): Option[RDD[T]]
+  def compute(validTime: Time): Option[RDD[T]]
 
   // =======================================================================
   // Methods and fields available on all DStreams
@@ -111,6 +112,44 @@ abstract class DStream[T: ClassTag] (
   /* Set the creation call site */
   private[streaming] val creationSite = DStream.getCreationSite()
 
+  /**
+   * The base scope associated with the operation that created this DStream.
+   *
+   * This is the medium through which we pass the DStream operation name (e.g. updatedStateByKey)
+   * to the RDDs created by this DStream. Note that we never use this scope directly in RDDs.
+   * Instead, we instantiate a new scope during each call to `compute` based on this one.
+   *
+   * This is not defined if the DStream is created outside of one of the public DStream operations.
+   */
+  protected[streaming] val baseScope: Option[String] = {
+    Option(ssc.sc.getLocalProperty(SparkContext.RDD_SCOPE_KEY))
+  }
+
+  /**
+   * Make a scope that groups RDDs created in the same DStream operation in the same batch.
+   *
+   * Each DStream produces many scopes and each scope may be shared by other DStreams created
+   * in the same operation. Separate calls to the same DStream operation create separate scopes.
+   * For instance, `dstream.map(...).map(...)` creates two separate scopes per batch.
+   */
+  private def makeScope(time: Time): Option[RDDOperationScope] = {
+    baseScope.map { bsJson =>
+      val formattedBatchTime = UIUtils.formatBatchTime(
+        time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
+      val bs = RDDOperationScope.fromJson(bsJson)
+      val baseName = bs.name // e.g. countByWindow, "kafka stream [0]"
+      val scopeName =
+        if (baseName.length > 10) {
+          // If the operation name is too long, wrap the line
+          s"$baseName\n@ $formattedBatchTime"
+        } else {
+          s"$baseName @ $formattedBatchTime"
+        }
+      val scopeId = s"${bs.id}_${time.milliseconds}"
+      new RDDOperationScope(scopeName, id = scopeId)
+    }
+  }
+
   /** Persist the RDDs of this DStream with the given storage level */
   def persist(level: StorageLevel): DStream[T] = {
     if (this.isInitialized) {
@@ -295,28 +334,23 @@ abstract class DStream[T: ClassTag] (
    * Get the RDD corresponding to the given time; either retrieve it from cache
    * or compute-and-cache it.
    */
-  private[streaming] def getOrCompute(time: Time): Option[RDD[T]] = {
+  private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
     // If RDD was already generated, then retrieve it from HashMap,
     // or else compute the RDD
     generatedRDDs.get(time).orElse {
       // Compute the RDD if time is valid (e.g. correct time in a sliding window)
       // of RDD generation, else generate nothing.
       if (isTimeValid(time)) {
-        // Set the thread-local property for call sites to this DStream's creation site
-        // such that RDDs generated by compute gets that as their creation site.
-        // Note that this `getOrCompute` may get called from another DStream which may have
-        // set its own call site. So we store its call site in a temporary variable,
-        // set this DStream's creation site, generate RDDs and then restore the previous call site.
-        val prevCallSite = ssc.sparkContext.getCallSite()
-        ssc.sparkContext.setCallSite(creationSite)
-        // Disable checks for existing output directories in jobs launched by the streaming
-        // scheduler, since we may need to write output to an existing directory during checkpoint
-        // recovery; see SPARK-4835 for more details. We need to have this call here because
-        // compute() might cause Spark jobs to be launched.
-        val rddOption = PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
-          compute(time)
+
+        val rddOption = createRDDWithLocalProperties(time) {
+          // Disable checks for existing output directories in jobs launched by the streaming
+          // scheduler, since we may need to write output to an existing directory during checkpoint
+          // recovery; see SPARK-4835 for more details. We need to have this call here because
+          // compute() might cause Spark jobs to be launched.
+          PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
+            compute(time)
+          }
         }
-        ssc.sparkContext.setCallSite(prevCallSite)
 
         rddOption.foreach { case newRDD =>
           // Register the generated RDD for caching and checkpointing
@@ -337,6 +371,41 @@ abstract class DStream[T: ClassTag] (
     }
   }
 
+  /**
+   * Wrap a body of code such that the call site and operation scope
+   * information are passed to the RDDs created in this body properly.
+   */
+  protected def createRDDWithLocalProperties[U](time: Time)(body: => U): U = {
+    val scopeKey = SparkContext.RDD_SCOPE_KEY
+    val scopeNoOverrideKey = SparkContext.RDD_SCOPE_NO_OVERRIDE_KEY
+    // Pass this DStream's operation scope and creation site information to RDDs through
+    // thread-local properties in our SparkContext. Since this method may be called from another
+    // DStream, we need to temporarily store any old scope and creation site information to
+    // restore them later after setting our own.
+    val prevCallSite = ssc.sparkContext.getCallSite()
+    val prevScope = ssc.sparkContext.getLocalProperty(scopeKey)
+    val prevScopeNoOverride = ssc.sparkContext.getLocalProperty(scopeNoOverrideKey)
+
+    try {
+      ssc.sparkContext.setCallSite(creationSite)
+      // Use the DStream's base scope for this RDD so we can (1) preserve the higher level
+      // DStream operation name, and (2) share this scope with other DStreams created in the
+      // same operation. Disallow nesting so that low-level Spark primitives do not show up.
+      // TODO: merge callsites with scopes so we can just reuse the code there
+      makeScope(time).foreach { s =>
+        ssc.sparkContext.setLocalProperty(scopeKey, s.toJson)
+        ssc.sparkContext.setLocalProperty(scopeNoOverrideKey, "true")
+      }
+
+      body
+    } finally {
+      // Restore any state that was modified before returning
+      ssc.sparkContext.setCallSite(prevCallSite)
+      ssc.sparkContext.setLocalProperty(scopeKey, prevScope)
+      ssc.sparkContext.setLocalProperty(scopeNoOverrideKey, prevScopeNoOverride)
+    }
+  }
+
   /**
    * Generate a SparkStreaming job for the given time. This is an internal method that
    * should not be called directly. This default implementation creates a job
@@ -456,7 +525,7 @@ abstract class DStream[T: ClassTag] (
   // =======================================================================
 
   /** Return a new DStream by applying a function to all elements of this DStream. */
-  def map[U: ClassTag](mapFunc: T => U): DStream[U] = {
+  def map[U: ClassTag](mapFunc: T => U): DStream[U] = ssc.withScope {
     new MappedDStream(this, context.sparkContext.clean(mapFunc))
   }
 
@@ -464,26 +533,31 @@ abstract class DStream[T: ClassTag] (
    * Return a new DStream by applying a function to all elements of this DStream,
    * and then flattening the results
    */
-  def flatMap[U: ClassTag](flatMapFunc: T => Traversable[U]): DStream[U] = {
+  def flatMap[U: ClassTag](flatMapFunc: T => Traversable[U]): DStream[U] = ssc.withScope {
     new FlatMappedDStream(this, context.sparkContext.clean(flatMapFunc))
   }
 
   /** Return a new DStream containing only the elements that satisfy a predicate. */
-  def filter(filterFunc: T => Boolean): DStream[T] = new FilteredDStream(this, filterFunc)
+  def filter(filterFunc: T => Boolean): DStream[T] = ssc.withScope {
+    new FilteredDStream(this, filterFunc)
+  }
 
   /**
    * Return a new DStream in which each RDD is generated by applying glom() to each RDD of
    * this DStream. Applying glom() to an RDD coalesces all elements within each partition into
    * an array.
    */
-  def glom(): DStream[Array[T]] = new GlommedDStream(this)
-
+  def glom(): DStream[Array[T]] = ssc.withScope {
+    new GlommedDStream(this)
+  }
 
   /**
    * Return a new DStream with an increased or decreased level of parallelism. Each RDD in the
    * returned DStream has exactly numPartitions partitions.
    */
-  def repartition(numPartitions: Int): DStream[T] = this.transform(_.repartition(numPartitions))
+  def repartition(numPartitions: Int): DStream[T] = ssc.withScope {
+    this.transform(_.repartition(numPartitions))
+  }
 
   /**
    * Return a new DStream in which each RDD is generated by applying mapPartitions() to each RDDs
@@ -493,7 +567,7 @@ abstract class DStream[T: ClassTag] (
   def mapPartitions[U: ClassTag](
       mapPartFunc: Iterator[T] => Iterator[U],
       preservePartitioning: Boolean = false
-    ): DStream[U] = {
+    ): DStream[U] = ssc.withScope {
     new MapPartitionedDStream(this, context.sparkContext.clean(mapPartFunc), preservePartitioning)
   }
 
@@ -501,14 +575,15 @@ abstract class DStream[T: ClassTag] (
    * Return a new DStream in which each RDD has a single element generated by reducing each RDD
    * of this DStream.
    */
-  def reduce(reduceFunc: (T, T) => T): DStream[T] =
+  def reduce(reduceFunc: (T, T) => T): DStream[T] = ssc.withScope {
     this.map(x => (null, x)).reduceByKey(reduceFunc, 1).map(_._2)
+  }
 
   /**
    * Return a new DStream in which each RDD has a single element generated by counting each RDD
    * of this DStream.
    */
-  def count(): DStream[Long] = {
+  def count(): DStream[Long] = ssc.withScope {
     this.map(_ => (null, 1L))
         .transform(_.union(context.sparkContext.makeRDD(Seq((null, 0L)), 1)))
         .reduceByKey(_ + _)
@@ -522,15 +597,16 @@ abstract class DStream[T: ClassTag] (
    * `numPartitions` not specified).
    */
   def countByValue(numPartitions: Int = ssc.sc.defaultParallelism)(implicit ord: Ordering[T] = null)
-      : DStream[(T, Long)] =
+      : DStream[(T, Long)] = ssc.withScope {
     this.map(x => (x, 1L)).reduceByKey((x: Long, y: Long) => x + y, numPartitions)
+  }
 
   /**
    * Apply a function to each RDD in this DStream. This is an output operator, so
    * 'this' DStream will be registered as an output stream and therefore materialized.
    */
   @deprecated("use foreachRDD", "0.9.0")
-  def foreach(foreachFunc: RDD[T] => Unit): Unit = {
+  def foreach(foreachFunc: RDD[T] => Unit): Unit = ssc.withScope {
     this.foreachRDD(foreachFunc)
   }
 
@@ -539,7 +615,7 @@ abstract class DStream[T: ClassTag] (
    * 'this' DStream will be registered as an output stream and therefore materialized.
    */
   @deprecated("use foreachRDD", "0.9.0")
-  def foreach(foreachFunc: (RDD[T], Time) => Unit): Unit = {
+  def foreach(foreachFunc: (RDD[T], Time) => Unit): Unit = ssc.withScope {
     this.foreachRDD(foreachFunc)
   }
 
@@ -547,7 +623,7 @@ abstract class DStream[T: ClassTag] (
    * Apply a function to each RDD in this DStream. This is an output operator, so
    * 'this' DStream will be registered as an output stream and therefore materialized.
    */
-  def foreachRDD(foreachFunc: RDD[T] => Unit) {
+  def foreachRDD(foreachFunc: RDD[T] => Unit): Unit = ssc.withScope {
     this.foreachRDD((r: RDD[T], t: Time) => foreachFunc(r))
   }
 
@@ -555,7 +631,7 @@ abstract class DStream[T: ClassTag] (
    * Apply a function to each RDD in this DStream. This is an output operator, so
    * 'this' DStream will be registered as an output stream and therefore materialized.
    */
-  def foreachRDD(foreachFunc: (RDD[T], Time) => Unit) {
+  def foreachRDD(foreachFunc: (RDD[T], Time) => Unit): Unit = ssc.withScope {
     // because the DStream is reachable from the outer object here, and because 
     // DStreams can't be serialized with closures, we can't proactively check 
     // it for serializability and so we pass the optional false to SparkContext.clean
@@ -566,7 +642,7 @@ abstract class DStream[T: ClassTag] (
    * Return a new DStream in which each RDD is generated by applying a function
    * on each RDD of 'this' DStream.
    */
-  def transform[U: ClassTag](transformFunc: RDD[T] => RDD[U]): DStream[U] = {
+  def transform[U: ClassTag](transformFunc: RDD[T] => RDD[U]): DStream[U] = ssc.withScope {
     // because the DStream is reachable from the outer object here, and because 
     // DStreams can't be serialized with closures, we can't proactively check 
     // it for serializability and so we pass the optional false to SparkContext.clean
@@ -578,7 +654,7 @@ abstract class DStream[T: ClassTag] (
    * Return a new DStream in which each RDD is generated by applying a function
    * on each RDD of 'this' DStream.
    */
-  def transform[U: ClassTag](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U] = {
+  def transform[U: ClassTag](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U] = ssc.withScope {
     // because the DStream is reachable from the outer object here, and because 
     // DStreams can't be serialized with closures, we can't proactively check 
     // it for serializability and so we pass the optional false to SparkContext.clean
@@ -596,7 +672,7 @@ abstract class DStream[T: ClassTag] (
    */
   def transformWith[U: ClassTag, V: ClassTag](
       other: DStream[U], transformFunc: (RDD[T], RDD[U]) => RDD[V]
-    ): DStream[V] = {
+    ): DStream[V] = ssc.withScope {
     // because the DStream is reachable from the outer object here, and because 
     // DStreams can't be serialized with closures, we can't proactively check 
     // it for serializability and so we pass the optional false to SparkContext.clean
@@ -610,7 +686,7 @@ abstract class DStream[T: ClassTag] (
    */
   def transformWith[U: ClassTag, V: ClassTag](
       other: DStream[U], transformFunc: (RDD[T], RDD[U], Time) => RDD[V]
-    ): DStream[V] = {
+    ): DStream[V] = ssc.withScope {
     // because the DStream is reachable from the outer object here, and because 
     // DStreams can't be serialized with closures, we can't proactively check 
     // it for serializability and so we pass the optional false to SparkContext.clean
@@ -628,7 +704,7 @@ abstract class DStream[T: ClassTag] (
    * Print the first ten elements of each RDD generated in this DStream. This is an output
    * operator, so this DStream will be registered as an output stream and there materialized.
    */
-  def print() {
+  def print(): Unit = ssc.withScope {
     print(10)
   }
 
@@ -636,7 +712,7 @@ abstract class DStream[T: ClassTag] (
    * Print the first num elements of each RDD generated in this DStream. This is an output
    * operator, so this DStream will be registered as an output stream and there materialized.
    */
-  def print(num: Int) {
+  def print(num: Int): Unit = ssc.withScope {
     def foreachFunc: (RDD[T], Time) => Unit = {
       (rdd: RDD[T], time: Time) => {
         val firstNum = rdd.take(num + 1)
@@ -668,7 +744,7 @@ abstract class DStream[T: ClassTag] (
    *                       the new DStream will generate RDDs); must be a multiple of this
    *                       DStream's batching interval
    */
-  def window(windowDuration: Duration, slideDuration: Duration): DStream[T] = {
+  def window(windowDuration: Duration, slideDuration: Duration): DStream[T] = ssc.withScope {
     new WindowedDStream(this, windowDuration, slideDuration)
   }
 
@@ -686,7 +762,7 @@ abstract class DStream[T: ClassTag] (
       reduceFunc: (T, T) => T,
       windowDuration: Duration,
       slideDuration: Duration
-    ): DStream[T] = {
+    ): DStream[T] = ssc.withScope {
     this.reduce(reduceFunc).window(windowDuration, slideDuration).reduce(reduceFunc)
   }
 
@@ -711,7 +787,7 @@ abstract class DStream[T: ClassTag] (
       invReduceFunc: (T, T) => T,
       windowDuration: Duration,
       slideDuration: Duration
-    ): DStream[T] = {
+    ): DStream[T] = ssc.withScope {
       this.map(x => (1, x))
           .reduceByKeyAndWindow(reduceFunc, invReduceFunc, windowDuration, slideDuration, 1)
           .map(_._2)
@@ -727,7 +803,9 @@ abstract class DStream[T: ClassTag] (
    *                       the new DStream will generate RDDs); must be a multiple of this
    *                       DStream's batching interval
    */
-  def countByWindow(windowDuration: Duration, slideDuration: Duration): DStream[Long] = {
+  def countByWindow(
+      windowDuration: Duration,
+      slideDuration: Duration): DStream[Long] = ssc.withScope {
     this.map(_ => 1L).reduceByWindow(_ + _, _ - _, windowDuration, slideDuration)
   }
 
@@ -748,8 +826,7 @@ abstract class DStream[T: ClassTag] (
       slideDuration: Duration,
       numPartitions: Int = ssc.sc.defaultParallelism)
       (implicit ord: Ordering[T] = null)
-      : DStream[(T, Long)] =
-  {
+      : DStream[(T, Long)] = ssc.withScope {
     this.map(x => (x, 1L)).reduceByKeyAndWindow(
       (x: Long, y: Long) => x + y,
       (x: Long, y: Long) => x - y,
@@ -764,19 +841,21 @@ abstract class DStream[T: ClassTag] (
    * Return a new DStream by unifying data of another DStream with this DStream.
    * @param that Another DStream having the same slideDuration as this DStream.
    */
-  def union(that: DStream[T]): DStream[T] = new UnionDStream[T](Array(this, that))
+  def union(that: DStream[T]): DStream[T] = ssc.withScope {
+    new UnionDStream[T](Array(this, that))
+  }
 
   /**
    * Return all the RDDs defined by the Interval object (both end times included)
    */
-  def slice(interval: Interval): Seq[RDD[T]] = {
+  def slice(interval: Interval): Seq[RDD[T]] = ssc.withScope {
     slice(interval.beginTime, interval.endTime)
   }
 
   /**
    * Return all the RDDs between 'fromTime' to 'toTime' (both included)
    */
-  def slice(fromTime: Time, toTime: Time): Seq[RDD[T]] = {
+  def slice(fromTime: Time, toTime: Time): Seq[RDD[T]] = ssc.withScope {
     if (!isInitialized) {
       throw new SparkException(this + " has not been initialized")
     }
@@ -810,7 +889,7 @@ abstract class DStream[T: ClassTag] (
    * The file name at each batch interval is generated based on `prefix` and
    * `suffix`: "prefix-TIME_IN_MS.suffix".
    */
-  def saveAsObjectFiles(prefix: String, suffix: String = "") {
+  def saveAsObjectFiles(prefix: String, suffix: String = ""): Unit = ssc.withScope {
     val saveFunc = (rdd: RDD[T], time: Time) => {
       val file = rddToFileName(prefix, suffix, time)
       rdd.saveAsObjectFile(file)
@@ -823,7 +902,7 @@ abstract class DStream[T: ClassTag] (
    * of elements. The file name at each batch interval is generated based on
    * `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix".
    */
-  def saveAsTextFiles(prefix: String, suffix: String = "") {
+  def saveAsTextFiles(prefix: String, suffix: String = ""): Unit = ssc.withScope {
     val saveFunc = (rdd: RDD[T], time: Time) => {
       val file = rddToFileName(prefix, suffix, time)
       rdd.saveAsTextFile(file)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ForEachDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ForEachDStream.scala
index 685a32e1d280d..c109ceccc6989 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ForEachDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ForEachDStream.scala
@@ -37,7 +37,7 @@ class ForEachDStream[T: ClassTag] (
   override def generateJob(time: Time): Option[Job] = {
     parent.getOrCompute(time) match {
       case Some(rdd) =>
-        val jobFunc = () => {
+        val jobFunc = () => createRDDWithLocalProperties(time) {
           ssc.sparkContext.setCallSite(creationSite)
           foreachFunc(rdd, time)
         }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
index 9716adb62817c..d58c99a8ff321 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
@@ -17,10 +17,13 @@
 
 package org.apache.spark.streaming.dstream
 
-import org.apache.spark.streaming.{Time, Duration, StreamingContext}
-
 import scala.reflect.ClassTag
 
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDDOperationScope
+import org.apache.spark.streaming.{Time, Duration, StreamingContext}
+import org.apache.spark.util.Utils
+
 /**
  * This is the abstract base class for all input streams. This class provides methods
  * start() and stop() which is called by Spark Streaming system to start and stop receiving data.
@@ -44,10 +47,31 @@ abstract class InputDStream[T: ClassTag] (@transient ssc_ : StreamingContext)
   /** This is an unique identifier for the input stream. */
   val id = ssc.getNewInputStreamId()
 
+  /** A human-readable name of this InputDStream */
+  private[streaming] def name: String = {
+    // e.g. FlumePollingDStream -> "Flume polling stream"
+    val newName = Utils.getFormattedClassName(this)
+      .replaceAll("InputDStream", "Stream")
+      .split("(?=[A-Z])")
+      .filter(_.nonEmpty)
+      .mkString(" ")
+      .toLowerCase
+      .capitalize
+    s"$newName [$id]"
+  }
+
   /**
-   * The name of this InputDStream. By default, it's the class name with its id.
+   * The base scope associated with the operation that created this DStream.
+   *
+   * For InputDStreams, we use the name of this DStream as the scope name.
+   * If an outer scope is given, we assume that it includes an alternative name for this stream.
    */
-  private[streaming] def name: String = s"${getClass.getSimpleName}-$id"
+  protected[streaming] override val baseScope: Option[String] = {
+    val scopeName = Option(ssc.sc.getLocalProperty(SparkContext.RDD_SCOPE_KEY))
+      .map { json => RDDOperationScope.fromJson(json).name + s" [$id]" }
+      .getOrElse(name.toLowerCase)
+    Some(new RDDOperationScope(scopeName).toJson)
+  }
 
   /**
    * Checks whether the 'time' is valid wrt slideDuration for generating RDD.
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
index 8a58571632447..884a8e8b52289 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
@@ -46,7 +46,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * Return a new DStream by applying `groupByKey` to each RDD. Hash partitioning is used to
    * generate the RDDs with Spark's default number of partitions.
    */
-  def groupByKey(): DStream[(K, Iterable[V])] = {
+  def groupByKey(): DStream[(K, Iterable[V])] = ssc.withScope {
     groupByKey(defaultPartitioner())
   }
 
@@ -54,7 +54,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * Return a new DStream by applying `groupByKey` to each RDD. Hash partitioning is used to
    * generate the RDDs with `numPartitions` partitions.
    */
-  def groupByKey(numPartitions: Int): DStream[(K, Iterable[V])] = {
+  def groupByKey(numPartitions: Int): DStream[(K, Iterable[V])] = ssc.withScope {
     groupByKey(defaultPartitioner(numPartitions))
   }
 
@@ -62,7 +62,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * Return a new DStream by applying `groupByKey` on each RDD. The supplied
    * org.apache.spark.Partitioner is used to control the partitioning of each RDD.
    */
-  def groupByKey(partitioner: Partitioner): DStream[(K, Iterable[V])] = {
+  def groupByKey(partitioner: Partitioner): DStream[(K, Iterable[V])] = ssc.withScope {
     val createCombiner = (v: V) => ArrayBuffer[V](v)
     val mergeValue = (c: ArrayBuffer[V], v: V) => (c += v)
     val mergeCombiner = (c1: ArrayBuffer[V], c2: ArrayBuffer[V]) => (c1 ++ c2)
@@ -75,7 +75,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * merged using the associative reduce function. Hash partitioning is used to generate the RDDs
    * with Spark's default number of partitions.
    */
-  def reduceByKey(reduceFunc: (V, V) => V): DStream[(K, V)] = {
+  def reduceByKey(reduceFunc: (V, V) => V): DStream[(K, V)] = ssc.withScope {
     reduceByKey(reduceFunc, defaultPartitioner())
   }
 
@@ -84,7 +84,9 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * merged using the supplied reduce function. Hash partitioning is used to generate the RDDs
    * with `numPartitions` partitions.
    */
-  def reduceByKey(reduceFunc: (V, V) => V, numPartitions: Int): DStream[(K, V)] = {
+  def reduceByKey(
+      reduceFunc: (V, V) => V,
+      numPartitions: Int): DStream[(K, V)] = ssc.withScope {
     reduceByKey(reduceFunc, defaultPartitioner(numPartitions))
   }
 
@@ -93,7 +95,9 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * merged using the supplied reduce function. org.apache.spark.Partitioner is used to control
    * the partitioning of each RDD.
    */
-  def reduceByKey(reduceFunc: (V, V) => V, partitioner: Partitioner): DStream[(K, V)] = {
+  def reduceByKey(
+      reduceFunc: (V, V) => V,
+      partitioner: Partitioner): DStream[(K, V)] = ssc.withScope {
     val cleanedReduceFunc = ssc.sc.clean(reduceFunc)
     combineByKey((v: V) => v, cleanedReduceFunc, cleanedReduceFunc, partitioner)
   }
@@ -104,11 +108,11 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * org.apache.spark.rdd.PairRDDFunctions in the Spark core documentation for more information.
    */
   def combineByKey[C: ClassTag](
-    createCombiner: V => C,
-    mergeValue: (C, V) => C,
-    mergeCombiner: (C, C) => C,
-    partitioner: Partitioner,
-    mapSideCombine: Boolean = true): DStream[(K, C)] = {
+      createCombiner: V => C,
+      mergeValue: (C, V) => C,
+      mergeCombiner: (C, C) => C,
+      partitioner: Partitioner,
+      mapSideCombine: Boolean = true): DStream[(K, C)] = ssc.withScope {
     new ShuffledDStream[K, V, C](self, createCombiner, mergeValue, mergeCombiner, partitioner,
       mapSideCombine)
   }
@@ -121,7 +125,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * @param windowDuration width of the window; must be a multiple of this DStream's
    *                       batching interval
    */
-  def groupByKeyAndWindow(windowDuration: Duration): DStream[(K, Iterable[V])] = {
+  def groupByKeyAndWindow(windowDuration: Duration): DStream[(K, Iterable[V])] = ssc.withScope {
     groupByKeyAndWindow(windowDuration, self.slideDuration, defaultPartitioner())
   }
 
@@ -136,8 +140,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    *                       DStream's batching interval
    */
   def groupByKeyAndWindow(windowDuration: Duration, slideDuration: Duration)
-      : DStream[(K, Iterable[V])] =
-  {
+      : DStream[(K, Iterable[V])] = ssc.withScope {
     groupByKeyAndWindow(windowDuration, slideDuration, defaultPartitioner())
   }
 
@@ -157,7 +160,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       windowDuration: Duration,
       slideDuration: Duration,
       numPartitions: Int
-    ): DStream[(K, Iterable[V])] = {
+    ): DStream[(K, Iterable[V])] = ssc.withScope {
     groupByKeyAndWindow(windowDuration, slideDuration, defaultPartitioner(numPartitions))
   }
 
@@ -176,7 +179,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       windowDuration: Duration,
       slideDuration: Duration,
       partitioner: Partitioner
-    ): DStream[(K, Iterable[V])] = {
+    ): DStream[(K, Iterable[V])] = ssc.withScope {
     val createCombiner = (v: Iterable[V]) => new ArrayBuffer[V] ++= v
     val mergeValue = (buf: ArrayBuffer[V], v: Iterable[V]) => buf ++= v
     val mergeCombiner = (buf1: ArrayBuffer[V], buf2: ArrayBuffer[V]) => buf1 ++= buf2
@@ -198,7 +201,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def reduceByKeyAndWindow(
       reduceFunc: (V, V) => V,
       windowDuration: Duration
-    ): DStream[(K, V)] = {
+    ): DStream[(K, V)] = ssc.withScope {
     reduceByKeyAndWindow(reduceFunc, windowDuration, self.slideDuration, defaultPartitioner())
   }
 
@@ -217,7 +220,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       reduceFunc: (V, V) => V,
       windowDuration: Duration,
       slideDuration: Duration
-    ): DStream[(K, V)] = {
+    ): DStream[(K, V)] = ssc.withScope {
     reduceByKeyAndWindow(reduceFunc, windowDuration, slideDuration, defaultPartitioner())
   }
 
@@ -238,7 +241,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       windowDuration: Duration,
       slideDuration: Duration,
       numPartitions: Int
-    ): DStream[(K, V)] = {
+    ): DStream[(K, V)] = ssc.withScope {
     reduceByKeyAndWindow(reduceFunc, windowDuration, slideDuration,
       defaultPartitioner(numPartitions))
   }
@@ -260,7 +263,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       windowDuration: Duration,
       slideDuration: Duration,
       partitioner: Partitioner
-    ): DStream[(K, V)] = {
+    ): DStream[(K, V)] = ssc.withScope {
     val cleanedReduceFunc = ssc.sc.clean(reduceFunc)
     self.reduceByKey(cleanedReduceFunc, partitioner)
         .window(windowDuration, slideDuration)
@@ -294,8 +297,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       slideDuration: Duration = self.slideDuration,
       numPartitions: Int = ssc.sc.defaultParallelism,
       filterFunc: ((K, V)) => Boolean = null
-    ): DStream[(K, V)] = {
-
+    ): DStream[(K, V)] = ssc.withScope {
     reduceByKeyAndWindow(
       reduceFunc, invReduceFunc, windowDuration,
       slideDuration, defaultPartitioner(numPartitions), filterFunc
@@ -328,7 +330,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       slideDuration: Duration,
       partitioner: Partitioner,
       filterFunc: ((K, V)) => Boolean
-    ): DStream[(K, V)] = {
+    ): DStream[(K, V)] = ssc.withScope {
 
     val cleanedReduceFunc = ssc.sc.clean(reduceFunc)
     val cleanedInvReduceFunc = ssc.sc.clean(invReduceFunc)
@@ -349,7 +351,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    */
   def updateStateByKey[S: ClassTag](
       updateFunc: (Seq[V], Option[S]) => Option[S]
-    ): DStream[(K, S)] = {
+    ): DStream[(K, S)] = ssc.withScope {
     updateStateByKey(updateFunc, defaultPartitioner())
   }
 
@@ -365,7 +367,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def updateStateByKey[S: ClassTag](
       updateFunc: (Seq[V], Option[S]) => Option[S],
       numPartitions: Int
-    ): DStream[(K, S)] = {
+    ): DStream[(K, S)] = ssc.withScope {
     updateStateByKey(updateFunc, defaultPartitioner(numPartitions))
   }
 
@@ -382,7 +384,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def updateStateByKey[S: ClassTag](
       updateFunc: (Seq[V], Option[S]) => Option[S],
       partitioner: Partitioner
-    ): DStream[(K, S)] = {
+    ): DStream[(K, S)] = ssc.withScope {
     val newUpdateFunc = (iterator: Iterator[(K, Seq[V], Option[S])]) => {
       iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s)))
     }
@@ -406,7 +408,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       updateFunc: (Iterator[(K, Seq[V], Option[S])]) => Iterator[(K, S)],
       partitioner: Partitioner,
       rememberPartitioner: Boolean
-    ): DStream[(K, S)] = {
+    ): DStream[(K, S)] = ssc.withScope {
      new StateDStream(self, ssc.sc.clean(updateFunc), partitioner, rememberPartitioner, None)
   }
 
@@ -425,7 +427,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       updateFunc: (Seq[V], Option[S]) => Option[S],
       partitioner: Partitioner,
       initialRDD: RDD[(K, S)]
-    ): DStream[(K, S)] = {
+    ): DStream[(K, S)] = ssc.withScope {
     val newUpdateFunc = (iterator: Iterator[(K, Seq[V], Option[S])]) => {
       iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s)))
     }
@@ -451,7 +453,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       partitioner: Partitioner,
       rememberPartitioner: Boolean,
       initialRDD: RDD[(K, S)]
-    ): DStream[(K, S)] = {
+    ): DStream[(K, S)] = ssc.withScope {
      new StateDStream(self, ssc.sc.clean(updateFunc), partitioner,
        rememberPartitioner, Some(initialRDD))
   }
@@ -460,7 +462,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * Return a new DStream by applying a map function to the value of each key-value pairs in
    * 'this' DStream without changing the key.
    */
-  def mapValues[U: ClassTag](mapValuesFunc: V => U): DStream[(K, U)] = {
+  def mapValues[U: ClassTag](mapValuesFunc: V => U): DStream[(K, U)] = ssc.withScope {
     new MapValuedDStream[K, V, U](self, mapValuesFunc)
   }
 
@@ -470,7 +472,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    */
   def flatMapValues[U: ClassTag](
       flatMapValuesFunc: V => TraversableOnce[U]
-    ): DStream[(K, U)] = {
+    ): DStream[(K, U)] = ssc.withScope {
     new FlatMapValuedDStream[K, V, U](self, flatMapValuesFunc)
   }
 
@@ -479,7 +481,8 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * Hash partitioning is used to generate the RDDs with Spark's default number
    * of partitions.
    */
-  def cogroup[W: ClassTag](other: DStream[(K, W)]): DStream[(K, (Iterable[V], Iterable[W]))] = {
+  def cogroup[W: ClassTag](
+      other: DStream[(K, W)]): DStream[(K, (Iterable[V], Iterable[W]))] = ssc.withScope {
     cogroup(other, defaultPartitioner())
   }
 
@@ -487,8 +490,9 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * Return a new DStream by applying 'cogroup' between RDDs of `this` DStream and `other` DStream.
    * Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
    */
-  def cogroup[W: ClassTag](other: DStream[(K, W)], numPartitions: Int)
-  : DStream[(K, (Iterable[V], Iterable[W]))] = {
+  def cogroup[W: ClassTag](
+      other: DStream[(K, W)],
+      numPartitions: Int): DStream[(K, (Iterable[V], Iterable[W]))] = ssc.withScope {
     cogroup(other, defaultPartitioner(numPartitions))
   }
 
@@ -499,7 +503,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def cogroup[W: ClassTag](
       other: DStream[(K, W)],
       partitioner: Partitioner
-    ): DStream[(K, (Iterable[V], Iterable[W]))] = {
+    ): DStream[(K, (Iterable[V], Iterable[W]))] = ssc.withScope {
     self.transformWith(
       other,
       (rdd1: RDD[(K, V)], rdd2: RDD[(K, W)]) => rdd1.cogroup(rdd2, partitioner)
@@ -510,7 +514,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * Return a new DStream by applying 'join' between RDDs of `this` DStream and `other` DStream.
    * Hash partitioning is used to generate the RDDs with Spark's default number of partitions.
    */
-  def join[W: ClassTag](other: DStream[(K, W)]): DStream[(K, (V, W))] = {
+  def join[W: ClassTag](other: DStream[(K, W)]): DStream[(K, (V, W))] = ssc.withScope {
     join[W](other, defaultPartitioner())
   }
 
@@ -518,7 +522,9 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * Return a new DStream by applying 'join' between RDDs of `this` DStream and `other` DStream.
    * Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
    */
-  def join[W: ClassTag](other: DStream[(K, W)], numPartitions: Int): DStream[(K, (V, W))] = {
+  def join[W: ClassTag](
+      other: DStream[(K, W)],
+      numPartitions: Int): DStream[(K, (V, W))] = ssc.withScope {
     join[W](other, defaultPartitioner(numPartitions))
   }
 
@@ -529,7 +535,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def join[W: ClassTag](
       other: DStream[(K, W)],
       partitioner: Partitioner
-    ): DStream[(K, (V, W))] = {
+    ): DStream[(K, (V, W))] = ssc.withScope {
     self.transformWith(
       other,
       (rdd1: RDD[(K, V)], rdd2: RDD[(K, W)]) => rdd1.join(rdd2, partitioner)
@@ -541,7 +547,8 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * `other` DStream. Hash partitioning is used to generate the RDDs with Spark's default
    * number of partitions.
    */
-  def leftOuterJoin[W: ClassTag](other: DStream[(K, W)]): DStream[(K, (V, Option[W]))] = {
+  def leftOuterJoin[W: ClassTag](
+      other: DStream[(K, W)]): DStream[(K, (V, Option[W]))] = ssc.withScope {
     leftOuterJoin[W](other, defaultPartitioner())
   }
 
@@ -553,7 +560,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def leftOuterJoin[W: ClassTag](
       other: DStream[(K, W)],
       numPartitions: Int
-    ): DStream[(K, (V, Option[W]))] = {
+    ): DStream[(K, (V, Option[W]))] = ssc.withScope {
     leftOuterJoin[W](other, defaultPartitioner(numPartitions))
   }
 
@@ -565,7 +572,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def leftOuterJoin[W: ClassTag](
       other: DStream[(K, W)],
       partitioner: Partitioner
-    ): DStream[(K, (V, Option[W]))] = {
+    ): DStream[(K, (V, Option[W]))] = ssc.withScope {
     self.transformWith(
       other,
       (rdd1: RDD[(K, V)], rdd2: RDD[(K, W)]) => rdd1.leftOuterJoin(rdd2, partitioner)
@@ -577,7 +584,8 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * `other` DStream. Hash partitioning is used to generate the RDDs with Spark's default
    * number of partitions.
    */
-  def rightOuterJoin[W: ClassTag](other: DStream[(K, W)]): DStream[(K, (Option[V], W))] = {
+  def rightOuterJoin[W: ClassTag](
+      other: DStream[(K, W)]): DStream[(K, (Option[V], W))] = ssc.withScope {
     rightOuterJoin[W](other, defaultPartitioner())
   }
 
@@ -589,7 +597,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def rightOuterJoin[W: ClassTag](
       other: DStream[(K, W)],
       numPartitions: Int
-    ): DStream[(K, (Option[V], W))] = {
+    ): DStream[(K, (Option[V], W))] = ssc.withScope {
     rightOuterJoin[W](other, defaultPartitioner(numPartitions))
   }
 
@@ -601,7 +609,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def rightOuterJoin[W: ClassTag](
       other: DStream[(K, W)],
       partitioner: Partitioner
-    ): DStream[(K, (Option[V], W))] = {
+    ): DStream[(K, (Option[V], W))] = ssc.withScope {
     self.transformWith(
       other,
       (rdd1: RDD[(K, V)], rdd2: RDD[(K, W)]) => rdd1.rightOuterJoin(rdd2, partitioner)
@@ -613,7 +621,8 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * `other` DStream. Hash partitioning is used to generate the RDDs with Spark's default
    * number of partitions.
    */
-  def fullOuterJoin[W: ClassTag](other: DStream[(K, W)]): DStream[(K, (Option[V], Option[W]))] = {
+  def fullOuterJoin[W: ClassTag](
+      other: DStream[(K, W)]): DStream[(K, (Option[V], Option[W]))] = ssc.withScope {
     fullOuterJoin[W](other, defaultPartitioner())
   }
 
@@ -625,7 +634,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def fullOuterJoin[W: ClassTag](
       other: DStream[(K, W)],
       numPartitions: Int
-    ): DStream[(K, (Option[V], Option[W]))] = {
+    ): DStream[(K, (Option[V], Option[W]))] = ssc.withScope {
     fullOuterJoin[W](other, defaultPartitioner(numPartitions))
   }
 
@@ -637,7 +646,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def fullOuterJoin[W: ClassTag](
       other: DStream[(K, W)],
       partitioner: Partitioner
-    ): DStream[(K, (Option[V], Option[W]))] = {
+    ): DStream[(K, (Option[V], Option[W]))] = ssc.withScope {
     self.transformWith(
       other,
       (rdd1: RDD[(K, V)], rdd2: RDD[(K, W)]) => rdd1.fullOuterJoin(rdd2, partitioner)
@@ -651,7 +660,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def saveAsHadoopFiles[F <: OutputFormat[K, V]](
       prefix: String,
       suffix: String
-    )(implicit fm: ClassTag[F]) {
+    )(implicit fm: ClassTag[F]): Unit = ssc.withScope {
     saveAsHadoopFiles(prefix, suffix, keyClass, valueClass,
       fm.runtimeClass.asInstanceOf[Class[F]])
   }
@@ -667,7 +676,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       valueClass: Class[_],
       outputFormatClass: Class[_ <: OutputFormat[_, _]],
       conf: JobConf = new JobConf(ssc.sparkContext.hadoopConfiguration)
-    ) {
+    ): Unit = ssc.withScope {
     // Wrap conf in SerializableWritable so that ForeachDStream can be serialized for checkpoints
     val serializableConf = new SerializableWritable(conf)
     val saveFunc = (rdd: RDD[(K, V)], time: Time) => {
@@ -684,7 +693,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def saveAsNewAPIHadoopFiles[F <: NewOutputFormat[K, V]](
       prefix: String,
       suffix: String
-    )(implicit fm: ClassTag[F])  {
+    )(implicit fm: ClassTag[F]): Unit = ssc.withScope {
     saveAsNewAPIHadoopFiles(prefix, suffix, keyClass, valueClass,
       fm.runtimeClass.asInstanceOf[Class[F]])
   }
@@ -700,7 +709,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       valueClass: Class[_],
       outputFormatClass: Class[_ <: NewOutputFormat[_, _]],
       conf: Configuration = ssc.sparkContext.hadoopConfiguration
-    ) {
+    ): Unit = ssc.withScope {
     // Wrap conf in SerializableWritable so that ForeachDStream can be serialized for checkpoints
     val serializableConf = new SerializableWritable(conf)
     val saveFunc = (rdd: RDD[(K, V)], time: Time) => {
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala
new file mode 100644
index 0000000000000..392933102097e
--- /dev/null
+++ b/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming
+
+import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
+
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.{RDD, RDDOperationScope}
+import org.apache.spark.streaming.dstream.{DStream, InputDStream}
+import org.apache.spark.streaming.ui.UIUtils
+
+/**
+ * Tests whether scope information is passed from DStream operations to RDDs correctly.
+ */
+class DStreamScopeSuite extends FunSuite with BeforeAndAfter with BeforeAndAfterAll {
+  private var ssc: StreamingContext = null
+  private val batchDuration: Duration = Seconds(1)
+
+  override def beforeAll(): Unit = {
+    ssc = new StreamingContext(new SparkContext("local", "test"), batchDuration)
+  }
+
+  override def afterAll(): Unit = {
+    ssc.stop(stopSparkContext = true)
+  }
+
+  before { assertPropertiesNotSet() }
+  after { assertPropertiesNotSet() }
+
+  test("dstream without scope") {
+    val dummyStream = new DummyDStream(ssc)
+    dummyStream.initialize(Time(0))
+
+    // This DStream is not instantiated in any scope, so all RDDs
+    // created by this stream should similarly not have a scope
+    assert(dummyStream.baseScope === None)
+    assert(dummyStream.getOrCompute(Time(1000)).get.scope === None)
+    assert(dummyStream.getOrCompute(Time(2000)).get.scope === None)
+    assert(dummyStream.getOrCompute(Time(3000)).get.scope === None)
+  }
+
+  test("input dstream without scope") {
+    val inputStream = new DummyInputDStream(ssc)
+    inputStream.initialize(Time(0))
+
+    val baseScope = inputStream.baseScope.map(RDDOperationScope.fromJson)
+    val scope1 = inputStream.getOrCompute(Time(1000)).get.scope
+    val scope2 = inputStream.getOrCompute(Time(2000)).get.scope
+    val scope3 = inputStream.getOrCompute(Time(3000)).get.scope
+
+    // This DStream is not instantiated in any scope, so all RDDs
+    assertDefined(baseScope, scope1, scope2, scope3)
+    assert(baseScope.get.name.startsWith("dummy stream"))
+    assertScopeCorrect(baseScope.get, scope1.get, 1000)
+    assertScopeCorrect(baseScope.get, scope2.get, 2000)
+    assertScopeCorrect(baseScope.get, scope3.get, 3000)
+  }
+
+  test("scoping simple operations") {
+    val inputStream = new DummyInputDStream(ssc)
+    val mappedStream = inputStream.map { i => i + 1 }
+    val filteredStream = mappedStream.filter { i => i % 2 == 0 }
+    filteredStream.initialize(Time(0))
+
+    val mappedScopeBase = mappedStream.baseScope.map(RDDOperationScope.fromJson)
+    val mappedScope1 = mappedStream.getOrCompute(Time(1000)).get.scope
+    val mappedScope2 = mappedStream.getOrCompute(Time(2000)).get.scope
+    val mappedScope3 = mappedStream.getOrCompute(Time(3000)).get.scope
+    val filteredScopeBase = filteredStream.baseScope.map(RDDOperationScope.fromJson)
+    val filteredScope1 = filteredStream.getOrCompute(Time(1000)).get.scope
+    val filteredScope2 = filteredStream.getOrCompute(Time(2000)).get.scope
+    val filteredScope3 = filteredStream.getOrCompute(Time(3000)).get.scope
+
+    // These streams are defined in their respective scopes "map" and "filter", so all
+    // RDDs created by these streams should inherit the IDs and names of their parent
+    // DStream's base scopes
+    assertDefined(mappedScopeBase, mappedScope1, mappedScope2, mappedScope3)
+    assertDefined(filteredScopeBase, filteredScope1, filteredScope2, filteredScope3)
+    assert(mappedScopeBase.get.name === "map")
+    assert(filteredScopeBase.get.name === "filter")
+    assertScopeCorrect(mappedScopeBase.get, mappedScope1.get, 1000)
+    assertScopeCorrect(mappedScopeBase.get, mappedScope2.get, 2000)
+    assertScopeCorrect(mappedScopeBase.get, mappedScope3.get, 3000)
+    assertScopeCorrect(filteredScopeBase.get, filteredScope1.get, 1000)
+    assertScopeCorrect(filteredScopeBase.get, filteredScope2.get, 2000)
+    assertScopeCorrect(filteredScopeBase.get, filteredScope3.get, 3000)
+  }
+
+  test("scoping nested operations") {
+    val inputStream = new DummyInputDStream(ssc)
+    val countStream = inputStream.countByWindow(Seconds(10), Seconds(1))
+    countStream.initialize(Time(0))
+
+    val countScopeBase = countStream.baseScope.map(RDDOperationScope.fromJson)
+    val countScope1 = countStream.getOrCompute(Time(1000)).get.scope
+    val countScope2 = countStream.getOrCompute(Time(2000)).get.scope
+    val countScope3 = countStream.getOrCompute(Time(3000)).get.scope
+
+    // Assert that all children RDDs inherit the DStream operation name correctly
+    assertDefined(countScopeBase, countScope1, countScope2, countScope3)
+    assert(countScopeBase.get.name === "countByWindow")
+    assertScopeCorrect(countScopeBase.get, countScope1.get, 1000)
+    assertScopeCorrect(countScopeBase.get, countScope2.get, 2000)
+    assertScopeCorrect(countScopeBase.get, countScope3.get, 3000)
+
+    // All streams except the input stream should share the same scopes as `countStream`
+    def testStream(stream: DStream[_]): Unit = {
+      if (stream != inputStream) {
+        val myScopeBase = stream.baseScope.map(RDDOperationScope.fromJson)
+        val myScope1 = stream.getOrCompute(Time(1000)).get.scope
+        val myScope2 = stream.getOrCompute(Time(2000)).get.scope
+        val myScope3 = stream.getOrCompute(Time(3000)).get.scope
+        assertDefined(myScopeBase, myScope1, myScope2, myScope3)
+        assert(myScopeBase === countScopeBase)
+        assert(myScope1 === countScope1)
+        assert(myScope2 === countScope2)
+        assert(myScope3 === countScope3)
+        // Climb upwards to test the parent streams
+        stream.dependencies.foreach(testStream)
+      }
+    }
+    testStream(countStream)
+  }
+
+  /** Assert that the RDD operation scope properties are not set in our SparkContext. */
+  private def assertPropertiesNotSet(): Unit = {
+    assert(ssc != null)
+    assert(ssc.sc.getLocalProperty(SparkContext.RDD_SCOPE_KEY) == null)
+    assert(ssc.sc.getLocalProperty(SparkContext.RDD_SCOPE_NO_OVERRIDE_KEY) == null)
+  }
+
+  /** Assert that the given RDD scope inherits the name and ID of the base scope correctly. */
+  private def assertScopeCorrect(
+      baseScope: RDDOperationScope,
+      rddScope: RDDOperationScope,
+      batchTime: Long): Unit = {
+    assertScopeCorrect(baseScope.id, baseScope.name, rddScope, batchTime)
+  }
+
+  /** Assert that the given RDD scope inherits the base name and ID correctly. */
+  private def assertScopeCorrect(
+      baseScopeId: String,
+      baseScopeName: String,
+      rddScope: RDDOperationScope,
+      batchTime: Long): Unit = {
+    val formattedBatchTime = UIUtils.formatBatchTime(
+      batchTime, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
+    assert(rddScope.id === s"${baseScopeId}_$batchTime")
+    assert(rddScope.name.replaceAll("\\n", " ") === s"$baseScopeName @ $formattedBatchTime")
+  }
+
+  /** Assert that all the specified options are defined. */
+  private def assertDefined[T](options: Option[T]*): Unit = {
+    options.zipWithIndex.foreach { case (o, i) => assert(o.isDefined, s"Option $i was empty!") }
+  }
+
+}
+
+/**
+ * A dummy stream that does absolutely nothing.
+ */
+private class DummyDStream(ssc: StreamingContext) extends DStream[Int](ssc) {
+  override def dependencies: List[DStream[Int]] = List.empty
+  override def slideDuration: Duration = Seconds(1)
+  override def compute(time: Time): Option[RDD[Int]] = Some(ssc.sc.emptyRDD[Int])
+}
+
+/**
+ * A dummy input stream that does absolutely nothing.
+ */
+private class DummyInputDStream(ssc: StreamingContext) extends InputDStream[Int](ssc) {
+  override def start(): Unit = { }
+  override def stop(): Unit = { }
+  override def compute(time: Time): Option[RDD[Int]] = Some(ssc.sc.emptyRDD[Int])
+}

From 6525fc0ab0c953c3346d16db0334aaf01ade7ed5 Mon Sep 17 00:00:00 2001
From: Jihong MA <linlin200605@gmail.com>
Date: Mon, 18 May 2015 22:47:50 +0100
Subject: [PATCH 057/525] [SPARK-7063] when lz4 compression is used, it causes
 core dump

this fix is to solve one issue found in lz4 1.2.0, which caused core dump in Spark Core with IBM JDK.  that issue is fixed in lz4 1.3.0 version.

Author: Jihong MA <linlin200605@gmail.com>

Closes #6226 from JihongMA/SPARK-7063-1 and squashes the following commits:

0cca781 [Jihong MA] SPARK-7063
4559ed5 [Jihong MA] SPARK-7063
daa520f [Jihong MA] SPARK-7063 upgrade lz4 jars
71738ee [Jihong MA] Merge remote-tracking branch 'upstream/master'
dfaa971 [Jihong MA] SPARK-7265 minor fix of the content
ace454d [Jihong MA] SPARK-7265 take out PySpark on YARN limitation
9ea0832 [Jihong MA] Merge remote-tracking branch 'upstream/master'
d5bf3f5 [Jihong MA] Merge remote-tracking branch 'upstream/master'
7b842e6 [Jihong MA] Merge remote-tracking branch 'upstream/master'
9c84695 [Jihong MA] SPARK-7265 address review comment
a399aa6 [Jihong MA] SPARK-7265 Improving documentation for Spark SQL Hive support
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 6f525b6ac81a3..c72d7cbf843ef 100644
--- a/pom.xml
+++ b/pom.xml
@@ -495,7 +495,7 @@
       <dependency>
         <groupId>net.jpountz.lz4</groupId>
         <artifactId>lz4</artifactId>
-        <version>1.2.0</version>
+        <version>1.3.0</version>
       </dependency>
       <dependency>
         <groupId>com.clearspring.analytics</groupId>

From eb4632f282d070e1dfd5ffed968fa212896137da Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Mon, 18 May 2015 15:24:31 -0700
Subject: [PATCH 058/525] [SQL] Fix serializability of ORC table scan

A follow-up to #6244.

Author: Michael Armbrust <michael@databricks.com>

Closes #6247 from marmbrus/fixOrcTests and squashes the following commits:

e39ee1b [Michael Armbrust] [SQL] Fix serializability of ORC table scan
---
 .../main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index e10d3a0b6846c..58b97adb46165 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -195,7 +195,7 @@ private[orc] case class OrcTableScan(
     attributes: Seq[Attribute],
     @transient relation: OrcRelation,
     filters: Array[Filter],
-    inputPaths: Array[FileStatus])
+    @transient inputPaths: Array[FileStatus])
   extends Logging
   with HiveInspectors {
 

From 4fb52f9545ae338fae2d3aeea4bfc35d5df44853 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 18 May 2015 16:55:45 -0700
Subject: [PATCH 059/525] [SPARK-7624] Revert #4147

Author: Davies Liu <davies@databricks.com>

Closes #6172 from davies/revert_4147 and squashes the following commits:

3bfbbde [Davies Liu] Revert #4147
---
 .../spark/scheduler/local/LocalBackend.scala  | 23 ++-----------------
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
index e64d06c4d3cfc..3078a1b10be8b 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
@@ -18,14 +18,12 @@
 package org.apache.spark.scheduler.local
 
 import java.nio.ByteBuffer
-import java.util.concurrent.TimeUnit
 
 import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv, TaskState}
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.executor.{Executor, ExecutorBackend}
-import org.apache.spark.rpc.{ThreadSafeRpcEndpoint, RpcCallContext, RpcEndpointRef, RpcEnv}
+import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint}
 import org.apache.spark.scheduler.{SchedulerBackend, TaskSchedulerImpl, WorkerOffer}
-import org.apache.spark.util.{ThreadUtils, Utils}
 
 private case class ReviveOffers()
 
@@ -47,9 +45,6 @@ private[spark] class LocalEndpoint(
     private val totalCores: Int)
   extends ThreadSafeRpcEndpoint with Logging {
 
-  private val reviveThread =
-    ThreadUtils.newDaemonSingleThreadScheduledExecutor("local-revive-thread")
-
   private var freeCores = totalCores
 
   private val localExecutorId = SparkContext.DRIVER_IDENTIFIER
@@ -79,27 +74,13 @@ private[spark] class LocalEndpoint(
       context.reply(true)
   }
 
-
   def reviveOffers() {
     val offers = Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores))
-    val tasks = scheduler.resourceOffers(offers).flatten
-    for (task <- tasks) {
+    for (task <- scheduler.resourceOffers(offers).flatten) {
       freeCores -= scheduler.CPUS_PER_TASK
       executor.launchTask(executorBackend, taskId = task.taskId, attemptNumber = task.attemptNumber,
         task.name, task.serializedTask)
     }
-    if (tasks.isEmpty && scheduler.activeTaskSets.nonEmpty) {
-      // Try to reviveOffer after 1 second, because scheduler may wait for locality timeout
-      reviveThread.schedule(new Runnable {
-        override def run(): Unit = Utils.tryLogNonFatalError {
-          Option(self).foreach(_.send(ReviveOffers))
-        }
-      }, 1000, TimeUnit.MILLISECONDS)
-    }
-  }
-
-  override def onStop(): Unit = {
-    reviveThread.shutdownNow()
   }
 }
 

From 0a7a94eab5fba3d2f2ef14a70c2c1bf4ee21b626 Mon Sep 17 00:00:00 2001
From: jerluc <jeremyalucas@gmail.com>
Date: Mon, 18 May 2015 18:13:29 -0700
Subject: [PATCH 060/525] [SPARK-7621] [STREAMING] Report Kafka errors to
 StreamingListeners

PR per [SPARK-7621](https://issues.apache.org/jira/browse/SPARK-7621), which makes both `KafkaReceiver` and `ReliableKafkaReceiver` report its errors to the `ReceiverTracker`, which in turn will add the events to the bus to fire off any registered `StreamingListener`s.

Author: jerluc <jeremyalucas@gmail.com>

Closes #6204 from jerluc/master and squashes the following commits:

82439a5 [jerluc] [SPARK-7621] [STREAMING] Report Kafka errors to StreamingListeners
---
 .../org/apache/spark/streaming/kafka/KafkaInputDStream.scala    | 2 +-
 .../apache/spark/streaming/kafka/ReliableKafkaReceiver.scala    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala
index cca0fac0234e1..04b2dc10d39ea 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala
@@ -135,7 +135,7 @@ class KafkaReceiver[
           store((msgAndMetadata.key, msgAndMetadata.message))
         }
       } catch {
-        case e: Throwable => logError("Error handling message; exiting", e)
+        case e: Throwable => reportError("Error handling message; exiting", e)
       }
     }
   }
diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala
index ea87e960379f1..75f0dfc22b9dc 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala
@@ -267,7 +267,7 @@ class ReliableKafkaReceiver[
           }
         } catch {
           case e: Exception =>
-            logError("Error handling message", e)
+            reportError("Error handling message", e)
         }
       }
     }

From 3a6003866ade45974b43a9e785ec35fb76a32b99 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Mon, 18 May 2015 18:24:15 -0700
Subject: [PATCH 061/525] [SPARK-7692] Updated Kinesis examples

- Updated Kinesis examples to use stable API
- Cleaned up comments, etc.
- Renamed KinesisWordCountProducerASL to KinesisWordProducerASL

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6249 from tdas/kinesis-examples and squashes the following commits:

7cc307b [Tathagata Das] More tweaks
f080872 [Tathagata Das] More cleanup
841987f [Tathagata Das] Small update
011cbe2 [Tathagata Das] More fixes
b0d74f9 [Tathagata Das] Updated examples.
---
 .../streaming/JavaKinesisWordCountASL.java    | 245 +++++++++--------
 .../streaming/KinesisWordCountASL.scala       | 260 ++++++++++--------
 2 files changed, 268 insertions(+), 237 deletions(-)

diff --git a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java
index b0bff27a61c19..06e0ff28afd95 100644
--- a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java
+++ b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java
@@ -20,6 +20,7 @@
 import java.util.List;
 import java.util.regex.Pattern;
 
+import com.amazonaws.regions.RegionUtils;
 import org.apache.log4j.Logger;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FlatMapFunction;
@@ -40,140 +41,146 @@
 import com.google.common.collect.Lists;
 
 /**
- * Java-friendly Kinesis Spark Streaming WordCount example
+ * Consumes messages from a Amazon Kinesis streams and does wordcount.
  *
- * See http://spark.apache.org/docs/latest/streaming-kinesis.html for more details
- * on the Kinesis Spark Streaming integration.
+ * This example spins up 1 Kinesis Receiver per shard for the given stream.
+ * It then starts pulling from the last checkpointed sequence number of the given stream.
  *
- * This example spins up 1 Kinesis Worker (Spark Streaming Receiver) per shard
- *   for the given stream.
- * It then starts pulling from the last checkpointed sequence number of the given
- *   <stream-name> and <endpoint-url>. 
+ * Usage: JavaKinesisWordCountASL [app-name] [stream-name] [endpoint-url] [region-name]
+ *   [app-name] is the name of the consumer app, used to track the read data in DynamoDB
+ *   [stream-name] name of the Kinesis stream (ie. mySparkStream)
+ *   [endpoint-url] endpoint of the Kinesis service
+ *     (e.g. https://kinesis.us-east-1.amazonaws.com)
  *
- * Valid endpoint urls:  http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region
- *
- * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials 
- *  in the following order of precedence: 
- *         Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
- *         Java System Properties - aws.accessKeyId and aws.secretKey
- *         Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
- *         Instance profile credentials - delivered through the Amazon EC2 metadata service
- *
- * Usage: JavaKinesisWordCountASL <stream-name> <endpoint-url>
- *         <stream-name> is the name of the Kinesis stream (ie. mySparkStream)
- *         <endpoint-url> is the endpoint of the Kinesis service 
- *           (ie. https://kinesis.us-east-1.amazonaws.com)
  *
  * Example:
- *      $ export AWS_ACCESS_KEY_ID=<your-access-key>
+ *      # export AWS keys if necessary
+ *      $ export AWS_ACCESS_KEY_ID=[your-access-key]
  *      $ export AWS_SECRET_KEY=<your-secret-key>
- *      $ $SPARK_HOME/bin/run-example \
- *            org.apache.spark.examples.streaming.JavaKinesisWordCountASL mySparkStream \
- *            https://kinesis.us-east-1.amazonaws.com
  *
- * Note that number of workers/threads should be 1 more than the number of receivers.
- * This leaves one thread available for actually processing the data.
+ *      # run the example
+ *      $ SPARK_HOME/bin/run-example   streaming.JavaKinesisWordCountASL myAppName  mySparkStream \
+ *             https://kinesis.us-east-1.amazonaws.com
+ *
+ * There is a companion helper class called KinesisWordProducerASL which puts dummy data
+ * onto the Kinesis stream.
  *
- * There is a companion helper class called KinesisWordCountProducerASL which puts dummy data 
- *   onto the Kinesis stream. 
- * Usage instructions for KinesisWordCountProducerASL are provided in the class definition.
+ * This code uses the DefaultAWSCredentialsProviderChain to find credentials
+ * in the following order:
+ *    Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
+ *    Java System Properties - aws.accessKeyId and aws.secretKey
+ *    Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
+ *    Instance profile credentials - delivered through the Amazon EC2 metadata service
+ * For more information, see
+ * http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html
+ *
+ * See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
+ * the Kinesis Spark Streaming integration.
  */
 public final class JavaKinesisWordCountASL { // needs to be public for access from run-example
-    private static final Pattern WORD_SEPARATOR = Pattern.compile(" ");
-    private static final Logger logger = Logger.getLogger(JavaKinesisWordCountASL.class);
-
-    /* Make the constructor private to enforce singleton */
-    private JavaKinesisWordCountASL() {
+  private static final Pattern WORD_SEPARATOR = Pattern.compile(" ");
+  private static final Logger logger = Logger.getLogger(JavaKinesisWordCountASL.class);
+
+  public static void main(String[] args) {
+    // Check that all required args were passed in.
+    if (args.length != 3) {
+      System.err.println(
+          "Usage: JavaKinesisWordCountASL <stream-name> <endpoint-url>\n\n" +
+          "    <app-name> is the name of the app, used to track the read data in DynamoDB\n" +
+          "    <stream-name> is the name of the Kinesis stream\n" +
+          "    <endpoint-url> is the endpoint of the Kinesis service\n" +
+          "                   (e.g. https://kinesis.us-east-1.amazonaws.com)\n" +
+          "Generate data for the Kinesis stream using the example KinesisWordProducerASL.\n" +
+          "See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more\n" +
+          "details.\n"
+      );
+      System.exit(1);
     }
 
-    public static void main(String[] args) {
-        /* Check that all required args were passed in. */
-        if (args.length < 2) {
-          System.err.println(
-              "Usage: JavaKinesisWordCountASL <stream-name> <endpoint-url>\n" +
-              "    <stream-name> is the name of the Kinesis stream\n" +
-              "    <endpoint-url> is the endpoint of the Kinesis service\n" +
-              "                   (e.g. https://kinesis.us-east-1.amazonaws.com)\n");
-          System.exit(1);
-        }
-
-        StreamingExamples.setStreamingLogLevels();
-
-        /* Populate the appropriate variables from the given args */
-        String streamName = args[0];
-        String endpointUrl = args[1];
-        /* Set the batch interval to a fixed 2000 millis (2 seconds) */
-        Duration batchInterval = new Duration(2000);
-
-        /* Create a Kinesis client in order to determine the number of shards for the given stream */
-        AmazonKinesisClient kinesisClient = new AmazonKinesisClient(
-                new DefaultAWSCredentialsProviderChain());
-        kinesisClient.setEndpoint(endpointUrl);
-
-        /* Determine the number of shards from the stream */
-        int numShards = kinesisClient.describeStream(streamName)
-                .getStreamDescription().getShards().size();
-
-        /* In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard */ 
-        int numStreams = numShards;
-
-        /* Setup the Spark config. */
-        SparkConf sparkConfig = new SparkConf().setAppName("KinesisWordCount");
-
-        /* Kinesis checkpoint interval.  Same as batchInterval for this example. */
-        Duration checkpointInterval = batchInterval;
+    // Set default log4j logging level to WARN to hide Spark logs
+    StreamingExamples.setStreamingLogLevels();
+
+    // Populate the appropriate variables from the given args
+    String kinesisAppName = args[0];
+    String streamName = args[1];
+    String endpointUrl = args[2];
+
+    // Create a Kinesis client in order to determine the number of shards for the given stream
+    AmazonKinesisClient kinesisClient =
+        new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain());
+    kinesisClient.setEndpoint(endpointUrl);
+    int numShards =
+        kinesisClient.describeStream(streamName).getStreamDescription().getShards().size();
+
+
+    // In this example, we're going to create 1 Kinesis Receiver/input DStream for each shard.
+    // This is not a necessity; if there are less receivers/DStreams than the number of shards,
+    // then the shards will be automatically distributed among the receivers and each receiver
+    // will receive data from multiple shards.
+    int numStreams = numShards;
+
+    // Spark Streaming batch interval
+    Duration batchInterval = new Duration(2000);
+
+    // Kinesis checkpoint interval.  Same as batchInterval for this example.
+    Duration kinesisCheckpointInterval = batchInterval;
+
+    // Get the region name from the endpoint URL to save Kinesis Client Library metadata in
+    // DynamoDB of the same region as the Kinesis stream
+    String regionName = RegionUtils.getRegionByEndpoint(endpointUrl).getName();
+
+    // Setup the Spark config and StreamingContext
+    SparkConf sparkConfig = new SparkConf().setAppName("JavaKinesisWordCountASL");
+    JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval);
+
+    // Create the Kinesis DStreams
+    List<JavaDStream<byte[]>> streamsList = new ArrayList<JavaDStream<byte[]>>(numStreams);
+    for (int i = 0; i < numStreams; i++) {
+      streamsList.add(
+          KinesisUtils.createStream(jssc, kinesisAppName, streamName, endpointUrl, regionName,
+              InitialPositionInStream.LATEST, kinesisCheckpointInterval, StorageLevel.MEMORY_AND_DISK_2())
+      );
+    }
 
-        /* Setup the StreamingContext */
-        JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval);
+    // Union all the streams if there is more than 1 stream
+    JavaDStream<byte[]> unionStreams;
+    if (streamsList.size() > 1) {
+      unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
+    } else {
+      // Otherwise, just use the 1 stream
+      unionStreams = streamsList.get(0);
+    }
 
-        /* Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */
-        List<JavaDStream<byte[]>> streamsList = new ArrayList<JavaDStream<byte[]>>(numStreams);
-        for (int i = 0; i < numStreams; i++) {
-          streamsList.add(
-            KinesisUtils.createStream(jssc, streamName, endpointUrl, checkpointInterval, 
-            InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2())
-          );
+    // Convert each line of Array[Byte] to String, and split into words
+    JavaDStream<String> words = unionStreams.flatMap(new FlatMapFunction<byte[], String>() {
+      @Override
+      public Iterable<String> call(byte[] line) {
+        return Lists.newArrayList(WORD_SEPARATOR.split(new String(line)));
+      }
+    });
+
+    // Map each word to a (word, 1) tuple so we can reduce by key to count the words
+    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
+        new PairFunction<String, String, Integer>() {
+          @Override
+          public Tuple2<String, Integer> call(String s) {
+            return new Tuple2<String, Integer>(s, 1);
+          }
         }
-
-        /* Union all the streams if there is more than 1 stream */
-        JavaDStream<byte[]> unionStreams;
-        if (streamsList.size() > 1) {
-            unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
-        } else {
-            /* Otherwise, just use the 1 stream */
-            unionStreams = streamsList.get(0);
+    ).reduceByKey(
+        new Function2<Integer, Integer, Integer>() {
+          @Override
+          public Integer call(Integer i1, Integer i2) {
+            return i1 + i2;
+          }
         }
+    );
 
-        /*
-         * Split each line of the union'd DStreams into multiple words using flatMap to produce the collection.
-         * Convert lines of byte[] to multiple Strings by first converting to String, then splitting on WORD_SEPARATOR.
-         */
-        JavaDStream<String> words = unionStreams.flatMap(new FlatMapFunction<byte[], String>() {
-                @Override
-                public Iterable<String> call(byte[] line) {
-                    return Lists.newArrayList(WORD_SEPARATOR.split(new String(line)));
-                }
-            });
-
-        /* Map each word to a (word, 1) tuple, then reduce/aggregate by word. */
-        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
-            new PairFunction<String, String, Integer>() {
-                @Override
-                public Tuple2<String, Integer> call(String s) {
-                    return new Tuple2<String, Integer>(s, 1);
-                }
-            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
-                @Override
-                public Integer call(Integer i1, Integer i2) {
-                  return i1 + i2;
-                }
-            });
-
-        /* Print the first 10 wordCounts */
-        wordCounts.print();
-
-        /* Start the streaming context and await termination */
-        jssc.start();
-        jssc.awaitTermination();
-    }
+    // Print the first 10 wordCounts
+    wordCounts.print();
+
+    // Start the streaming context and await termination
+    jssc.start();
+    jssc.awaitTermination();
+  }
 }
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
index 32da0858d1a1d..640ca049e2ec4 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
@@ -18,213 +18,238 @@
 package org.apache.spark.examples.streaming
 
 import java.nio.ByteBuffer
+
 import scala.util.Random
-import org.apache.spark.Logging
-import org.apache.spark.SparkConf
-import org.apache.spark.storage.StorageLevel
-import org.apache.spark.streaming.Milliseconds
-import org.apache.spark.streaming.StreamingContext
-import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions
-import org.apache.spark.streaming.kinesis.KinesisUtils
-import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
+
+import com.amazonaws.auth.{DefaultAWSCredentialsProviderChain, BasicAWSCredentials}
+import com.amazonaws.regions.RegionUtils
 import com.amazonaws.services.kinesis.AmazonKinesisClient
 import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
 import com.amazonaws.services.kinesis.model.PutRecordRequest
-import org.apache.log4j.Logger
-import org.apache.log4j.Level
+import org.apache.log4j.{Level, Logger}
+
+import org.apache.spark.{Logging, SparkConf}
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.streaming.{Milliseconds, StreamingContext}
+import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
+import org.apache.spark.streaming.kinesis.KinesisUtils
+
 
 /**
- * Kinesis Spark Streaming WordCount example.
+ * Consumes messages from a Amazon Kinesis streams and does wordcount.
  *
- * See http://spark.apache.org/docs/latest/streaming-kinesis.html for more details on
- *   the Kinesis Spark Streaming integration.
+ * This example spins up 1 Kinesis Receiver per shard for the given stream.
+ * It then starts pulling from the last checkpointed sequence number of the given stream.
  *
- * This example spins up 1 Kinesis Worker (Spark Streaming Receiver) per shard 
- *   for the given stream.
- * It then starts pulling from the last checkpointed sequence number of the given 
- *   <stream-name> and <endpoint-url>. 
+ * Usage: KinesisWordCountASL <app-name> <stream-name> <endpoint-url> <region-name>
+ *   <app-name> is the name of the consumer app, used to track the read data in DynamoDB
+ *   <stream-name> name of the Kinesis stream (ie. mySparkStream)
+ *   <endpoint-url> endpoint of the Kinesis service
+ *     (e.g. https://kinesis.us-east-1.amazonaws.com)
  *
- * Valid endpoint urls:  http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region
- * 
- * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials
- *   in the following order of precedence:
- * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
- * Java System Properties - aws.accessKeyId and aws.secretKey
- * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
- * Instance profile credentials - delivered through the Amazon EC2 metadata service
- *
- * Usage: KinesisWordCountASL <stream-name> <endpoint-url>
- *   <stream-name> is the name of the Kinesis stream (ie. mySparkStream)
- *   <endpoint-url> is the endpoint of the Kinesis service
- *     (ie. https://kinesis.us-east-1.amazonaws.com)
  *
  * Example:
- *    $ export AWS_ACCESS_KEY_ID=<your-access-key>
- *    $ export AWS_SECRET_KEY=<your-secret-key>
- *    $ $SPARK_HOME/bin/run-example \
- *        org.apache.spark.examples.streaming.KinesisWordCountASL mySparkStream \
- *        https://kinesis.us-east-1.amazonaws.com
+ *      # export AWS keys if necessary
+ *      $ export AWS_ACCESS_KEY_ID=<your-access-key>
+ *      $ export AWS_SECRET_KEY=<your-secret-key>
+ *
+ *      # run the example
+ *      $ SPARK_HOME/bin/run-example  streaming.KinesisWordCountASL myAppName  mySparkStream \
+ *              https://kinesis.us-east-1.amazonaws.com
  *
- * 
- * Note that number of workers/threads should be 1 more than the number of receivers.
- * This leaves one thread available for actually processing the data.
+ * There is a companion helper class called KinesisWordProducerASL which puts dummy data
+ * onto the Kinesis stream.
  *
- * There is a companion helper class below called KinesisWordCountProducerASL which puts
- *   dummy data onto the Kinesis stream.
- * Usage instructions for KinesisWordCountProducerASL are provided in that class definition.
+ * This code uses the DefaultAWSCredentialsProviderChain to find credentials
+ * in the following order:
+ *    Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
+ *    Java System Properties - aws.accessKeyId and aws.secretKey
+ *    Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
+ *    Instance profile credentials - delivered through the Amazon EC2 metadata service
+ * For more information, see
+ * http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html
+ *
+ * See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
+ * the Kinesis Spark Streaming integration.
  */
-private object KinesisWordCountASL extends Logging {
+object KinesisWordCountASL extends Logging {
   def main(args: Array[String]) {
-    /* Check that all required args were passed in. */
-    if (args.length < 2) {
+    // Check that all required args were passed in.
+    if (args.length != 3) {
       System.err.println(
         """
-          |Usage: KinesisWordCount <stream-name> <endpoint-url>
+          |Usage: KinesisWordCountASL <app-name> <stream-name> <endpoint-url> <region-name>
+          |
+          |    <app-name> is the name of the consumer app, used to track the read data in DynamoDB
           |    <stream-name> is the name of the Kinesis stream
           |    <endpoint-url> is the endpoint of the Kinesis service
           |                   (e.g. https://kinesis.us-east-1.amazonaws.com)
+          |
+          |Generate input data for Kinesis stream using the example KinesisWordProducerASL.
+          |See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more
+          |details.
         """.stripMargin)
       System.exit(1)
     }
 
     StreamingExamples.setStreamingLogLevels()
 
-    /* Populate the appropriate variables from the given args */
-    val Array(streamName, endpointUrl) = args
+    // Populate the appropriate variables from the given args
+    val Array(appName, streamName, endpointUrl) = args
 
-    /* Determine the number of shards from the stream */
-    val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain())
+
+    // Determine the number of shards from the stream using the low-level Kinesis Client
+    // from the AWS Java SDK.
+    val credentials = new DefaultAWSCredentialsProviderChain().getCredentials()
+    require(credentials != null,
+      "No AWS credentials found. Please specify credentials using one of the methods specified " +
+        "in http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html")
+    val kinesisClient = new AmazonKinesisClient(credentials)
     kinesisClient.setEndpoint(endpointUrl)
-    val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards()
-      .size()
+    val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards().size
+
 
-    /* In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard. */
+    // In this example, we're going to create 1 Kinesis Receiver/input DStream for each shard.
+    // This is not a necessity; if there are less receivers/DStreams than the number of shards,
+    // then the shards will be automatically distributed among the receivers and each receiver
+    // will receive data from multiple shards.
     val numStreams = numShards
 
-    /* Setup the and SparkConfig and StreamingContext */
-    /* Spark Streaming batch interval */
+    // Spark Streaming batch interval
     val batchInterval = Milliseconds(2000)
-    val sparkConfig = new SparkConf().setAppName("KinesisWordCount")
-    val ssc = new StreamingContext(sparkConfig, batchInterval)
 
-    /* Kinesis checkpoint interval.  Same as batchInterval for this example. */
+    // Kinesis checkpoint interval is the interval at which the DynamoDB is updated with information
+    //on sequence number of records that have been received. Same as batchInterval for this example.
     val kinesisCheckpointInterval = batchInterval
 
-    /* Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */
+    // Get the region name from the endpoint URL to save Kinesis Client Library metadata in
+    // DynamoDB of the same region as the Kinesis stream
+    val regionName = RegionUtils.getRegionByEndpoint(endpointUrl).getName()
+
+    // Setup the SparkConfig and StreamingContext
+    val sparkConfig = new SparkConf().setAppName("KinesisWordCountASL")
+    val ssc = new StreamingContext(sparkConfig, batchInterval)
+
+    // Create the Kinesis DStreams
     val kinesisStreams = (0 until numStreams).map { i =>
-      KinesisUtils.createStream(ssc, streamName, endpointUrl, kinesisCheckpointInterval,
-          InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2)
+      KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName,
+        InitialPositionInStream.LATEST, kinesisCheckpointInterval, StorageLevel.MEMORY_AND_DISK_2)
     }
 
-    /* Union all the streams */
+    // Union all the streams
     val unionStreams = ssc.union(kinesisStreams)
 
-    /* Convert each line of Array[Byte] to String, split into words, and count them */
-    val words = unionStreams.flatMap(byteArray => new String(byteArray)
-      .split(" "))
+    // Convert each line of Array[Byte] to String, and split into words
+    val words = unionStreams.flatMap(byteArray => new String(byteArray).split(" "))
 
-    /* Map each word to a (word, 1) tuple so we can reduce/aggregate by key. */
+    // Map each word to a (word, 1) tuple so we can reduce by key to count the words
     val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
-
-    /* Print the first 10 wordCounts */
+ 
+    // Print the first 10 wordCounts
     wordCounts.print()
 
-    /* Start the streaming context and await termination */
+    // Start the streaming context and await termination
     ssc.start()
     ssc.awaitTermination()
   }
 }
 
 /**
- * Usage: KinesisWordCountProducerASL <stream-name> <kinesis-endpoint-url>
- *     <recordsPerSec> <wordsPerRecord>
+ * Usage: KinesisWordProducerASL <stream-name> <endpoint-url> \
+ *   <records-per-sec> <words-per-record>
+ *
  *   <stream-name> is the name of the Kinesis stream (ie. mySparkStream)
- *   <kinesis-endpoint-url> is the endpoint of the Kinesis service
+ *   <endpoint-url> is the endpoint of the Kinesis service
  *     (ie. https://kinesis.us-east-1.amazonaws.com)
  *   <records-per-sec> is the rate of records per second to put onto the stream
  *   <words-per-record> is the rate of records per second to put onto the stream
  *
  * Example:
- *    $ export AWS_ACCESS_KEY_ID=<your-access-key>
- *    $ export AWS_SECRET_KEY=<your-secret-key>
- *    $ $SPARK_HOME/bin/run-example \
- *         org.apache.spark.examples.streaming.KinesisWordCountProducerASL mySparkStream \
- *         https://kinesis.us-east-1.amazonaws.com 10 5
+ *    $ SPARK_HOME/bin/run-example streaming.KinesisWordProducerASL mySparkStream \
+ *         https://kinesis.us-east-1.amazonaws.com us-east-1 10 5
  */
-private object KinesisWordCountProducerASL {
+object KinesisWordProducerASL {
   def main(args: Array[String]) {
-    if (args.length < 4) {
-      System.err.println("Usage: KinesisWordCountProducerASL <stream-name> <endpoint-url>" +
-          " <records-per-sec> <words-per-record>")
+    if (args.length != 4) {
+      System.err.println(
+        """
+          |Usage: KinesisWordProducerASL <stream-name> <endpoint-url> <records-per-sec> <words-per-record>
+          |
+          |    <stream-name> is the name of the Kinesis stream
+          |    <endpoint-url> is the endpoint of the Kinesis service
+          |                   (e.g. https://kinesis.us-east-1.amazonaws.com)
+          |    <records-per-sec> is the rate of records per second to put onto the stream
+          |    <words-per-record> is the rate of records per second to put onto the stream
+          |
+        """.stripMargin)
+
       System.exit(1)
     }
 
+    // Set default log4j logging level to WARN to hide Spark logs
     StreamingExamples.setStreamingLogLevels()
 
-    /* Populate the appropriate variables from the given args */
+    // Populate the appropriate variables from the given args
     val Array(stream, endpoint, recordsPerSecond, wordsPerRecord) = args
 
-    /* Generate the records and return the totals */
-    val totals = generate(stream, endpoint, recordsPerSecond.toInt, wordsPerRecord.toInt)
+    // Generate the records and return the totals
+    val totals = generate(stream, endpoint, recordsPerSecond.toInt,
+        wordsPerRecord.toInt)
 
-    /* Print the array of (index, total) tuples */
-    println("Totals")
-    totals.foreach(total => println(total.toString()))
+    // Print the array of (word, total) tuples
+    println("Totals for the words sent")
+    totals.foreach(println(_))
   }
 
   def generate(stream: String,
       endpoint: String,
       recordsPerSecond: Int,
-      wordsPerRecord: Int): Seq[(Int, Int)] = {
-
-    val MaxRandomInts = 10
+      wordsPerRecord: Int): Seq[(String, Int)] = {
 
-    /* Create the Kinesis client */
+    val randomWords = List("spark","you","are","my","father")
+    val totals = scala.collection.mutable.Map[String, Int]()
+  
+    // Create the low-level Kinesis Client from the AWS Java SDK.
     val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain())
     kinesisClient.setEndpoint(endpoint)
 
     println(s"Putting records onto stream $stream and endpoint $endpoint at a rate of" +
-      s" $recordsPerSecond records per second and $wordsPerRecord words per record");
-
-    val totals = new Array[Int](MaxRandomInts)
-    /* Put String records onto the stream per the given recordPerSec and wordsPerRecord */
-    for (i <- 1 to 5) {
-
-      /* Generate recordsPerSec records to put onto the stream */
-      val records = (1 to recordsPerSecond.toInt).map { recordNum =>
-        /* 
-         *  Randomly generate each wordsPerRec words between 0 (inclusive)
-         *  and MAX_RANDOM_INTS (exclusive) 
-         */
+        s" $recordsPerSecond records per second and $wordsPerRecord words per record")
+  
+    // Iterate and put records onto the stream per the given recordPerSec and wordsPerRecord
+    for (i <- 1 to 10) {
+      // Generate recordsPerSec records to put onto the stream
+      val records = (1 to recordsPerSecond.toInt).foreach { recordNum =>
+        // Randomly generate wordsPerRecord number of words
         val data = (1 to wordsPerRecord.toInt).map(x => {
-          /* Generate the random int */
-          val randomInt = Random.nextInt(MaxRandomInts)
+          // Get a random index to a word
+          val randomWordIdx = Random.nextInt(randomWords.size)
+          val randomWord = randomWords(randomWordIdx)
 
-          /* Keep track of the totals */
-          totals(randomInt) += 1
+          // Increment total count to compare to server counts later
+          totals(randomWord) = totals.getOrElse(randomWord, 0) + 1
 
-          randomInt.toString()
+          randomWord
         }).mkString(" ")
 
-        /* Create a partitionKey based on recordNum */
+        // Create a partitionKey based on recordNum
         val partitionKey = s"partitionKey-$recordNum"
 
-        /* Create a PutRecordRequest with an Array[Byte] version of the data */
+        // Create a PutRecordRequest with an Array[Byte] version of the data
         val putRecordRequest = new PutRecordRequest().withStreamName(stream)
             .withPartitionKey(partitionKey)
-            .withData(ByteBuffer.wrap(data.getBytes()));
+            .withData(ByteBuffer.wrap(data.getBytes()))
 
-        /* Put the record onto the stream and capture the PutRecordResult */
-        val putRecordResult = kinesisClient.putRecord(putRecordRequest);
+        // Put the record onto the stream and capture the PutRecordResult
+        val putRecordResult = kinesisClient.putRecord(putRecordRequest)
       }
 
-      /* Sleep for a second */
+      // Sleep for a second
       Thread.sleep(1000)
       println("Sent " + recordsPerSecond + " records")
     }
-
-    /* Convert the totals to (index, total) tuple */
-    (0 to (MaxRandomInts - 1)).zip(totals)
+     // Convert the totals to (index, total) tuple
+    totals.toSeq.sortBy(_._1)
   }
 }
 
@@ -233,8 +258,7 @@ private object KinesisWordCountProducerASL {
  *  This has been lifted from the examples/ project to remove the circular dependency.
  */
 private[streaming] object StreamingExamples extends Logging {
-
-  /** Set reasonable logging levels for streaming if the user has not configured log4j. */
+  // Set reasonable logging levels for streaming if the user has not configured log4j.
   def setStreamingLogLevels() {
     val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
     if (!log4jInitialized) {

From d03638cc2d414cee9ac7481084672e454495dfc1 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Mon, 18 May 2015 21:32:36 -0700
Subject: [PATCH 062/525] [SPARK-7681] [MLLIB] Add SparseVector support for
 gemv

JIRA: https://issues.apache.org/jira/browse/SPARK-7681

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6209 from viirya/sparsevector_gemv and squashes the following commits:

ce0bb8b [Liang-Chi Hsieh] Still need to scal y when beta is 0.0 because it clears out y.
b890e63 [Liang-Chi Hsieh] Do not delete multiply for DenseVector.
57a8c1e [Liang-Chi Hsieh] Add MimaExcludes for v1.4.
458d1ae [Liang-Chi Hsieh] List DenseMatrix.multiply and SparseMatrix.multiply to MimaExcludes too.
054f05d [Liang-Chi Hsieh] Fix scala style.
410381a [Liang-Chi Hsieh] Address comments. Make Matrix.multiply more generalized.
4616696 [Liang-Chi Hsieh] Add support for SparseVector with SparseMatrix.
5d6d07a [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into sparsevector_gemv
c069507 [Liang-Chi Hsieh] Add SparseVector support for gemv with DenseMatrix.
---
 .../org/apache/spark/mllib/linalg/BLAS.scala  | 152 ++++++++++++++++--
 .../apache/spark/mllib/linalg/Matrices.scala  |   7 +-
 .../apache/spark/mllib/linalg/BLASSuite.scala |  96 +++++++++--
 project/MimaExcludes.scala                    |  18 ++-
 4 files changed, 240 insertions(+), 33 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index 87052e1ba8539..ec38529cf8fae 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -463,7 +463,7 @@ private[spark] object BLAS extends Serializable with Logging {
   def gemv(
       alpha: Double,
       A: Matrix,
-      x: DenseVector,
+      x: Vector,
       beta: Double,
       y: DenseVector): Unit = {
     require(A.numCols == x.size,
@@ -473,44 +473,169 @@ private[spark] object BLAS extends Serializable with Logging {
     if (alpha == 0.0) {
       logDebug("gemv: alpha is equal to 0. Returning y.")
     } else {
-      A match {
-        case sparse: SparseMatrix =>
-          gemv(alpha, sparse, x, beta, y)
-        case dense: DenseMatrix =>
-          gemv(alpha, dense, x, beta, y)
+      (A, x) match {
+        case (smA: SparseMatrix, dvx: DenseVector) =>
+          gemv(alpha, smA, dvx, beta, y)
+        case (smA: SparseMatrix, svx: SparseVector) =>
+          gemv(alpha, smA, svx, beta, y)
+        case (dmA: DenseMatrix, dvx: DenseVector) =>
+          gemv(alpha, dmA, dvx, beta, y)
+        case (dmA: DenseMatrix, svx: SparseVector) =>
+          gemv(alpha, dmA, svx, beta, y)
         case _ =>
-          throw new IllegalArgumentException(s"gemv doesn't support matrix type ${A.getClass}.")
+          throw new IllegalArgumentException(s"gemv doesn't support running on matrix type " +
+            s"${A.getClass} and vector type ${x.getClass}.")
       }
     }
   }
 
   /**
    * y := alpha * A * x + beta * y
-   * For `DenseMatrix` A.
+   * For `DenseMatrix` A and `DenseVector` x.
    */
   private def gemv(
       alpha: Double,
       A: DenseMatrix,
       x: DenseVector,
       beta: Double,
-      y: DenseVector): Unit =  {
+      y: DenseVector): Unit = {
     val tStrA = if (A.isTransposed) "T" else "N"
     val mA = if (!A.isTransposed) A.numRows else A.numCols
     val nA = if (!A.isTransposed) A.numCols else A.numRows
     nativeBLAS.dgemv(tStrA, mA, nA, alpha, A.values, mA, x.values, 1, beta,
       y.values, 1)
   }
+ 
+  /**
+   * y := alpha * A * x + beta * y
+   * For `DenseMatrix` A and `SparseVector` x.
+   */
+  private def gemv(
+      alpha: Double,
+      A: DenseMatrix,
+      x: SparseVector,
+      beta: Double,
+      y: DenseVector): Unit = {
+    val mA: Int = A.numRows
+    val nA: Int = A.numCols
+
+    val Avals = A.values
+
+    val xIndices = x.indices
+    val xNnz = xIndices.length
+    val xValues = x.values
+    val yValues = y.values
 
+    if (alpha == 0.0) {
+      scal(beta, y)
+      return
+    }
+
+    if (A.isTransposed) {
+      var rowCounterForA = 0
+      while (rowCounterForA < mA) {
+        var sum = 0.0
+        var k = 0
+        while (k < xNnz) {
+          sum += xValues(k) * Avals(xIndices(k) + rowCounterForA * nA)
+          k += 1
+        }
+        yValues(rowCounterForA) = sum * alpha + beta * yValues(rowCounterForA)
+        rowCounterForA += 1
+      }
+    } else {
+      var rowCounterForA = 0
+      while (rowCounterForA < mA) {
+        var sum = 0.0
+        var k = 0
+        while (k < xNnz) {
+          sum += xValues(k) * Avals(xIndices(k) * mA + rowCounterForA)
+          k += 1
+        }
+        yValues(rowCounterForA) = sum * alpha + beta * yValues(rowCounterForA)
+        rowCounterForA += 1
+      }
+    }
+  }
+ 
   /**
    * y := alpha * A * x + beta * y
-   * For `SparseMatrix` A.
+   * For `SparseMatrix` A and `SparseVector` x.
+   */
+  private def gemv(
+      alpha: Double,
+      A: SparseMatrix,
+      x: SparseVector,
+      beta: Double,
+      y: DenseVector): Unit = {
+    val xValues = x.values
+    val xIndices = x.indices
+    val xNnz = xIndices.length
+
+    val yValues = y.values
+
+    val mA: Int = A.numRows
+    val nA: Int = A.numCols
+
+    val Avals = A.values
+    val Arows = if (!A.isTransposed) A.rowIndices else A.colPtrs
+    val Acols = if (!A.isTransposed) A.colPtrs else A.rowIndices
+
+    if (alpha == 0.0) {
+      scal(beta, y)
+      return
+    }
+
+    if (A.isTransposed) {
+      var rowCounter = 0
+      while (rowCounter < mA) {
+        var i = Arows(rowCounter)
+        val indEnd = Arows(rowCounter + 1)
+        var sum = 0.0
+        var k = 0
+        while (k < xNnz && i < indEnd) {
+          if (xIndices(k) == Acols(i)) {
+            sum += Avals(i) * xValues(k)
+            i += 1
+          }
+          k += 1
+        }
+        yValues(rowCounter) = sum * alpha + beta * yValues(rowCounter)
+        rowCounter += 1
+      }
+    } else {
+      scal(beta, y)
+
+      var colCounterForA = 0
+      var k = 0
+      while (colCounterForA < nA && k < xNnz) {
+        if (xIndices(k) == colCounterForA) {
+          var i = Acols(colCounterForA)
+          val indEnd = Acols(colCounterForA + 1)
+
+          val xTemp = xValues(k) * alpha
+          while (i < indEnd) {
+            val rowIndex = Arows(i)
+            yValues(Arows(i)) += Avals(i) * xTemp
+            i += 1
+          }
+          k += 1
+        }
+        colCounterForA += 1
+      }
+    }
+  }
+
+  /**
+   * y := alpha * A * x + beta * y
+   * For `SparseMatrix` A and `DenseVector` x.
    */
   private def gemv(
       alpha: Double,
       A: SparseMatrix,
       x: DenseVector,
       beta: Double,
-      y: DenseVector): Unit =  {
+      y: DenseVector): Unit = {
     val xValues = x.values
     val yValues = y.values
     val mA: Int = A.numRows
@@ -534,10 +659,7 @@ private[spark] object BLAS extends Serializable with Logging {
         rowCounter += 1
       }
     } else {
-      // Scale vector first if `beta` is not equal to 0.0
-      if (beta != 0.0) {
-        scal(beta, y)
-      }
+      scal(beta, y)
       // Perform matrix-vector multiplication and add to y
       var colCounterForA = 0
       while (colCounterForA < nA) {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index a609674df6b8b..9584da8e3a0f9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -77,8 +77,13 @@ sealed trait Matrix extends Serializable {
     C
   }
 
-  /** Convenience method for `Matrix`-`DenseVector` multiplication. */
+  /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary compatibility. */
   def multiply(y: DenseVector): DenseVector = {
+    multiply(y.asInstanceOf[Vector])
+  }
+
+  /** Convenience method for `Matrix`-`Vector` multiplication. */
+  def multiply(y: Vector): DenseVector = {
     val output = new DenseVector(new Array[Double](numRows))
     BLAS.gemv(1.0, this, y, 0.0, output)
     output
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
index 002cb253862b5..64ecd12ea7ded 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
@@ -256,42 +256,108 @@ class BLASSuite extends FunSuite {
     val dA =
       new DenseMatrix(4, 3, Array(0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0))
     val sA = new SparseMatrix(4, 3, Array(0, 1, 3, 4), Array(1, 0, 2, 3), Array(1.0, 2.0, 1.0, 3.0))
-
-    val x = new DenseVector(Array(1.0, 2.0, 3.0))
+ 
+    val dA2 =
+      new DenseMatrix(4, 3, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0), true)
+    val sA2 =
+      new SparseMatrix(4, 3, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0),
+        true)
+ 
+    val dx = new DenseVector(Array(1.0, 2.0, 3.0))
+    val sx = dx.toSparse
     val expected = new DenseVector(Array(4.0, 1.0, 2.0, 9.0))
 
-    assert(dA.multiply(x) ~== expected absTol 1e-15)
-    assert(sA.multiply(x) ~== expected absTol 1e-15)
-
+    assert(dA.multiply(dx) ~== expected absTol 1e-15)
+    assert(sA.multiply(dx) ~== expected absTol 1e-15)
+    assert(dA.multiply(sx) ~== expected absTol 1e-15)
+    assert(sA.multiply(sx) ~== expected absTol 1e-15)
+ 
     val y1 = new DenseVector(Array(1.0, 3.0, 1.0, 0.0))
     val y2 = y1.copy
     val y3 = y1.copy
     val y4 = y1.copy
+    val y5 = y1.copy
+    val y6 = y1.copy
+    val y7 = y1.copy
+    val y8 = y1.copy
+    val y9 = y1.copy
+    val y10 = y1.copy
+    val y11 = y1.copy
+    val y12 = y1.copy
+    val y13 = y1.copy
+    val y14 = y1.copy
+    val y15 = y1.copy
+    val y16 = y1.copy
+ 
     val expected2 = new DenseVector(Array(6.0, 7.0, 4.0, 9.0))
     val expected3 = new DenseVector(Array(10.0, 8.0, 6.0, 18.0))
 
-    gemv(1.0, dA, x, 2.0, y1)
-    gemv(1.0, sA, x, 2.0, y2)
-    gemv(2.0, dA, x, 2.0, y3)
-    gemv(2.0, sA, x, 2.0, y4)
+    gemv(1.0, dA, dx, 2.0, y1)
+    gemv(1.0, sA, dx, 2.0, y2)
+    gemv(1.0, dA, sx, 2.0, y3)
+    gemv(1.0, sA, sx, 2.0, y4)
+ 
+    gemv(1.0, dA2, dx, 2.0, y5)
+    gemv(1.0, sA2, dx, 2.0, y6)
+    gemv(1.0, dA2, sx, 2.0, y7)
+    gemv(1.0, sA2, sx, 2.0, y8)
+ 
+    gemv(2.0, dA, dx, 2.0, y9)
+    gemv(2.0, sA, dx, 2.0, y10)
+    gemv(2.0, dA, sx, 2.0, y11)
+    gemv(2.0, sA, sx, 2.0, y12)
+ 
+    gemv(2.0, dA2, dx, 2.0, y13)
+    gemv(2.0, sA2, dx, 2.0, y14)
+    gemv(2.0, dA2, sx, 2.0, y15)
+    gemv(2.0, sA2, sx, 2.0, y16)
+ 
     assert(y1 ~== expected2 absTol 1e-15)
     assert(y2 ~== expected2 absTol 1e-15)
-    assert(y3 ~== expected3 absTol 1e-15)
-    assert(y4 ~== expected3 absTol 1e-15)
+    assert(y3 ~== expected2 absTol 1e-15)
+    assert(y4 ~== expected2 absTol 1e-15)
+ 
+    assert(y5 ~== expected2 absTol 1e-15)
+    assert(y6 ~== expected2 absTol 1e-15)
+    assert(y7 ~== expected2 absTol 1e-15)
+    assert(y8 ~== expected2 absTol 1e-15)
+ 
+    assert(y9 ~== expected3 absTol 1e-15)
+    assert(y10 ~== expected3 absTol 1e-15)
+    assert(y11 ~== expected3 absTol 1e-15)
+    assert(y12 ~== expected3 absTol 1e-15)
+ 
+    assert(y13 ~== expected3 absTol 1e-15)
+    assert(y14 ~== expected3 absTol 1e-15)
+    assert(y15 ~== expected3 absTol 1e-15)
+    assert(y16 ~== expected3 absTol 1e-15)
+ 
     withClue("columns of A don't match the rows of B") {
       intercept[Exception] {
-        gemv(1.0, dA.transpose, x, 2.0, y1)
+        gemv(1.0, dA.transpose, dx, 2.0, y1)
+      }
+      intercept[Exception] {
+        gemv(1.0, sA.transpose, dx, 2.0, y1)
+      }
+      intercept[Exception] {
+        gemv(1.0, dA.transpose, sx, 2.0, y1)
+      }
+      intercept[Exception] {
+        gemv(1.0, sA.transpose, sx, 2.0, y1)
       }
     }
+ 
     val dAT =
       new DenseMatrix(3, 4, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0))
     val sAT =
       new SparseMatrix(3, 4, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0))
-
+ 
     val dATT = dAT.transpose
     val sATT = sAT.transpose
 
-    assert(dATT.multiply(x) ~== expected absTol 1e-15)
-    assert(sATT.multiply(x) ~== expected absTol 1e-15)
+    assert(dATT.multiply(dx) ~== expected absTol 1e-15)
+    assert(sATT.multiply(dx) ~== expected absTol 1e-15)
+    assert(dATT.multiply(sx) ~== expected absTol 1e-15)
+    assert(sATT.multiply(sx) ~== expected absTol 1e-15)
   }
 }
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 513bbaf98d804..f8d0160f6445e 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -87,7 +87,14 @@ object MimaExcludes {
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.mllib.linalg.Vector.toSparse"),
             ProblemFilters.exclude[MissingMethodProblem](
-              "org.apache.spark.mllib.linalg.Vector.numActives")
+              "org.apache.spark.mllib.linalg.Vector.numActives"),
+            // SPARK-7681 add SparseVector support for gemv
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Matrix.multiply"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.DenseMatrix.multiply"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.SparseMatrix.multiply")
           ) ++ Seq(
             // Execution should never be included as its always internal.
             MimaBuild.excludeSparkPackage("sql.execution"),
@@ -180,7 +187,14 @@ object MimaExcludes {
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.mllib.linalg.Matrix.isTransposed"),
             ProblemFilters.exclude[MissingMethodProblem](
-              "org.apache.spark.mllib.linalg.Matrix.foreachActive")
+              "org.apache.spark.mllib.linalg.Matrix.foreachActive"),
+            // SPARK-7681 add SparseVector support for gemv
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Matrix.multiply"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.DenseMatrix.multiply"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.SparseMatrix.multiply")
           ) ++ Seq(
             // SPARK-5540
             ProblemFilters.exclude[MissingMethodProblem](

From c2437de1899e09894df4ec27adfaa7fac158fd3a Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Mon, 18 May 2015 21:43:12 -0700
Subject: [PATCH 063/525] [SPARK-7150] SparkContext.range() and
 SQLContext.range()

This PR is based on #6081, thanks adrian-wang.

Closes #6081

Author: Daoyuan Wang <daoyuan.wang@intel.com>
Author: Davies Liu <davies@databricks.com>

Closes #6230 from davies/range and squashes the following commits:

d3ce5fe [Davies Liu] add tests
789eda5 [Davies Liu] add range() in Python
4590208 [Davies Liu] Merge commit 'refs/pull/6081/head' of github.com:apache/spark into range
cbf5200 [Daoyuan Wang] let's add python support in a separate PR
f45e3b2 [Daoyuan Wang] remove redundant toLong
617da76 [Daoyuan Wang] fix safe marge for corner cases
867c417 [Daoyuan Wang] fix
13dbe84 [Daoyuan Wang] update
bd998ba [Daoyuan Wang] update comments
d3a0c1b [Daoyuan Wang] add range api()
---
 .../scala/org/apache/spark/SparkContext.scala | 72 +++++++++++++++++++
 python/pyspark/context.py                     | 16 +++++
 python/pyspark/sql/context.py                 | 20 ++++++
 python/pyspark/sql/tests.py                   |  5 ++
 python/pyspark/tests.py                       |  5 ++
 .../org/apache/spark/sql/SQLContext.scala     | 31 ++++++++
 .../org/apache/spark/sql/DataFrameSuite.scala | 40 +++++++++++
 7 files changed, 189 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index f78fbaf33f656..3fe3dc5e300e8 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -697,6 +697,78 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
   }
 
+  /**
+   * Creates a new RDD[Long] containing elements from `start` to `end`(exclusive), increased by
+   * `step` every element.
+   *
+   * @note if we need to cache this RDD, we should make sure each partition does not exceed limit.
+   *
+   * @param start the start value.
+   * @param end the end value.
+   * @param step the incremental step
+   * @param numSlices the partition number of the new RDD.
+   * @return
+   */
+  def range(
+      start: Long,
+      end: Long,
+      step: Long = 1,
+      numSlices: Int = defaultParallelism): RDD[Long] = withScope {
+    assertNotStopped()
+    // when step is 0, range will run infinitely
+    require(step != 0, "step cannot be 0")
+    val numElements: BigInt = {
+      val safeStart = BigInt(start)
+      val safeEnd = BigInt(end)
+      if ((safeEnd - safeStart) % step == 0 || safeEnd > safeStart ^ step > 0) {
+        (safeEnd - safeStart) / step
+      } else {
+        // the remainder has the same sign with range, could add 1 more
+        (safeEnd - safeStart) / step + 1
+      }
+    }
+    parallelize(0 until numSlices, numSlices).mapPartitionsWithIndex((i, _) => {
+      val partitionStart = (i * numElements) / numSlices * step + start
+      val partitionEnd = (((i + 1) * numElements) / numSlices) * step + start
+      def getSafeMargin(bi: BigInt): Long =
+        if (bi.isValidLong) {
+          bi.toLong
+        } else if (bi > 0) {
+          Long.MaxValue
+        } else {
+          Long.MinValue
+        }
+      val safePartitionStart = getSafeMargin(partitionStart)
+      val safePartitionEnd = getSafeMargin(partitionEnd)
+
+      new Iterator[Long] {
+        private[this] var number: Long = safePartitionStart
+        private[this] var overflow: Boolean = false
+
+        override def hasNext =
+          if (!overflow) {
+            if (step > 0) {
+              number < safePartitionEnd
+            } else {
+              number > safePartitionEnd
+            }
+          } else false
+
+        override def next() = {
+          val ret = number
+          number += step
+          if (number < ret ^ step < 0) {
+            // we have Long.MaxValue + Long.MaxValue < Long.MaxValue
+            // and Long.MinValue + Long.MinValue > Long.MinValue, so iff the step causes a step
+            // back, we are pretty sure that we have an overflow.
+            overflow = true
+          }
+          ret
+        }
+      }
+    })
+  }
+
   /** Distribute a local Scala collection to form an RDD.
    *
    * This method is identical to `parallelize`.
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index d25ee855235be..1f2b40b29fafa 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -319,6 +319,22 @@ def stop(self):
         with SparkContext._lock:
             SparkContext._active_spark_context = None
 
+    def range(self, start, end, step=1, numSlices=None):
+        """
+        Create a new RDD of int containing elements from `start` to `end`
+        (exclusive), increased by `step` every element.
+
+        :param start: the start value
+        :param end: the end value (exclusive)
+        :param step: the incremental step (default: 1)
+        :param numSlices: the number of partitions of the new RDD
+        :return: An RDD of int
+
+        >>> sc.range(1, 7, 2).collect()
+        [1, 3, 5]
+        """
+        return self.parallelize(xrange(start, end, step), numSlices)
+
     def parallelize(self, c, numSlices=None):
         """
         Distribute a local Python collection to form an RDD. Using xrange
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 0bde7191242ab..9f26d13235d5f 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -122,6 +122,26 @@ def udf(self):
         """Returns a :class:`UDFRegistration` for UDF registration."""
         return UDFRegistration(self)
 
+    def range(self, start, end, step=1, numPartitions=None):
+        """
+        Create a :class:`DataFrame` with single LongType column named `id`,
+        containing elements in a range from `start` to `end` (exclusive) with
+        step value `step`.
+
+        :param start: the start value
+        :param end: the end value (exclusive)
+        :param step: the incremental step (default: 1)
+        :param numPartitions: the number of partitions of the DataFrame
+        :return: A new DataFrame
+
+        >>> sqlContext.range(1, 7, 2).collect()
+        [Row(id=1), Row(id=3), Row(id=5)]
+        """
+        if numPartitions is None:
+            numPartitions = self._sc.defaultParallelism
+        jdf = self._ssql_ctx.range(int(start), int(end), int(step), int(numPartitions))
+        return DataFrame(jdf, self)
+
     @ignore_unicode_prefix
     def registerFunction(self, name, f, returnType=StringType()):
         """Registers a lambda function as a UDF so it can be used in SQL statements.
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index d37c5dbed7f6b..84ae36f2fd026 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -117,6 +117,11 @@ def tearDownClass(cls):
         ReusedPySparkTestCase.tearDownClass()
         shutil.rmtree(cls.tempdir.name, ignore_errors=True)
 
+    def test_range(self):
+        self.assertEqual(self.sqlCtx.range(1, 1).count(), 0)
+        self.assertEqual(self.sqlCtx.range(1, 0, -1).count(), 1)
+        self.assertEqual(self.sqlCtx.range(0, 1 << 40, 1 << 39).count(), 2)
+
     def test_explode(self):
         from pyspark.sql.functions import explode
         d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 5e023f6c53517..d8e319994cc96 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -444,6 +444,11 @@ def func(x):
 
 class RDDTests(ReusedPySparkTestCase):
 
+    def test_range(self):
+        self.assertEqual(self.sc.range(1, 1).count(), 0)
+        self.assertEqual(self.sc.range(1, 0, -1).count(), 1)
+        self.assertEqual(self.sc.range(0, 1 << 40, 1 << 39).count(), 2)
+
     def test_id(self):
         rdd = self.sc.parallelize(range(10))
         id = rdd.id()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index ac1a800219423..316ef7d58809d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -684,6 +684,37 @@ class SQLContext(@transient val sparkContext: SparkContext)
     catalog.unregisterTable(Seq(tableName))
   }
 
+  /**
+   * :: Experimental ::
+   * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements
+   * in an range from `start` to `end`(exclusive) with step value 1.
+   *
+   * @since 1.4.0
+   * @group dataframe
+   */
+  @Experimental
+  def range(start: Long, end: Long): DataFrame = {
+    createDataFrame(
+      sparkContext.range(start, end).map(Row(_)),
+      StructType(StructField("id", LongType, nullable = false) :: Nil))
+  }
+
+  /**
+   * :: Experimental ::
+   * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements
+   * in an range from `start` to `end`(exclusive) with an step value, with partition number
+   * specified.
+   *
+   * @since 1.4.0
+   * @group dataframe
+   */
+  @Experimental
+  def range(start: Long, end: Long, step: Long, numPartitions: Int): DataFrame = {
+    createDataFrame(
+      sparkContext.range(start, end, step, numPartitions).map(Row(_)),
+      StructType(StructField("id", LongType, nullable = false) :: Nil))
+  }
+
   /**
    * Executes a SQL query using Spark, returning the result as a [[DataFrame]]. The dialect that is
    * used for SQL parsing can be configured with 'spark.sql.dialect'.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 054b23dba84c5..f05d059d443c4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -532,4 +532,44 @@ class DataFrameSuite extends QueryTest {
     val p = df.logicalPlan.asInstanceOf[Project].child.asInstanceOf[Project]
     assert(!p.child.isInstanceOf[Project])
   }
+
+  test("SPARK-7150 range api") {
+    // numSlice is greater than length
+    val res1 = TestSQLContext.range(0, 10, 1, 15).select("id")
+    assert(res1.count == 10)
+    assert(res1.agg(sum("id")).as("sumid").collect() === Seq(Row(45)))
+
+    val res2 = TestSQLContext.range(3, 15, 3, 2).select("id")
+    assert(res2.count == 4)
+    assert(res2.agg(sum("id")).as("sumid").collect() === Seq(Row(30)))
+
+    val res3 = TestSQLContext.range(1, -2).select("id")
+    assert(res3.count == 0)
+
+    // start is positive, end is negative, step is negative
+    val res4 = TestSQLContext.range(1, -2, -2, 6).select("id")
+    assert(res4.count == 2)
+    assert(res4.agg(sum("id")).as("sumid").collect() === Seq(Row(0)))
+
+    // start, end, step are negative
+    val res5 = TestSQLContext.range(-3, -8, -2, 1).select("id")
+    assert(res5.count == 3)
+    assert(res5.agg(sum("id")).as("sumid").collect() === Seq(Row(-15)))
+
+    // start, end are negative, step is positive
+    val res6 = TestSQLContext.range(-8, -4, 2, 1).select("id")
+    assert(res6.count == 2)
+    assert(res6.agg(sum("id")).as("sumid").collect() === Seq(Row(-14)))
+
+    val res7 = TestSQLContext.range(-10, -9, -20, 1).select("id")
+    assert(res7.count == 0)
+
+    val res8 = TestSQLContext.range(Long.MinValue, Long.MaxValue, Long.MaxValue, 100).select("id")
+    assert(res8.count == 3)
+    assert(res8.agg(sum("id")).as("sumid").collect() === Seq(Row(-3)))
+
+    val res9 = TestSQLContext.range(Long.MaxValue, Long.MinValue, Long.MinValue, 100).select("id")
+    assert(res9.count == 2)
+    assert(res9.agg(sum("id")).as("sumid").collect() === Seq(Row(Long.MaxValue - 1)))
+  }
 }

From c9fa870a6de3f7d0903fa7a75ea5ffb6a2fcd174 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Mon, 18 May 2015 21:53:44 -0700
Subject: [PATCH 064/525] [SPARK-7687] [SQL] DataFrame.describe() should cast
 all aggregates to String

In `DataFrame.describe()`, the `count` aggregate produces an integer, the `avg` and `stdev` aggregates produce doubles, and `min` and `max` aggregates can produce varying types depending on what type of column they're applied to.  As a result, we should cast all aggregate results to String so that `describe()`'s output types match its declared output schema.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6218 from JoshRosen/SPARK-7687 and squashes the following commits:

146b615 [Josh Rosen] Fix R test.
2974bd5 [Josh Rosen] Cast to string type instead
f206580 [Josh Rosen] Cast to double to fix SPARK-7687
307ecbf [Josh Rosen] Add failing regression test for SPARK-7687
---
 R/pkg/inst/tests/test_sparkSQL.R                | 10 +++++-----
 .../scala/org/apache/spark/sql/DataFrame.scala  |  6 +++---
 .../org/apache/spark/sql/DataFrameSuite.scala   | 17 +++++++++++------
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 3e5658eb5b24b..1768c57fd02e4 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -757,12 +757,12 @@ test_that("parquetFile works with multiple input paths", {
 test_that("describe() on a DataFrame", {
   df <- jsonFile(sqlCtx, jsonPath)
   stats <- describe(df, "age")
-  expect_true(collect(stats)[1, "summary"] == "count")
-  expect_true(collect(stats)[2, "age"] == 24.5)
-  expect_true(collect(stats)[3, "age"] == 5.5)
+  expect_equal(collect(stats)[1, "summary"], "count")
+  expect_equal(collect(stats)[2, "age"], "24.5")
+  expect_equal(collect(stats)[3, "age"], "5.5")
   stats <- describe(df)
-  expect_true(collect(stats)[4, "name"] == "Andy")
-  expect_true(collect(stats)[5, "age"] == 30.0)
+  expect_equal(collect(stats)[4, "name"], "Andy")
+  expect_equal(collect(stats)[5, "age"], "30")
 })
 
 unlink(parquetPath)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 27e9af49f0664..adad85806d1ea 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1063,7 +1063,7 @@ class DataFrame private[sql](
 
     val ret: Seq[Row] = if (outputCols.nonEmpty) {
       val aggExprs = statistics.flatMap { case (_, colToAgg) =>
-        outputCols.map(c => Column(colToAgg(Column(c).expr)).as(c))
+        outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c))
       }
 
       val row = agg(aggExprs.head, aggExprs.tail: _*).head().toSeq
@@ -1077,9 +1077,9 @@ class DataFrame private[sql](
       statistics.map { case (name, _) => Row(name) }
     }
 
-    // The first column is string type, and the rest are double type.
+    // All columns are string type
     val schema = StructType(
-      StructField("summary", StringType) :: outputCols.map(StructField(_, DoubleType))).toAttributes
+      StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes
     LocalRelation(schema, ret)
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index f05d059d443c4..0dcba80ef2a20 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -370,14 +370,14 @@ class DataFrameSuite extends QueryTest {
       ("Amy",   24, 180)).toDF("name", "age", "height")
 
     val describeResult = Seq(
-      Row("count",   4,               4),
-      Row("mean",    33.0,            178.0),
-      Row("stddev",  16.583123951777, 10.0),
-      Row("min",     16,              164),
-      Row("max",     60,              192))
+      Row("count",   "4",               "4"),
+      Row("mean",    "33.0",            "178.0"),
+      Row("stddev",  "16.583123951777", "10.0"),
+      Row("min",     "16",              "164"),
+      Row("max",     "60",              "192"))
 
     val emptyDescribeResult = Seq(
-      Row("count",   0,    0),
+      Row("count",   "0",  "0"),
       Row("mean",    null, null),
       Row("stddev",  null, null),
       Row("min",     null, null),
@@ -388,6 +388,11 @@ class DataFrameSuite extends QueryTest {
     val describeTwoCols = describeTestData.describe("age", "height")
     assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "age", "height"))
     checkAnswer(describeTwoCols, describeResult)
+    // All aggregate value should have been cast to string
+    describeTwoCols.collect().foreach { row =>
+      assert(row.get(1).isInstanceOf[String], "expected string but found " + row.get(1).getClass)
+      assert(row.get(2).isInstanceOf[String], "expected string but found " + row.get(2).getClass)
+    }
 
     val describeAllCols = describeTestData.describe()
     assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "age", "height"))

From 9ebb44f8abb1a13f045eed60190954db904ffef7 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Tue, 19 May 2015 06:00:13 +0000
Subject: [PATCH 065/525] [HOTFIX]: Java 6 Build Breaks

These were blocking RC1 so I fixed them manually.
---
 .../shuffle/unsafe/UnsafeShuffleWriterSuite.java   | 14 --------------
 .../apache/spark/sql/hive/orc/OrcRelation.scala    |  3 ++-
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 730d265c87f88..03116d8fc2b21 100644
--- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -252,20 +252,6 @@ public void doNotNeedToCallWriteBeforeUnsuccessfulStop() throws IOException {
     createWriter(false).stop(false);
   }
 
-  @Test
-  public void writeEmptyIterator() throws Exception {
-    final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
-    writer.write(Collections.<Product2<Object, Object>>emptyIterator());
-    final Option<MapStatus> mapStatus = writer.stop(true);
-    assertTrue(mapStatus.isDefined());
-    assertTrue(mergedOutputFile.exists());
-    assertArrayEquals(new long[NUM_PARTITITONS], partitionSizesInMergedFile);
-    assertEquals(0, taskMetrics.shuffleWriteMetrics().get().shuffleRecordsWritten());
-    assertEquals(0, taskMetrics.shuffleWriteMetrics().get().shuffleBytesWritten());
-    assertEquals(0, taskMetrics.diskBytesSpilled());
-    assertEquals(0, taskMetrics.memoryBytesSpilled());
-  }
-
   @Test
   public void writeWithoutSpilling() throws Exception {
     // In this example, each partition should have exactly one record:
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 58b97adb46165..b69e14a179d0a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -17,8 +17,9 @@
 
 package org.apache.spark.sql.hive.orc
 
-import java.util.{Objects, Properties}
+import java.util.Properties
 
+import com.google.common.base.Objects
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars

From 23cf897112624ece19a3b5e5394cdf71b9c3c8b3 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Tue, 19 May 2015 00:02:06 -0700
Subject: [PATCH 066/525] [HOTFIX] Fixing style failures in Kinesis source

---
 .../spark/examples/streaming/KinesisWordCountASL.scala      | 6 ++++--
 .../apache/spark/streaming/kinesis/KinesisReceiver.scala    | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
index 640ca049e2ec4..df77f4be9db1d 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
@@ -119,7 +119,8 @@ object KinesisWordCountASL extends Logging {
     val batchInterval = Milliseconds(2000)
 
     // Kinesis checkpoint interval is the interval at which the DynamoDB is updated with information
-    //on sequence number of records that have been received. Same as batchInterval for this example.
+    // on sequence number of records that have been received. Same as batchInterval for this 
+    // example.
     val kinesisCheckpointInterval = batchInterval
 
     // Get the region name from the endpoint URL to save Kinesis Client Library metadata in
@@ -173,7 +174,8 @@ object KinesisWordProducerASL {
     if (args.length != 4) {
       System.err.println(
         """
-          |Usage: KinesisWordProducerASL <stream-name> <endpoint-url> <records-per-sec> <words-per-record>
+          |Usage: KinesisWordProducerASL <stream-name> <endpoint-url> <records-per-sec>
+                                         <words-per-record>
           |
           |    <stream-name> is the name of the Kinesis stream
           |    <endpoint-url> is the endpoint of the Kinesis service
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
index 01608fbd3fd31..90164490efb2e 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
@@ -82,8 +82,8 @@ private[kinesis] class KinesisReceiver(
    */
 
   /**
-   * workerId is used by the KCL should be based on the ip address of the actual Spark Worker where this code runs
-   * (not the driver's IP address.)
+   * workerId is used by the KCL should be based on the ip address of the actual Spark Worker 
+   * where this code runs (not the driver's IP address.)
    */
   private var workerId: String = null
 

From 6008ec14ed6491d0a854bb50548c46f2f9709269 Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Tue, 19 May 2015 00:06:33 -0700
Subject: [PATCH 067/525] [SPARK-7581] [ML] [DOC] User guide for spark.ml
 PolynomialExpansion

JIRA [here](https://issues.apache.org/jira/browse/SPARK-7581).

CC jkbradley

Author: Xusen Yin <yinxusen@gmail.com>

Closes #6113 from yinxusen/SPARK-7581 and squashes the following commits:

1a7d80d [Xusen Yin] merge with master
892a8e9 [Xusen Yin] fix python 3 compatibility
ec935bf [Xusen Yin] small fix
3e9fa1d [Xusen Yin] delete note
69fcf85 [Xusen Yin] simplify and add python example
81d21dc [Xusen Yin] add programming guide for Polynomial Expansion
40babfb [Xusen Yin] add java test suite for PolynomialExpansion
---
 docs/ml-features.md                           | 83 +++++++++++++++++
 .../feature/JavaPolynomialExpansionSuite.java | 91 +++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 5df61dd36a070..e86f9edc4f68b 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -268,5 +268,88 @@ for binarized_feature, in binarizedFeatures.collect():
 </div>
 </div>
 
+## PolynomialExpansion
+
+[Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_expansion) is the process of expanding your features into a polynomial space, which is formulated by an n-degree combination of original dimensions. A [PolynomialExpansion](api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion) class provides this functionality.  The example below shows how to expand your features into a 3-degree polynomial space.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.PolynomialExpansion
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Array(
+  Vectors.dense(-2.0, 2.3),
+  Vectors.dense(0.0, 0.0),
+  Vectors.dense(0.6, -1.1)
+)
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val polynomialExpansion = new PolynomialExpansion()
+  .setInputCol("features")
+  .setOutputCol("polyFeatures")
+  .setDegree(3)
+val polyDF = polynomialExpansion.transform(df)
+polyDF.select("polyFeatures").take(3).foreach(println)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaSparkContext jsc = ...
+SQLContext jsql = ...
+PolynomialExpansion polyExpansion = new PolynomialExpansion()
+  .setInputCol("features")
+  .setOutputCol("polyFeatures")
+  .setDegree(3);
+JavaRDD<Row> data = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create(Vectors.dense(-2.0, 2.3)),
+  RowFactory.create(Vectors.dense(0.0, 0.0)),
+  RowFactory.create(Vectors.dense(0.6, -1.1))
+));
+StructType schema = new StructType(new StructField[] {
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+DataFrame polyDF = polyExpansion.transform(df);
+Row[] row = polyDF.select("polyFeatures").take(3);
+for (Row r : row) {
+  System.out.println(r.get(0));
+}
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.ml.feature import PolynomialExpansion
+from pyspark.mllib.linalg import Vectors
+
+df = sqlContext.createDataFrame(
+  [(Vectors.dense([-2.0, 2.3]), ),
+  (Vectors.dense([0.0, 0.0]), ),
+  (Vectors.dense([0.6, -1.1]), )],
+  ["features"])
+px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
+polyDF = px.transform(df)
+for expanded in polyDF.select("polyFeatures").take(3):
+  print(expanded)
+{% endhighlight %}
+</div>
+</div>
+
 # Feature Selectors
 
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java
new file mode 100644
index 0000000000000..5e8211c2c5118
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+public class JavaPolynomialExpansionSuite {
+  private transient JavaSparkContext jsc;
+  private transient SQLContext jsql;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaPolynomialExpansionSuite");
+    jsql = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    jsc = null;
+  }
+
+  @Test
+  public void polynomialExpansionTest() {
+    PolynomialExpansion polyExpansion = new PolynomialExpansion()
+      .setInputCol("features")
+      .setOutputCol("polyFeatures")
+      .setDegree(3);
+
+    JavaRDD<Row> data = jsc.parallelize(Lists.newArrayList(
+      RowFactory.create(
+        Vectors.dense(-2.0, 2.3),
+        Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)
+      ),
+      RowFactory.create(Vectors.dense(0.0, 0.0), Vectors.dense(new double[9])),
+      RowFactory.create(
+        Vectors.dense(0.6, -1.1),
+        Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331)
+      )
+    ));
+
+    StructType schema = new StructType(new StructField[] {
+      new StructField("features", new VectorUDT(), false, Metadata.empty()),
+      new StructField("expected", new VectorUDT(), false, Metadata.empty())
+    });
+
+    DataFrame dataset = jsql.createDataFrame(data, schema);
+
+    Row[] pairs = polyExpansion.transform(dataset)
+      .select("polyFeatures", "expected")
+      .collect();
+
+    for (Row r : pairs) {
+      double[] polyFeatures = ((Vector)r.get(0)).toArray();
+      double[] expected = ((Vector)r.get(1)).toArray();
+      Assert.assertArrayEquals(polyFeatures, expected, 1e-1);
+    }
+  }
+}

From 61f164d3fdd1c8dcdba8c9d66df05ff4069aa6e6 Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Tue, 19 May 2015 08:59:45 +0100
Subject: [PATCH 068/525] Fixing a few basic typos in the Programming Guide.

Just a few minor fixes in the guide, so a new JIRA issue was not created per the guidelines.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6240 from dusenberrymw/Fix_Programming_Guide_Typos and squashes the following commits:

ffa76eb [Mike Dusenberry] Fixing a few basic typos in the Programming Guide.
---
 docs/programming-guide.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/programming-guide.md b/docs/programming-guide.md
index 27816515c5de2..0c273769bb14b 100644
--- a/docs/programming-guide.md
+++ b/docs/programming-guide.md
@@ -1071,7 +1071,7 @@ for details.
 </tr>
 <tr>
   <td> <b>saveAsSequenceFile</b>(<i>path</i>) <br /> (Java and Scala) </td>
-  <td> Write the elements of the dataset as a Hadoop SequenceFile in a given path in the local filesystem, HDFS or any other Hadoop-supported file system. This is available on RDDs of key-value pairs that either implement Hadoop's Writable interface. In Scala, it is also
+  <td> Write the elements of the dataset as a Hadoop SequenceFile in a given path in the local filesystem, HDFS or any other Hadoop-supported file system. This is available on RDDs of key-value pairs that implement Hadoop's Writable interface. In Scala, it is also
    available on types that are implicitly convertible to Writable (Spark includes conversions for basic types like Int, Double, String, etc). </td>
 </tr>
 <tr>
@@ -1122,7 +1122,7 @@ ordered data following shuffle then it's possible to use:
 * `sortBy` to make a globally ordered RDD
 
 Operations which can cause a shuffle include **repartition** operations like
-[`repartition`](#RepartitionLink), and [`coalesce`](#CoalesceLink), **'ByKey** operations
+[`repartition`](#RepartitionLink) and [`coalesce`](#CoalesceLink), **'ByKey** operations
 (except for counting) like [`groupByKey`](#GroupByLink) and [`reduceByKey`](#ReduceByLink), and
 **join** operations like [`cogroup`](#CogroupLink) and [`join`](#JoinLink).
 
@@ -1138,7 +1138,7 @@ read the relevant sorted blocks.
         
 Certain shuffle operations can consume significant amounts of heap memory since they employ 
 in-memory data structures to organize records before or after transferring them. Specifically, 
-`reduceByKey` and `aggregateByKey` create these structures on the map side and `'ByKey` operations 
+`reduceByKey` and `aggregateByKey` create these structures on the map side, and `'ByKey` operations 
 generate these on the reduce side. When data does not fit in memory Spark will spill these tables 
 to disk, incurring the additional overhead of disk I/O and increased garbage collection.
 

From 27fa88b9ba320cd0d95703aa3437151ba7c86f98 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Tue, 19 May 2015 02:28:41 -0700
Subject: [PATCH 069/525] [HOTFIX] Revert "[SPARK-7092] Update spark scala
 version to 2.11.6"

This reverts commit a11c8683c76c67f45749a1b50a0912a731fd2487.

For more information see:
https://issues.apache.org/jira/browse/SPARK-7726
---
 pom.xml                                                       | 4 ++--
 .../src/main/scala/org/apache/spark/repl/SparkIMain.scala     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pom.xml b/pom.xml
index c72d7cbf843ef..d903f02c1aed0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1799,9 +1799,9 @@
         <property><name>scala-2.11</name></property>
       </activation>
       <properties>
-        <scala.version>2.11.6</scala.version>
+        <scala.version>2.11.2</scala.version>
         <scala.binary.version>2.11</scala.binary.version>
-        <jline.version>2.12.1</jline.version>
+        <jline.version>2.12</jline.version>
         <jline.groupid>jline</jline.groupid>
       </properties>
     </profile>
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
index 1cb910f376060..1bb62c84abddc 100644
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
+++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
@@ -1129,7 +1129,7 @@ class SparkIMain(@BeanProperty val factory: ScriptEngineFactory, initialSettings
 
     def apply(line: String): Result = debugging(s"""parse("$line")""")  {
       var isIncomplete = false
-      currentRun.parsing.withIncompleteHandler((_, _) => isIncomplete = true) {
+      currentRun.reporting.withIncompleteHandler((_, _) => isIncomplete = true) {
         reporter.reset()
         val trees = newUnitParser(line).parseStats()
         if (reporter.hasErrors) Error

From df34793ad4e76214fc4c0a22af1eb89b171a32e4 Mon Sep 17 00:00:00 2001
From: Saleem Ansari <tuxdna@gmail.com>
Date: Tue, 19 May 2015 10:31:11 +0100
Subject: [PATCH 070/525] [SPARK-7723] Fix string interpolation in pipeline
 examples

https://issues.apache.org/jira/browse/SPARK-7723

Author: Saleem Ansari <tuxdna@gmail.com>

Closes #6258 from tuxdna/master and squashes the following commits:

2bb5a42 [Saleem Ansari] Merge branch 'master' into mllib-pipeline
e39db9c [Saleem Ansari] Fix string interpolation in pipeline examples
---
 docs/ml-guide.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index b7b6376e061f7..cac705683c8bc 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -237,7 +237,7 @@ model2.transform(test.toDF)
   .select("features", "label", "myProbability", "prediction")
   .collect()
   .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
-    println("($features, $label) -> prob=$prob, prediction=$prediction")
+    println(s"($features, $label) -> prob=$prob, prediction=$prediction")
   }
 
 sc.stop()
@@ -391,7 +391,7 @@ model.transform(test.toDF)
   .select("id", "text", "probability", "prediction")
   .collect()
   .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
-    println("($id, $text) --> prob=$prob, prediction=$prediction")
+    println(s"($id, $text) --> prob=$prob, prediction=$prediction")
   }
 
 sc.stop()

From 6845cb2ff475fd794b30b01af5ebc80714b880f0 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Tue, 19 May 2015 08:24:57 -0700
Subject: [PATCH 071/525] [SPARK-7681] [MLLIB] remove mima excludes for 1.3

There excludes are unnecessary for 1.3 because the changes were made in 1.4.x.

Author: Xiangrui Meng <meng@databricks.com>

Closes #6254 from mengxr/SPARK-7681-mima and squashes the following commits:

7f0cea0 [Xiangrui Meng] remove mima excludes for 1.3
---
 project/MimaExcludes.scala | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index f8d0160f6445e..03e93a2f98f9b 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -187,14 +187,7 @@ object MimaExcludes {
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.mllib.linalg.Matrix.isTransposed"),
             ProblemFilters.exclude[MissingMethodProblem](
-              "org.apache.spark.mllib.linalg.Matrix.foreachActive"),
-            // SPARK-7681 add SparseVector support for gemv
-            ProblemFilters.exclude[MissingMethodProblem](
-              "org.apache.spark.mllib.linalg.Matrix.multiply"),
-            ProblemFilters.exclude[MissingMethodProblem](
-              "org.apache.spark.mllib.linalg.DenseMatrix.multiply"),
-            ProblemFilters.exclude[MissingMethodProblem](
-              "org.apache.spark.mllib.linalg.SparseMatrix.multiply")
+              "org.apache.spark.mllib.linalg.Matrix.foreachActive")
           ) ++ Seq(
             // SPARK-5540
             ProblemFilters.exclude[MissingMethodProblem](

From 32fa611b19c6b95d4563be631c5a8ff0cdf3438f Mon Sep 17 00:00:00 2001
From: Dice <poleon.kd@gmail.com>
Date: Tue, 19 May 2015 18:12:05 +0100
Subject: [PATCH 072/525] [SPARK-7704] Updating Programming Guides per
 SPARK-4397

The change per SPARK-4397 makes implicit objects in SparkContext to be found by the compiler automatically. So that we don't need to import the o.a.s.SparkContext._ explicitly any more and can remove some statements around the "implicit conversions" from the latest Programming Guides (1.3.0 and higher)

Author: Dice <poleon.kd@gmail.com>

Closes #6234 from daisukebe/patch-1 and squashes the following commits:

b77ecd9 [Dice] fix a typo
45dfcd3 [Dice] rewording per Sean's advice
a094bcf [Dice] Adding a note for users on any previous releases
a29be5f [Dice] Updating Programming Guides per SPARK-4397
---
 docs/programming-guide.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/docs/programming-guide.md b/docs/programming-guide.md
index 0c273769bb14b..07a4d29fe7104 100644
--- a/docs/programming-guide.md
+++ b/docs/programming-guide.md
@@ -41,14 +41,15 @@ In addition, if you wish to access an HDFS cluster, you need to add a dependency
     artifactId = hadoop-client
     version = <your-hdfs-version>
 
-Finally, you need to import some Spark classes and implicit conversions into your program. Add the following lines:
+Finally, you need to import some Spark classes into your program. Add the following lines:
 
 {% highlight scala %}
 import org.apache.spark.SparkContext
-import org.apache.spark.SparkContext._
 import org.apache.spark.SparkConf
 {% endhighlight %}
 
+(Before Spark 1.3.0, you need to explicitly `import org.apache.spark.SparkContext._` to enable essential implicit conversions.)
+
 </div>
 
 <div data-lang="java"  markdown="1">
@@ -821,11 +822,9 @@ by a key.
 
 In Scala, these operations are automatically available on RDDs containing
 [Tuple2](http://www.scala-lang.org/api/{{site.SCALA_VERSION}}/index.html#scala.Tuple2) objects
-(the built-in tuples in the language, created by simply writing `(a, b)`), as long as you
-import `org.apache.spark.SparkContext._` in your program to enable Spark's implicit
-conversions. The key-value pair operations are available in the
+(the built-in tuples in the language, created by simply writing `(a, b)`). The key-value pair operations are available in the
 [PairRDDFunctions](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) class,
-which automatically wraps around an RDD of tuples if you import the conversions.
+which automatically wraps around an RDD of tuples.
 
 For example, the following code uses the `reduceByKey` operation on key-value pairs to count how
 many times each line of text occurs in a file:

From fb90273212dc7241c9a0c3446e25e0e0b9377750 Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Tue, 19 May 2015 10:55:21 -0700
Subject: [PATCH 073/525] [SPARK-7047] [ML] ml.Model optional parent support

Made Model.parent transient.  Added Model.hasParent to test for null parent

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #5914 from jkbradley/parent-optional and squashes the following commits:

d501774 [Joseph K. Bradley] Made Model.parent transient.  Added Model.hasParent to test for null parent
---
 mllib/src/main/scala/org/apache/spark/ml/Model.scala         | 5 ++++-
 .../spark/ml/classification/LogisticRegressionSuite.scala    | 1 +
 .../ml/classification/RandomForestClassifierSuite.scala      | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Model.scala b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
index 7fd515369b19b..70e7495ac616c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Model.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
@@ -32,7 +32,7 @@ abstract class Model[M <: Model[M]] extends Transformer {
    * The parent estimator that produced this model.
    * Note: For ensembles' component Models, this value can be null.
    */
-  var parent: Estimator[M] = _
+  @transient var parent: Estimator[M] = _
 
   /**
    * Sets the parent of this model (Java API).
@@ -42,6 +42,9 @@ abstract class Model[M <: Model[M]] extends Transformer {
     this.asInstanceOf[M]
   }
 
+  /** Indicates whether this [[Model]] has a corresponding parent. */
+  def hasParent: Boolean = parent != null
+
   override def copy(extra: ParamMap): M = {
     // The default implementation of Params.copy doesn't work for models.
     throw new NotImplementedError(s"${this.getClass} doesn't implement copy(extra: ParamMap)")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 43765241a20b6..97f9749cb4a9a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -83,6 +83,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
     assert(model.getRawPredictionCol === "rawPrediction")
     assert(model.getProbabilityCol === "probability")
     assert(model.intercept !== 0.0)
+    assert(model.hasParent)
   }
 
   test("logistic regression doesn't fit intercept when fitIntercept is off") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
index 08f86fa45bc1d..cdbbacab8e0e3 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
@@ -162,5 +162,7 @@ private object RandomForestClassifierSuite {
     val oldModelAsNew = RandomForestClassificationModel.fromOld(
       oldModel, newModel.parent.asInstanceOf[RandomForestClassifier], categoricalFeatures)
     TreeTests.checkEqual(oldModelAsNew, newModel)
+    assert(newModel.hasParent)
+    assert(!newModel.trees.head.asInstanceOf[DecisionTreeClassificationModel].hasParent)
   }
 }

From 7b16e9f2118fbfbb1c0ba957161fe500c9aff82a Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Tue, 19 May 2015 10:57:47 -0700
Subject: [PATCH 074/525] [SPARK-7678] [ML] Fix default random seed in HasSeed

Changed shared param HasSeed to have default based on hashCode of class name, instead of random number.
Also, removed fixed random seeds from Word2Vec and ALS.

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #6251 from jkbradley/scala-fixed-seed and squashes the following commits:

0e37184 [Joseph K. Bradley] Fixed Word2VecSuite, ALSSuite in spark.ml to use original fixed random seeds
678ec3a [Joseph K. Bradley] Removed fixed random seeds from Word2Vec and ALS. Changed shared param HasSeed to have default based on hashCode of class name, instead of random number.
---
 .../org/apache/spark/ml/feature/Word2Vec.scala   |  1 -
 .../ml/param/shared/SharedParamsCodeGen.scala    |  2 +-
 .../spark/ml/param/shared/sharedParams.scala     |  4 ++--
 .../org/apache/spark/ml/recommendation/ALS.scala |  2 +-
 .../apache/spark/ml/feature/Word2VecSuite.scala  |  1 +
 .../spark/ml/recommendation/ALSSuite.scala       | 16 +++++++++-------
 6 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 8ace8c53bb663..90f0be76df44f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -68,7 +68,6 @@ private[feature] trait Word2VecBase extends Params
 
   setDefault(stepSize -> 0.025)
   setDefault(maxIter -> 1)
-  setDefault(seed -> 42L)
 
   /**
    * Validate and transform the input schema.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index 5085b798daa17..8b8cb81373a65 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -53,7 +53,7 @@ private[shared] object SharedParamsCodeGen {
       ParamDesc[Int]("checkpointInterval", "checkpoint interval (>= 1)",
         isValid = "ParamValidators.gtEq(1)"),
       ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")),
-      ParamDesc[Long]("seed", "random seed", Some("Utils.random.nextLong()")),
+      ParamDesc[Long]("seed", "random seed", Some("this.getClass.getName.hashCode.toLong")),
       ParamDesc[Double]("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]." +
         " For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.",
         isValid = "ParamValidators.inRange(0, 1)"),
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index 7525d37007377..3a4976d3ddcd1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -232,7 +232,7 @@ private[ml] trait HasFitIntercept extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param seed (default: Utils.random.nextLong()).
+ * (private[ml]) Trait for shared param seed (default: this.getClass.getName.hashCode.toLong).
  */
 private[ml] trait HasSeed extends Params {
 
@@ -242,7 +242,7 @@ private[ml] trait HasSeed extends Params {
    */
   final val seed: LongParam = new LongParam(this, "seed", "random seed")
 
-  setDefault(seed, Utils.random.nextLong())
+  setDefault(seed, this.getClass.getName.hashCode.toLong)
 
   /** @group getParam */
   final def getSeed: Long = $(seed)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index 45c57b50da70f..2a5ddbfae5cdf 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -148,7 +148,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR
 
   setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10,
     implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item",
-    ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10, seed -> 0L)
+    ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10)
 
   /**
    * Validates and transforms the input schema.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 03ba86670d453..43a09cc418703 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -52,6 +52,7 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
       .setVectorSize(3)
       .setInputCol("text")
       .setOutputCol("result")
+      .setSeed(42L)
       .fit(docDF)
 
     model.transform(docDF).select("result", "expected").collect().foreach {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
index fc7349330cf86..6cc6ec94eb643 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
@@ -345,6 +345,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
       .setImplicitPrefs(implicitPrefs)
       .setNumUserBlocks(numUserBlocks)
       .setNumItemBlocks(numItemBlocks)
+      .setSeed(0)
     val alpha = als.getAlpha
     val model = als.fit(training.toDF())
     val predictions = model.transform(test.toDF())
@@ -425,17 +426,18 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
     val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
 
     val longRatings = ratings.map(r => Rating(r.user.toLong, r.item.toLong, r.rating))
-    val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4)
+    val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4, seed = 0)
     assert(longUserFactors.first()._1.getClass === classOf[Long])
 
     val strRatings = ratings.map(r => Rating(r.user.toString, r.item.toString, r.rating))
-    val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4)
+    val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4, seed = 0)
     assert(strUserFactors.first()._1.getClass === classOf[String])
   }
 
   test("nonnegative constraint") {
     val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
-    val (userFactors, itemFactors) = ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true)
+    val (userFactors, itemFactors) =
+      ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true, seed = 0)
     def isNonnegative(factors: RDD[(Int, Array[Float])]): Boolean = {
       factors.values.map { _.forall(_ >= 0.0) }.reduce(_ && _)
     }
@@ -459,7 +461,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
   test("partitioner in returned factors") {
     val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
     val (userFactors, itemFactors) = ALS.train(
-      ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4)
+      ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4, seed = 0)
     for ((tpe, factors) <- Seq(("User", userFactors), ("Item", itemFactors))) {
       assert(userFactors.partitioner.isDefined, s"$tpe factors should have partitioner.")
       val part = userFactors.partitioner.get
@@ -476,8 +478,8 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
 
   test("als with large number of iterations") {
     val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
-    ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2)
-    ALS.train(
-      ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, implicitPrefs = true)
+    ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, seed = 0)
+    ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2,
+      implicitPrefs = true, seed = 0)
   }
 }

From 3c4c1f96474b3e66fa1d44ac0177f548cf5a3a10 Mon Sep 17 00:00:00 2001
From: Iulian Dragos <jaguarul@gmail.com>
Date: Tue, 19 May 2015 12:14:48 -0700
Subject: [PATCH 075/525] [SPARK-7726] Fix Scaladoc false errors

Visibility rules for static members are different in Scala and Java, and this case requires an explicit static import. Even though these are Java files, they are run through scaladoc, which enforces Scala rules.

Also reverted the commit that reverts the upgrade to 2.11.6

Author: Iulian Dragos <jaguarul@gmail.com>

Closes #6260 from dragos/issue/scaladoc-false-error and squashes the following commits:

f2e998e [Iulian Dragos] Revert "[HOTFIX] Revert "[SPARK-7092] Update spark scala version to 2.11.6""
0bad052 [Iulian Dragos] Fix scaladoc faux-error.
---
 .../org/apache/spark/network/shuffle/protocol/OpenBlocks.java | 3 +++
 .../spark/network/shuffle/protocol/RegisterExecutor.java      | 3 +++
 .../apache/spark/network/shuffle/protocol/StreamHandle.java   | 3 +++
 .../apache/spark/network/shuffle/protocol/UploadBlock.java    | 3 +++
 pom.xml                                                       | 4 ++--
 .../src/main/scala/org/apache/spark/repl/SparkIMain.scala     | 2 +-
 6 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/OpenBlocks.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/OpenBlocks.java
index 60485bace643c..ce954b8a289e4 100644
--- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/OpenBlocks.java
+++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/OpenBlocks.java
@@ -24,6 +24,9 @@
 
 import org.apache.spark.network.protocol.Encoders;
 
+// Needed by ScalaDoc. See SPARK-7726
+import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type;
+
 /** Request to read a set of blocks. Returns {@link StreamHandle}. */
 public class OpenBlocks extends BlockTransferMessage {
   public final String appId;
diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java
index 38acae3b31d64..cca8b17c4f129 100644
--- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java
+++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java
@@ -22,6 +22,9 @@
 
 import org.apache.spark.network.protocol.Encoders;
 
+// Needed by ScalaDoc. See SPARK-7726
+import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type;
+
 /**
  * Initial registration message between an executor and its local shuffle server.
  * Returns nothing (empty bye array).
diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/StreamHandle.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/StreamHandle.java
index 9a9220211a50c..1915295aa6cc2 100644
--- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/StreamHandle.java
+++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/StreamHandle.java
@@ -20,6 +20,9 @@
 import com.google.common.base.Objects;
 import io.netty.buffer.ByteBuf;
 
+// Needed by ScalaDoc. See SPARK-7726
+import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type;
+
 /**
  * Identifier for a fixed number of chunks to read from a stream created by an "open blocks"
  * message. This is used by {@link org.apache.spark.network.shuffle.OneForOneBlockFetcher}.
diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java
index 2ff9aaa650f92..3caed59d508fd 100644
--- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java
+++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java
@@ -24,6 +24,9 @@
 
 import org.apache.spark.network.protocol.Encoders;
 
+// Needed by ScalaDoc. See SPARK-7726
+import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type;
+
 
 /** Request to upload a block with a certain StorageLevel. Returns nothing (empty byte array). */
 public class UploadBlock extends BlockTransferMessage {
diff --git a/pom.xml b/pom.xml
index d903f02c1aed0..c72d7cbf843ef 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1799,9 +1799,9 @@
         <property><name>scala-2.11</name></property>
       </activation>
       <properties>
-        <scala.version>2.11.2</scala.version>
+        <scala.version>2.11.6</scala.version>
         <scala.binary.version>2.11</scala.binary.version>
-        <jline.version>2.12</jline.version>
+        <jline.version>2.12.1</jline.version>
         <jline.groupid>jline</jline.groupid>
       </properties>
     </profile>
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
index 1bb62c84abddc..1cb910f376060 100644
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
+++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
@@ -1129,7 +1129,7 @@ class SparkIMain(@BeanProperty val factory: ScriptEngineFactory, initialSettings
 
     def apply(line: String): Result = debugging(s"""parse("$line")""")  {
       var isIncomplete = false
-      currentRun.reporting.withIncompleteHandler((_, _) => isIncomplete = true) {
+      currentRun.parsing.withIncompleteHandler((_, _) => isIncomplete = true) {
         reporter.reset()
         val trees = newUnitParser(line).parseStats()
         if (reporter.hasErrors) Error

From 68fb2a46edc95f867d4b28597d20da2597f008c1 Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Tue, 19 May 2015 13:43:48 -0700
Subject: [PATCH 076/525] [SPARK-7586] [ML] [DOC] Add docs of Word2Vec in ml
 package

CC jkbradley.

JIRA [issue](https://issues.apache.org/jira/browse/SPARK-7586).

Author: Xusen Yin <yinxusen@gmail.com>

Closes #6181 from yinxusen/SPARK-7586 and squashes the following commits:

77014c5 [Xusen Yin] comment fix
57a4c07 [Xusen Yin] small fix for docs
1178c8f [Xusen Yin] remove the correctness check in java suite
1c3f389 [Xusen Yin] delete sbt commit
1af152b [Xusen Yin] check python example code
1b5369e [Xusen Yin] add docs of word2vec
---
 docs/ml-features.md                           | 89 +++++++++++++++++++
 .../spark/ml/feature/JavaWord2VecSuite.java   | 76 ++++++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java

diff --git a/docs/ml-features.md b/docs/ml-features.md
index e86f9edc4f68b..63ea3e5db7ac9 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -106,6 +106,95 @@ for features_label in featurized.select("features", "label").take(3):
 </div>
 </div>
 
+## Word2Vec
+
+`Word2Vec` is an `Estimator` which takes sequences of words that represents documents and trains a `Word2VecModel`. The model is a `Map(String, Vector)` essentially, which maps each word to an unique fix-sized vector. The `Word2VecModel` transforms each documents into a vector using the average of all words in the document, which aims to other computations of documents such as similarity calculation consequencely. Please refer to the [MLlib user guide on Word2Vec](mllib-feature-extraction.html#Word2Vec) for more details on Word2Vec.
+
+Word2Vec is implemented in [Word2Vec](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec). In the following code segment, we start with a set of documents, each of them is represented as a sequence of words. For each document, we transform it into a feature vector. This feature vector could then be passed to a learning algorithm.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.Word2Vec
+
+// Input data: Each row is a bag of words from a sentence or document.
+val documentDF = sqlContext.createDataFrame(Seq(
+  "Hi I heard about Spark".split(" "),
+  "I wish Java could use case classes".split(" "),
+  "Logistic regression models are neat".split(" ")
+).map(Tuple1.apply)).toDF("text")
+
+// Learn a mapping from words to Vectors.
+val word2Vec = new Word2Vec()
+  .setInputCol("text")
+  .setOutputCol("result")
+  .setVectorSize(3)
+  .setMinCount(0)
+val model = word2Vec.fit(documentDF)
+val result = model.transform(documentDF)
+result.select("result").take(3).foreach(println)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+
+JavaSparkContext jsc = ...
+SQLContext sqlContext = ...
+
+// Input data: Each row is a bag of words from a sentence or document.
+JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create(Lists.newArrayList("Hi I heard about Spark".split(" "))),
+  RowFactory.create(Lists.newArrayList("I wish Java could use case classes".split(" "))),
+  RowFactory.create(Lists.newArrayList("Logistic regression models are neat".split(" ")))
+));
+StructType schema = new StructType(new StructField[]{
+  new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+});
+DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
+
+// Learn a mapping from words to Vectors.
+Word2Vec word2Vec = new Word2Vec()
+  .setInputCol("text")
+  .setOutputCol("result")
+  .setVectorSize(3)
+  .setMinCount(0);
+Word2VecModel model = word2Vec.fit(documentDF);
+DataFrame result = model.transform(documentDF);
+for (Row r: result.select("result").take(3)) {
+  System.out.println(r);
+}
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.ml.feature import Word2Vec
+
+# Input data: Each row is a bag of words from a sentence or document.
+documentDF = sqlContext.createDataFrame([
+  ("Hi I heard about Spark".split(" "), ),
+  ("I wish Java could use case classes".split(" "), ),
+  ("Logistic regression models are neat".split(" "), )
+], ["text"])
+# Learn a mapping from words to Vectors.
+word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
+model = word2Vec.fit(documentDF)
+result = model.transform(documentDF)
+for feature in result.select("result").take(3):
+  print(feature)
+{% endhighlight %}
+</div>
+</div>
 
 # Feature Transformers
 
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java
new file mode 100644
index 0000000000000..39c70157f83c0
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+
+public class JavaWord2VecSuite {
+  private transient JavaSparkContext jsc;
+  private transient SQLContext sqlContext;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaWord2VecSuite");
+    sqlContext = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    jsc = null;
+  }
+
+  @Test
+  public void testJavaWord2Vec() {
+    JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+      RowFactory.create(Lists.newArrayList("Hi I heard about Spark".split(" "))),
+      RowFactory.create(Lists.newArrayList("I wish Java could use case classes".split(" "))),
+      RowFactory.create(Lists.newArrayList("Logistic regression models are neat".split(" ")))
+    ));
+    StructType schema = new StructType(new StructField[]{
+      new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+    });
+    DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
+
+    Word2Vec word2Vec = new Word2Vec()
+      .setInputCol("text")
+      .setOutputCol("result")
+      .setVectorSize(3)
+      .setMinCount(0);
+    Word2VecModel model = word2Vec.fit(documentDF);
+    DataFrame result = model.transform(documentDF);
+
+    for (Row r: result.select("result").collect()) {
+      double[] polyFeatures = ((Vector)r.get(0)).toArray();
+      Assert.assertEquals(polyFeatures.length, 3);
+    }
+  }
+}

From c12dff9b82e4869f866a9b96ce0bf05503dd7dda Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Tue, 19 May 2015 13:53:08 -0700
Subject: [PATCH 077/525] [SPARK-7652] [MLLIB] Update the implementation of
 naive Bayes prediction with BLAS

JIRA: https://issues.apache.org/jira/browse/SPARK-7652

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6189 from viirya/naive_bayes_blas_prediction and squashes the following commits:

ab611fd [Liang-Chi Hsieh] Remove unnecessary space.
ddc48b9 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into naive_bayes_blas_prediction
b5772b4 [Liang-Chi Hsieh] Fix binary compatibility.
2f65186 [Liang-Chi Hsieh] Remove toDense.
1b6cdfe [Liang-Chi Hsieh] Update the implementation of naive Bayes prediction with BLAS.
---
 .../mllib/classification/NaiveBayes.scala     | 41 +++++++++++--------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index ac0ebeceaa1df..53fb2cba03cbf 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -21,13 +21,11 @@ import java.lang.{Iterable => JIterable}
 
 import scala.collection.JavaConverters._
 
-import breeze.linalg.{Axis, DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum}
-import breeze.numerics.{exp => brzExp, log => brzLog}
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.{Logging, SparkContext, SparkException}
-import org.apache.spark.mllib.linalg.{BLAS, DenseVector, SparseVector, Vector}
+import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
@@ -50,6 +48,9 @@ class NaiveBayesModel private[mllib] (
     val modelType: String)
   extends ClassificationModel with Serializable with Saveable {
 
+  private val piVector = new DenseVector(pi)
+  private val thetaMatrix = new DenseMatrix(labels.size, theta(0).size, theta.flatten, true)
+
   private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) =
     this(labels, pi, theta, "Multinomial")
 
@@ -60,17 +61,18 @@ class NaiveBayesModel private[mllib] (
       theta: JIterable[JIterable[Double]]) =
     this(labels.asScala.toArray, pi.asScala.toArray, theta.asScala.toArray.map(_.asScala.toArray))
 
-  private val brzPi = new BDV[Double](pi)
-  private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t
-
   // Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0.
-  // This precomputes log(1.0 - exp(theta)) and its sum  which are used for the  linear algebra
+  // This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra
   // application of this condition (in predict function).
-  private val (brzNegTheta, brzNegThetaSum) = modelType match {
+  private val (thetaMinusNegTheta, negThetaSum) = modelType match {
     case "Multinomial" => (None, None)
     case "Bernoulli" =>
-      val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x))
-      (Option(negTheta), Option(brzSum(negTheta, Axis._1)))
+      val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
+      val ones = new DenseVector(Array.fill(thetaMatrix.numCols){1.0})
+      val thetaMinusNegTheta = thetaMatrix.map { value =>
+        value - math.log(1.0 - math.exp(value))
+      }
+      (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
     case _ =>
       // This should never happen.
       throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
@@ -85,17 +87,22 @@ class NaiveBayesModel private[mllib] (
   }
 
   override def predict(testData: Vector): Double = {
-    val brzData = testData.toBreeze
     modelType match {
       case "Multinomial" =>
-        labels(brzArgmax(brzPi + brzTheta * brzData))
+        val prob = thetaMatrix.multiply(testData)
+        BLAS.axpy(1.0, piVector, prob)
+        labels(prob.argmax)
       case "Bernoulli" =>
-        if (!brzData.forall(v => v == 0.0 || v == 1.0)) {
-          throw new SparkException(
-            s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $testData.")
+        testData.foreachActive { (index, value) =>
+          if (value != 0.0 && value != 1.0) {
+            throw new SparkException(
+              s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $testData.")
+          }
         }
-        labels(brzArgmax(brzPi +
-          (brzTheta - brzNegTheta.get) * brzData + brzNegThetaSum.get))
+        val prob = thetaMinusNegTheta.get.multiply(testData)
+        BLAS.axpy(1.0, piVector, prob)
+        BLAS.axpy(1.0, negThetaSum.get, prob)
+        labels(prob.argmax)
       case _ =>
         // This should never happen.
         throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")

From 4de74d2602f6577c3c8458aa85377e89c19724ca Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Tue, 19 May 2015 14:23:28 -0700
Subject: [PATCH 078/525] [SPARK-7738] [SQL] [PySpark] add reader and writer
 API in Python

cc rxin, please take a quick look, I'm working on tests.

Author: Davies Liu <davies@databricks.com>

Closes #6238 from davies/readwrite and squashes the following commits:

c7200eb [Davies Liu] update tests
9cbf01b [Davies Liu] Merge branch 'master' of github.com:apache/spark into readwrite
f0c5a04 [Davies Liu] use sqlContext.read.load
5f68bc8 [Davies Liu] update tests
6437e9a [Davies Liu] Merge branch 'master' of github.com:apache/spark into readwrite
bcc6668 [Davies Liu] add reader amd writer API in Python
---
 .../apache/spark/api/python/PythonUtils.scala |  11 +-
 python/pyspark/sql/__init__.py                |   1 +
 python/pyspark/sql/context.py                 |  28 +-
 python/pyspark/sql/dataframe.py               |  67 ++--
 python/pyspark/sql/readwriter.py              | 338 ++++++++++++++++++
 python/pyspark/sql/tests.py                   |  77 ++--
 6 files changed, 430 insertions(+), 92 deletions(-)
 create mode 100644 python/pyspark/sql/readwriter.py

diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
index efb6b93cfc35d..90dacaeb93429 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
@@ -50,8 +50,15 @@ private[spark] object PythonUtils {
   /**
    * Convert list of T into seq of T (for calling API with varargs)
    */
-  def toSeq[T](cols: JList[T]): Seq[T] = {
-    cols.toList.toSeq
+  def toSeq[T](vs: JList[T]): Seq[T] = {
+    vs.toList.toSeq
+  }
+
+  /**
+   * Convert list of T into array of T (for calling API with array)
+   */
+  def toArray[T](vs: JList[T]): Array[T] = {
+    vs.toArray().asInstanceOf[Array[T]]
   }
 
   /**
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 19805e291e91b..634c575ecd80e 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -58,6 +58,7 @@
 from pyspark.sql.column import Column
 from pyspark.sql.dataframe import DataFrame, SchemaRDD, DataFrameNaFunctions, DataFrameStatFunctions
 from pyspark.sql.group import GroupedData
+from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter
 
 __all__ = [
     'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 9f26d13235d5f..7543475014bd2 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -31,6 +31,7 @@
 from pyspark.sql.types import Row, StringType, StructType, _verify_type, \
     _infer_schema, _has_nulltype, _merge_type, _create_converter, _python_to_sql_converter
 from pyspark.sql.dataframe import DataFrame
+from pyspark.sql.readwriter import DataFrameReader
 
 try:
     import pandas
@@ -457,19 +458,7 @@ def load(self, path=None, source=None, schema=None, **options):
 
         Optionally, a schema can be provided as the schema of the returned DataFrame.
         """
-        if path is not None:
-            options["path"] = path
-        if source is None:
-            source = self.getConf("spark.sql.sources.default",
-                                  "org.apache.spark.sql.parquet")
-        if schema is None:
-            df = self._ssql_ctx.load(source, options)
-        else:
-            if not isinstance(schema, StructType):
-                raise TypeError("schema should be StructType")
-            scala_datatype = self._ssql_ctx.parseDataType(schema.json())
-            df = self._ssql_ctx.load(source, scala_datatype, options)
-        return DataFrame(df, self)
+        return self.read.load(path, source, schema, **options)
 
     def createExternalTable(self, tableName, path=None, source=None,
                             schema=None, **options):
@@ -567,6 +556,19 @@ def clearCache(self):
         """Removes all cached tables from the in-memory cache. """
         self._ssql_ctx.clearCache()
 
+    @property
+    def read(self):
+        """
+        Returns a :class:`DataFrameReader` that can be used to read data
+        in as a :class:`DataFrame`.
+
+        ::note: Experimental
+
+        >>> sqlContext.read
+        <pyspark.sql.readwriter.DataFrameReader object at ...>
+        """
+        return DataFrameReader(self)
+
 
 class HiveContext(SQLContext):
     """A variant of Spark SQL that integrates with data stored in Hive.
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index e4a191a9ef07f..f2280b5100e53 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -29,9 +29,10 @@
 from pyspark.serializers import BatchedSerializer, PickleSerializer, UTF8Deserializer
 from pyspark.storagelevel import StorageLevel
 from pyspark.traceback_utils import SCCallSiteSync
-from pyspark.sql.types import *
 from pyspark.sql.types import _create_cls, _parse_datatype_json_string
 from pyspark.sql.column import Column, _to_seq, _to_java_column
+from pyspark.sql.readwriter import DataFrameWriter
+from pyspark.sql.types import *
 
 __all__ = ["DataFrame", "SchemaRDD", "DataFrameNaFunctions", "DataFrameStatFunctions"]
 
@@ -151,25 +152,6 @@ def insertInto(self, tableName, overwrite=False):
         """
         self._jdf.insertInto(tableName, overwrite)
 
-    def _java_save_mode(self, mode):
-        """Returns the Java save mode based on the Python save mode represented by a string.
-        """
-        jSaveMode = self._sc._jvm.org.apache.spark.sql.SaveMode
-        jmode = jSaveMode.ErrorIfExists
-        mode = mode.lower()
-        if mode == "append":
-            jmode = jSaveMode.Append
-        elif mode == "overwrite":
-            jmode = jSaveMode.Overwrite
-        elif mode == "ignore":
-            jmode = jSaveMode.Ignore
-        elif mode == "error":
-            pass
-        else:
-            raise ValueError(
-                "Only 'append', 'overwrite', 'ignore', and 'error' are acceptable save mode.")
-        return jmode
-
     def saveAsTable(self, tableName, source=None, mode="error", **options):
         """Saves the contents of this :class:`DataFrame` to a data source as a table.
 
@@ -185,11 +167,7 @@ def saveAsTable(self, tableName, source=None, mode="error", **options):
         * `error`: Throw an exception if data already exists.
         * `ignore`: Silently ignore this operation if data already exists.
         """
-        if source is None:
-            source = self.sql_ctx.getConf("spark.sql.sources.default",
-                                          "org.apache.spark.sql.parquet")
-        jmode = self._java_save_mode(mode)
-        self._jdf.saveAsTable(tableName, source, jmode, options)
+        self.write.saveAsTable(tableName, source, mode, **options)
 
     def save(self, path=None, source=None, mode="error", **options):
         """Saves the contents of the :class:`DataFrame` to a data source.
@@ -206,13 +184,22 @@ def save(self, path=None, source=None, mode="error", **options):
         * `error`: Throw an exception if data already exists.
         * `ignore`: Silently ignore this operation if data already exists.
         """
-        if path is not None:
-            options["path"] = path
-        if source is None:
-            source = self.sql_ctx.getConf("spark.sql.sources.default",
-                                          "org.apache.spark.sql.parquet")
-        jmode = self._java_save_mode(mode)
-        self._jdf.save(source, jmode, options)
+        return self.write.save(path, source, mode, **options)
+
+    @property
+    def write(self):
+        """
+        Interface for saving the content of the :class:`DataFrame` out
+        into external storage.
+
+        :return :class:`DataFrameWriter`
+
+        ::note: Experimental
+
+        >>> df.write
+        <pyspark.sql.readwriter.DataFrameWriter object at ...>
+        """
+        return DataFrameWriter(self)
 
     @property
     def schema(self):
@@ -411,9 +398,19 @@ def unpersist(self, blocking=True):
         self._jdf.unpersist(blocking)
         return self
 
-    # def coalesce(self, numPartitions, shuffle=False):
-    #     rdd = self._jdf.coalesce(numPartitions, shuffle, None)
-    #     return DataFrame(rdd, self.sql_ctx)
+    def coalesce(self, numPartitions):
+        """
+        Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.
+
+        Similar to coalesce defined on an :class:`RDD`, this operation results in a
+        narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,
+        there will not be a shuffle, instead each of the 100 new partitions will
+        claim 10 of the current partitions.
+
+        >>> df.coalesce(1).rdd.getNumPartitions()
+        1
+        """
+        return DataFrame(self._jdf.coalesce(numPartitions), self.sql_ctx)
 
     def repartition(self, numPartitions):
         """Returns a new :class:`DataFrame` that has exactly ``numPartitions`` partitions.
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
new file mode 100644
index 0000000000000..e2b27fb587e73
--- /dev/null
+++ b/python/pyspark/sql/readwriter.py
@@ -0,0 +1,338 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from py4j.java_gateway import JavaClass
+
+from pyspark.sql.column import _to_seq
+from pyspark.sql.types import *
+
+__all__ = ["DataFrameReader", "DataFrameWriter"]
+
+
+class DataFrameReader(object):
+    """
+    Interface used to load a :class:`DataFrame` from external storage systems
+    (e.g. file systems, key-value stores, etc). Use :func:`SQLContext.read`
+    to access this.
+
+    ::Note: Experimental
+    """
+
+    def __init__(self, sqlContext):
+        self._jreader = sqlContext._ssql_ctx.read()
+        self._sqlContext = sqlContext
+
+    def _df(self, jdf):
+        from pyspark.sql.dataframe import DataFrame
+        return DataFrame(jdf, self._sqlContext)
+
+    def load(self, path=None, format=None, schema=None, **options):
+        """Loads data from a data source and returns it as a :class`DataFrame`.
+
+        :param path: optional string for file-system backed data sources.
+        :param format: optional string for format of the data source. Default to 'parquet'.
+        :param schema: optional :class:`StructType` for the input schema.
+        :param options: all other string options
+        """
+        jreader = self._jreader
+        if format is not None:
+            jreader = jreader.format(format)
+        if schema is not None:
+            if not isinstance(schema, StructType):
+                raise TypeError("schema should be StructType")
+            jschema = self._sqlContext._ssql_ctx.parseDataType(schema.json())
+            jreader = jreader.schema(jschema)
+        for k in options:
+            jreader = jreader.option(k, options[k])
+        if path is not None:
+            return self._df(jreader.load(path))
+        else:
+            return self._df(jreader.load())
+
+    def json(self, path, schema=None):
+        """
+        Loads a JSON file (one object per line) and returns the result as
+        a :class`DataFrame`.
+
+        If the ``schema`` parameter is not specified, this function goes
+        through the input once to determine the input schema.
+
+        :param path: string, path to the JSON dataset.
+        :param schema: an optional :class:`StructType` for the input schema.
+
+        >>> import tempfile, shutil
+        >>> jsonFile = tempfile.mkdtemp()
+        >>> shutil.rmtree(jsonFile)
+        >>> with open(jsonFile, 'w') as f:
+        ...     f.writelines(jsonStrings)
+        >>> df1 = sqlContext.read.json(jsonFile)
+        >>> df1.printSchema()
+        root
+         |-- field1: long (nullable = true)
+         |-- field2: string (nullable = true)
+         |-- field3: struct (nullable = true)
+         |    |-- field4: long (nullable = true)
+
+        >>> from pyspark.sql.types import *
+        >>> schema = StructType([
+        ...     StructField("field2", StringType()),
+        ...     StructField("field3",
+        ...         StructType([StructField("field5", ArrayType(IntegerType()))]))])
+        >>> df2 = sqlContext.read.json(jsonFile, schema)
+        >>> df2.printSchema()
+        root
+         |-- field2: string (nullable = true)
+         |-- field3: struct (nullable = true)
+         |    |-- field5: array (nullable = true)
+         |    |    |-- element: integer (containsNull = true)
+        """
+        if schema is None:
+            jdf = self._jreader.json(path)
+        else:
+            jschema = self._sqlContext._ssql_ctx.parseDataType(schema.json())
+            jdf = self._jreader.schema(jschema).json(path)
+        return self._df(jdf)
+
+    def table(self, tableName):
+        """Returns the specified table as a :class:`DataFrame`.
+
+        >>> sqlContext.registerDataFrameAsTable(df, "table1")
+        >>> df2 = sqlContext.read.table("table1")
+        >>> sorted(df.collect()) == sorted(df2.collect())
+        True
+        """
+        return self._df(self._jreader.table(tableName))
+
+    def parquet(self, *path):
+        """Loads a Parquet file, returning the result as a :class:`DataFrame`.
+
+        >>> import tempfile, shutil
+        >>> parquetFile = tempfile.mkdtemp()
+        >>> shutil.rmtree(parquetFile)
+        >>> df.saveAsParquetFile(parquetFile)
+        >>> df2 = sqlContext.read.parquet(parquetFile)
+        >>> sorted(df.collect()) == sorted(df2.collect())
+        True
+        """
+        return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))
+
+    def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None,
+             predicates=None, properties={}):
+        """
+        Construct a :class:`DataFrame` representing the database table accessible
+        via JDBC URL `url` named `table` and connection `properties`.
+
+        The `column` parameter could be used to partition the table, then it will
+        be retrieved in parallel based on the parameters passed to this function.
+
+        The `predicates` parameter gives a list expressions suitable for inclusion
+        in WHERE clauses; each one defines one partition of the :class:`DataFrame`.
+
+        ::Note: Don't create too many partitions in parallel on a large cluster;
+        otherwise Spark might crash your external database systems.
+
+        :param url: a JDBC URL
+        :param table: name of table
+        :param column: the column used to partition
+        :param lowerBound: the lower bound of partition column
+        :param upperBound: the upper bound of the partition column
+        :param numPartitions: the number of partitions
+        :param predicates: a list of expressions
+        :param properties: JDBC database connection arguments, a list of arbitrary string
+                           tag/value. Normally at least a "user" and "password" property
+                           should be included.
+        :return: a DataFrame
+        """
+        jprop = JavaClass("java.util.Properties", self._sqlContext._sc._gateway._gateway_client)()
+        for k in properties:
+            jprop.setProperty(k, properties[k])
+        if column is not None:
+            if numPartitions is None:
+                numPartitions = self._sqlContext._sc.defaultParallelism
+            return self._df(self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound),
+                                               int(numPartitions), jprop))
+        if predicates is not None:
+            arr = self._sqlContext._sc._jvm.PythonUtils.toArray(predicates)
+            return self._df(self._jreader.jdbc(url, table, arr, jprop))
+        return self._df(self._jreader.jdbc(url, table, jprop))
+
+
+class DataFrameWriter(object):
+    """
+    Interface used to write a [[DataFrame]] to external storage systems
+    (e.g. file systems, key-value stores, etc). Use :func:`DataFrame.write`
+    to access this.
+
+    ::Note: Experimental
+    """
+    def __init__(self, df):
+        self._df = df
+        self._sqlContext = df.sql_ctx
+        self._jwrite = df._jdf.write()
+
+    def save(self, path=None, format=None, mode="error", **options):
+        """
+        Saves the contents of the :class:`DataFrame` to a data source.
+
+        The data source is specified by the ``format`` and a set of ``options``.
+        If ``format`` is not specified, the default data source configured by
+        ``spark.sql.sources.default`` will be used.
+
+        Additionally, mode is used to specify the behavior of the save operation when
+        data already exists in the data source. There are four modes:
+
+        * `append`: Append contents of this :class:`DataFrame` to existing data.
+        * `overwrite`: Overwrite existing data.
+        * `error`: Throw an exception if data already exists.
+        * `ignore`: Silently ignore this operation if data already exists.
+
+        :param path: the path in a Hadoop supported file system
+        :param format: the format used to save
+        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        :param options: all other string options
+        """
+        jwrite = self._jwrite.mode(mode)
+        if format is not None:
+            jwrite = jwrite.format(format)
+        for k in options:
+            jwrite = jwrite.option(k, options[k])
+        if path is None:
+            jwrite.save()
+        else:
+            jwrite.save(path)
+
+    def saveAsTable(self, name, format=None, mode="error", **options):
+        """
+        Saves the contents of this :class:`DataFrame` to a data source as a table.
+
+        The data source is specified by the ``source`` and a set of ``options``.
+        If ``source`` is not specified, the default data source configured by
+        ``spark.sql.sources.default`` will be used.
+
+        Additionally, mode is used to specify the behavior of the saveAsTable operation when
+        table already exists in the data source. There are four modes:
+
+        * `append`: Append contents of this :class:`DataFrame` to existing data.
+        * `overwrite`: Overwrite existing data.
+        * `error`: Throw an exception if data already exists.
+        * `ignore`: Silently ignore this operation if data already exists.
+
+        :param name: the table name
+        :param format: the format used to save
+        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        :param options: all other string options
+        """
+        jwrite = self._jwrite.mode(mode)
+        if format is not None:
+            jwrite = jwrite.format(format)
+        for k in options:
+            jwrite = jwrite.option(k, options[k])
+        return jwrite.saveAsTable(name)
+
+    def json(self, path, mode="error"):
+        """
+        Saves the content of the :class:`DataFrame` in JSON format at the
+        specified path.
+
+        Additionally, mode is used to specify the behavior of the save operation when
+        data already exists in the data source. There are four modes:
+
+        * `append`: Append contents of this :class:`DataFrame` to existing data.
+        * `overwrite`: Overwrite existing data.
+        * `error`: Throw an exception if data already exists.
+        * `ignore`: Silently ignore this operation if data already exists.
+
+        :param path: the path in any Hadoop supported file system
+        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        """
+        return self._jwrite.mode(mode).json(path)
+
+    def parquet(self, path, mode="error"):
+        """
+        Saves the content of the :class:`DataFrame` in Parquet format at the
+        specified path.
+
+        Additionally, mode is used to specify the behavior of the save operation when
+        data already exists in the data source. There are four modes:
+
+        * `append`: Append contents of this :class:`DataFrame` to existing data.
+        * `overwrite`: Overwrite existing data.
+        * `error`: Throw an exception if data already exists.
+        * `ignore`: Silently ignore this operation if data already exists.
+
+        :param path: the path in any Hadoop supported file system
+        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        """
+        return self._jwrite.mode(mode).parquet(path)
+
+    def jdbc(self, url, table, mode="error", properties={}):
+        """
+        Saves the content of the :class:`DataFrame` to a external database table
+        via JDBC.
+
+        In the case the table already exists in the external database,
+        behavior of this function depends on the save mode, specified by the `mode`
+        function (default to throwing an exception). There are four modes:
+
+        * `append`: Append contents of this :class:`DataFrame` to existing data.
+        * `overwrite`: Overwrite existing data.
+        * `error`: Throw an exception if data already exists.
+        * `ignore`: Silently ignore this operation if data already exists.
+
+        :param url: a JDBC URL of the form `jdbc:subprotocol:subname`
+        :param table: Name of the table in the external database.
+        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        :param properties: JDBC database connection arguments, a list of
+                                    arbitrary string tag/value. Normally at least a
+                                    "user" and "password" property should be included.
+        """
+        jprop = JavaClass("java.util.Properties", self._sqlContext._sc._gateway._gateway_client)()
+        for k in properties:
+            jprop.setProperty(k, properties[k])
+        self._jwrite.mode(mode).jdbc(url, table, jprop)
+
+
+def _test():
+    import doctest
+    from pyspark.context import SparkContext
+    from pyspark.sql import Row, SQLContext
+    import pyspark.sql.readwriter
+    globs = pyspark.sql.readwriter.__dict__.copy()
+    sc = SparkContext('local[4]', 'PythonTest')
+    globs['sc'] = sc
+    globs['sqlContext'] = SQLContext(sc)
+    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
+        .toDF(StructType([StructField('age', IntegerType()),
+                          StructField('name', StringType())]))
+    jsonStrings = [
+        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
+        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
+        '"field6":[{"field7": "row2"}]}',
+        '{"field1" : null, "field2": "row3", '
+        '"field3":{"field4":33, "field5": []}}'
+    ]
+    globs['jsonStrings'] = jsonStrings
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.sql.readwriter, globs=globs,
+        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
+    globs['sc'].stop()
+    if failure_count:
+        exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 84ae36f2fd026..7e349962416c9 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -485,29 +485,29 @@ def test_save_and_load(self):
         df = self.df
         tmpPath = tempfile.mkdtemp()
         shutil.rmtree(tmpPath)
-        df.save(tmpPath, "org.apache.spark.sql.json", "error")
-        actual = self.sqlCtx.load(tmpPath, "org.apache.spark.sql.json")
-        self.assertTrue(sorted(df.collect()) == sorted(actual.collect()))
+        df.write.json(tmpPath)
+        actual = self.sqlCtx.read.json(tmpPath)
+        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
 
         schema = StructType([StructField("value", StringType(), True)])
-        actual = self.sqlCtx.load(tmpPath, "org.apache.spark.sql.json", schema)
-        self.assertTrue(sorted(df.select("value").collect()) == sorted(actual.collect()))
+        actual = self.sqlCtx.read.json(tmpPath, schema)
+        self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect()))
 
-        df.save(tmpPath, "org.apache.spark.sql.json", "overwrite")
-        actual = self.sqlCtx.load(tmpPath, "org.apache.spark.sql.json")
-        self.assertTrue(sorted(df.collect()) == sorted(actual.collect()))
+        df.write.json(tmpPath, "overwrite")
+        actual = self.sqlCtx.read.json(tmpPath)
+        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
 
-        df.save(source="org.apache.spark.sql.json", mode="overwrite", path=tmpPath,
-                noUse="this options will not be used in save.")
-        actual = self.sqlCtx.load(source="org.apache.spark.sql.json", path=tmpPath,
-                                  noUse="this options will not be used in load.")
-        self.assertTrue(sorted(df.collect()) == sorted(actual.collect()))
+        df.write.save(format="json", mode="overwrite", path=tmpPath,
+                      noUse="this options will not be used in save.")
+        actual = self.sqlCtx.read.load(format="json", path=tmpPath,
+                                       noUse="this options will not be used in load.")
+        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
 
         defaultDataSourceName = self.sqlCtx.getConf("spark.sql.sources.default",
                                                     "org.apache.spark.sql.parquet")
         self.sqlCtx.sql("SET spark.sql.sources.default=org.apache.spark.sql.json")
         actual = self.sqlCtx.load(path=tmpPath)
-        self.assertTrue(sorted(df.collect()) == sorted(actual.collect()))
+        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
         self.sqlCtx.sql("SET spark.sql.sources.default=" + defaultDataSourceName)
 
         shutil.rmtree(tmpPath)
@@ -767,51 +767,44 @@ def test_save_and_load_table(self):
         df = self.df
         tmpPath = tempfile.mkdtemp()
         shutil.rmtree(tmpPath)
-        df.saveAsTable("savedJsonTable", "org.apache.spark.sql.json", "append", path=tmpPath)
-        actual = self.sqlCtx.createExternalTable("externalJsonTable", tmpPath,
-                                                 "org.apache.spark.sql.json")
-        self.assertTrue(
-            sorted(df.collect()) ==
-            sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect()))
-        self.assertTrue(
-            sorted(df.collect()) ==
-            sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect()))
-        self.assertTrue(sorted(df.collect()) == sorted(actual.collect()))
+        df.write.saveAsTable("savedJsonTable", "json", "append", path=tmpPath)
+        actual = self.sqlCtx.createExternalTable("externalJsonTable", tmpPath, "json")
+        self.assertEqual(sorted(df.collect()),
+                         sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect()))
+        self.assertEqual(sorted(df.collect()),
+                         sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect()))
+        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
         self.sqlCtx.sql("DROP TABLE externalJsonTable")
 
-        df.saveAsTable("savedJsonTable", "org.apache.spark.sql.json", "overwrite", path=tmpPath)
+        df.write.saveAsTable("savedJsonTable", "json", "overwrite", path=tmpPath)
         schema = StructType([StructField("value", StringType(), True)])
-        actual = self.sqlCtx.createExternalTable("externalJsonTable",
-                                                 source="org.apache.spark.sql.json",
+        actual = self.sqlCtx.createExternalTable("externalJsonTable", source="json",
                                                  schema=schema, path=tmpPath,
                                                  noUse="this options will not be used")
-        self.assertTrue(
-            sorted(df.collect()) ==
-            sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect()))
-        self.assertTrue(
-            sorted(df.select("value").collect()) ==
-            sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect()))
-        self.assertTrue(sorted(df.select("value").collect()) == sorted(actual.collect()))
+        self.assertEqual(sorted(df.collect()),
+                         sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect()))
+        self.assertEqual(sorted(df.select("value").collect()),
+                         sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect()))
+        self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect()))
         self.sqlCtx.sql("DROP TABLE savedJsonTable")
         self.sqlCtx.sql("DROP TABLE externalJsonTable")
 
         defaultDataSourceName = self.sqlCtx.getConf("spark.sql.sources.default",
                                                     "org.apache.spark.sql.parquet")
         self.sqlCtx.sql("SET spark.sql.sources.default=org.apache.spark.sql.json")
-        df.saveAsTable("savedJsonTable", path=tmpPath, mode="overwrite")
+        df.write.saveAsTable("savedJsonTable", path=tmpPath, mode="overwrite")
         actual = self.sqlCtx.createExternalTable("externalJsonTable", path=tmpPath)
-        self.assertTrue(
-            sorted(df.collect()) ==
-            sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect()))
-        self.assertTrue(
-            sorted(df.collect()) ==
-            sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect()))
-        self.assertTrue(sorted(df.collect()) == sorted(actual.collect()))
+        self.assertEqual(sorted(df.collect()),
+                         sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect()))
+        self.assertEqual(sorted(df.collect()),
+                         sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect()))
+        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
         self.sqlCtx.sql("DROP TABLE savedJsonTable")
         self.sqlCtx.sql("DROP TABLE externalJsonTable")
         self.sqlCtx.sql("SET spark.sql.sources.default=" + defaultDataSourceName)
 
         shutil.rmtree(tmpPath)
 
+
 if __name__ == "__main__":
     unittest.main()

From bcb1ff81468eb4afc7c03b2bca18e99cc1ccf6b8 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Tue, 19 May 2015 15:20:46 -0700
Subject: [PATCH 079/525] [SPARK-7662] [SQL] Resolve correct names for
 generator in projection

```
select explode(map(value, key)) from src;
```
Throws exception
```
org.apache.spark.sql.AnalysisException: The number of aliases supplied in the AS clause does not match the number of columns output by the UDTF expected 2 aliases but got _c0 ;
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:38)
at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:43)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveGenerate$$makeGeneratorOutput(Analyzer.scala:605)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate$$anonfun$apply$16$$anonfun$22.apply(Analyzer.scala:562)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate$$anonfun$apply$16$$anonfun$22.apply(Analyzer.scala:548)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:251)
at scala.collection.AbstractTraversable.flatMap(Traversable.scala:105)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate$$anonfun$apply$16.applyOrElse(Analyzer.scala:548)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate$$anonfun$apply$16.applyOrElse(Analyzer.scala:538)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222)
```

Author: Cheng Hao <hao.cheng@intel.com>

Closes #6178 from chenghao-intel/explode and squashes the following commits:

916fbe9 [Cheng Hao] add more strict rules for TGF alias
5c3f2c5 [Cheng Hao] fix bug in unit test
e1d93ab [Cheng Hao] Add more unit test
19db09e [Cheng Hao] resolve names for generator in projection
---
 .../sql/catalyst/analysis/Analyzer.scala      | 15 +++++++++++
 .../sql/hive/execution/HiveQuerySuite.scala   |  6 ++---
 .../sql/hive/execution/SQLQuerySuite.scala    | 25 ++++++++++++++++++-
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index dfa4215f2efe5..c239e83271615 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -561,6 +561,21 @@ class Analyzer(
     /** Extracts a [[Generator]] expression and any names assigned by aliases to their output. */
     private object AliasedGenerator {
       def unapply(e: Expression): Option[(Generator, Seq[String])] = e match {
+        case Alias(g: Generator, name)
+          if g.elementTypes.size > 1 && java.util.regex.Pattern.matches("_c[0-9]+", name) => {
+          // Assume the default name given by parser is "_c[0-9]+",
+          // TODO in long term, move the naming logic from Parser to Analyzer.
+          // In projection, Parser gave default name for TGF as does for normal UDF,
+          // but the TGF probably have multiple output columns/names.
+          //    e.g. SELECT explode(map(key, value)) FROM src;
+          // Let's simply ignore the default given name for this case.
+          Some((g, Nil))
+        }
+        case Alias(g: Generator, name) if g.elementTypes.size > 1 =>
+          // If not given the default names, and the TGF with multiple output columns
+          failAnalysis(
+            s"""Expect multiple names given for ${g.getClass.getName},
+               |but only single name '${name}' specified""".stripMargin)
         case Alias(g: Generator, name) => Some((g, name :: Nil))
         case MultiAlias(g: Generator, names) => Some(g, names)
         case _ => None
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 089a57e25c08d..e7aec0b188c66 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -111,13 +111,13 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
       |  SELECT key FROM gen_tmp ORDER BY key ASC;
     """.stripMargin)
 
-  test("multiple generator in projection") {
+  test("multiple generators in projection") {
     intercept[AnalysisException] {
-      sql("SELECT explode(map(key, value)), key FROM src").collect()
+      sql("SELECT explode(array(key, key)), explode(array(key, key)) FROM src").collect()
     }
 
     intercept[AnalysisException] {
-      sql("SELECT explode(map(key, value)) as k1, k2, key FROM src").collect()
+      sql("SELECT explode(array(key, key)) as k1, explode(array(key, key)) FROM src").collect()
     }
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index e60d00e63574d..fbbf6ba5947dc 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -548,13 +548,36 @@ class SQLQuerySuite extends QueryTest {
     dropTempTable("data")
   }
 
-  test("resolve udtf with single alias") {
+  test("resolve udtf in projection #1") {
     val rdd = sparkContext.makeRDD((1 to 5).map(i => s"""{"a":[$i, ${i + 1}]}"""))
     read.json(rdd).registerTempTable("data")
     val df = sql("SELECT explode(a) AS val FROM data")
     val col = df("val")
   }
 
+  test("resolve udtf in projection #2") {
+    val rdd = sparkContext.makeRDD((1 to 2).map(i => s"""{"a":[$i, ${i + 1}]}"""))
+    jsonRDD(rdd).registerTempTable("data")
+    checkAnswer(sql("SELECT explode(map(1, 1)) FROM data LIMIT 1"), Row(1, 1) :: Nil)
+    checkAnswer(sql("SELECT explode(map(1, 1)) as (k1, k2) FROM data LIMIT 1"), Row(1, 1) :: Nil)
+    intercept[AnalysisException] {
+      sql("SELECT explode(map(1, 1)) as k1 FROM data LIMIT 1")
+    }
+
+    intercept[AnalysisException] {
+      sql("SELECT explode(map(1, 1)) as (k1, k2, k3) FROM data LIMIT 1")
+    }
+  }
+
+  // TGF with non-TGF in project is allowed in Spark SQL, but not in Hive
+  test("TGF with non-TGF in projection") {
+    val rdd = sparkContext.makeRDD( """{"a": "1", "b":"1"}""" :: Nil)
+    jsonRDD(rdd).registerTempTable("data")
+    checkAnswer(
+      sql("SELECT explode(map(a, b)) as (k1, k2), a, b FROM data"),
+      Row("1", "1", "1", "1") :: Nil)
+  }
+
   test("logical.Project should not be resolved if it contains aggregates or generators") {
     // This test is used to test the fix of SPARK-5875.
     // The original issue was that Project's resolved will be true when it contains

From 2bc5e0616d878b09daa8e31a7a1fdb7127bca079 Mon Sep 17 00:00:00 2001
From: alyaxey <oleksii.sliusarenko@grammarly.com>
Date: Tue, 19 May 2015 16:45:52 -0700
Subject: [PATCH 080/525] [SPARK-6246] [EC2] fixed support for more than 100
 nodes

This is a small fix. But it is important for amazon users because as the ticket states, "spark-ec2 can't handle clusters with > 100 nodes" now.

Author: alyaxey <oleksii.sliusarenko@grammarly.com>

Closes #6267 from alyaxey/ec2_100_nodes_fix and squashes the following commits:

1e0d747 [alyaxey] [SPARK-6246] fixed support for more than 100 nodes
---
 ec2/spark_ec2.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index be92d5f45aa77..c6d5a1f0d0a81 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -864,7 +864,11 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state):
         for i in cluster_instances:
             i.update()
 
-        statuses = conn.get_all_instance_status(instance_ids=[i.id for i in cluster_instances])
+        max_batch = 100
+        statuses = []
+        for j in xrange(0, len(cluster_instances), max_batch):
+            batch = [i.id for i in cluster_instances[j:j + max_batch]]
+            statuses.extend(conn.get_all_instance_status(instance_ids=batch))
 
         if cluster_state == 'ssh-ready':
             if all(i.state == 'running' for i in cluster_instances) and \

From 3860520633770cc5719b2cdebe6dc3608798386d Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Tue, 19 May 2015 17:18:08 -0700
Subject: [PATCH 081/525] [SPARK-7744] [DOCS] [MLLIB] Distributed matrix"
 section in MLlib "Data Types" documentation should be reordered.

The documentation for BlockMatrix should come after RowMatrix, IndexedRowMatrix, and CoordinateMatrix, as BlockMatrix references the later three types, and RowMatrix is considered the "basic" distributed matrix.  This will improve comprehensibility of the "Distributed matrix" section, especially for the new reader.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6270 from dusenberrymw/Reorder_MLlib_Data_Types_Distributed_matrix_docs and squashes the following commits:

6313bab [Mike Dusenberry] The documentation for BlockMatrix should come after RowMatrix, IndexedRowMatrix, and CoordinateMatrix, as BlockMatrix references the later three types, and RowMatrix is considered the "basic" distributed matrix.  This will improve comprehensibility of the "Distributed matrix" section, especially for the new reader.
---
 docs/mllib-data-types.md | 128 +++++++++++++++++++--------------------
 1 file changed, 64 insertions(+), 64 deletions(-)

diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md
index acec0426dc69b..d824dab1d7f7b 100644
--- a/docs/mllib-data-types.md
+++ b/docs/mllib-data-types.md
@@ -296,70 +296,6 @@ backed by an RDD of its entries.
 The underlying RDDs of a distributed matrix must be deterministic, because we cache the matrix size.
 In general the use of non-deterministic RDDs can lead to errors.
 
-### BlockMatrix
-
-A `BlockMatrix` is a distributed matrix backed by an RDD of `MatrixBlock`s, where a `MatrixBlock` is
-a tuple of `((Int, Int), Matrix)`, where the `(Int, Int)` is the index of the block, and `Matrix` is
-the sub-matrix at the given index with size `rowsPerBlock` x `colsPerBlock`.
-`BlockMatrix` supports methods such as `add` and `multiply` with another `BlockMatrix`.
-`BlockMatrix` also has a helper function `validate` which can be used to check whether the
-`BlockMatrix` is set up properly.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-A [`BlockMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) can be
-most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`.
-`toBlockMatrix` creates blocks of size 1024 x 1024 by default.
-Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`.
-
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
-
-val entries: RDD[MatrixEntry] = ... // an RDD of (i, j, v) matrix entries
-// Create a CoordinateMatrix from an RDD[MatrixEntry].
-val coordMat: CoordinateMatrix = new CoordinateMatrix(entries)
-// Transform the CoordinateMatrix to a BlockMatrix
-val matA: BlockMatrix = coordMat.toBlockMatrix().cache()
-
-// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
-// Nothing happens if it is valid.
-matA.validate()
-
-// Calculate A^T A.
-val ata = matA.transpose.multiply(matA)
-{% endhighlight %}
-</div>
-
-<div data-lang="java" markdown="1">
-
-A [`BlockMatrix`](api/java/org/apache/spark/mllib/linalg/distributed/BlockMatrix.html) can be
-most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`.
-`toBlockMatrix` creates blocks of size 1024 x 1024 by default.
-Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`.
-
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.distributed.BlockMatrix;
-import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
-import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
-
-JavaRDD<MatrixEntry> entries = ... // a JavaRDD of (i, j, v) Matrix Entries
-// Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
-CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd());
-// Transform the CoordinateMatrix to a BlockMatrix
-BlockMatrix matA = coordMat.toBlockMatrix().cache();
-
-// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
-// Nothing happens if it is valid.
-matA.validate();
-
-// Calculate A^T A.
-BlockMatrix ata = matA.transpose().multiply(matA);
-{% endhighlight %}
-</div>
-</div>
-
 ### RowMatrix
 
 A `RowMatrix` is a row-oriented distributed matrix without meaningful row indices, backed by an RDD
@@ -530,3 +466,67 @@ IndexedRowMatrix indexedRowMatrix = mat.toIndexedRowMatrix();
 {% endhighlight %}
 </div>
 </div>
+
+### BlockMatrix
+
+A `BlockMatrix` is a distributed matrix backed by an RDD of `MatrixBlock`s, where a `MatrixBlock` is
+a tuple of `((Int, Int), Matrix)`, where the `(Int, Int)` is the index of the block, and `Matrix` is
+the sub-matrix at the given index with size `rowsPerBlock` x `colsPerBlock`.
+`BlockMatrix` supports methods such as `add` and `multiply` with another `BlockMatrix`.
+`BlockMatrix` also has a helper function `validate` which can be used to check whether the
+`BlockMatrix` is set up properly.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+A [`BlockMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) can be
+most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`.
+`toBlockMatrix` creates blocks of size 1024 x 1024 by default.
+Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`.
+
+{% highlight scala %}
+import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
+
+val entries: RDD[MatrixEntry] = ... // an RDD of (i, j, v) matrix entries
+// Create a CoordinateMatrix from an RDD[MatrixEntry].
+val coordMat: CoordinateMatrix = new CoordinateMatrix(entries)
+// Transform the CoordinateMatrix to a BlockMatrix
+val matA: BlockMatrix = coordMat.toBlockMatrix().cache()
+
+// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
+// Nothing happens if it is valid.
+matA.validate()
+
+// Calculate A^T A.
+val ata = matA.transpose.multiply(matA)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+A [`BlockMatrix`](api/java/org/apache/spark/mllib/linalg/distributed/BlockMatrix.html) can be
+most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`.
+`toBlockMatrix` creates blocks of size 1024 x 1024 by default.
+Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`.
+
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.distributed.BlockMatrix;
+import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
+import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
+
+JavaRDD<MatrixEntry> entries = ... // a JavaRDD of (i, j, v) Matrix Entries
+// Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
+CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd());
+// Transform the CoordinateMatrix to a BlockMatrix
+BlockMatrix matA = coordMat.toBlockMatrix().cache();
+
+// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
+// Nothing happens if it is valid.
+matA.validate();
+
+// Calculate A^T A.
+BlockMatrix ata = matA.transpose().multiply(matA);
+{% endhighlight %}
+</div>
+</div>

From 60336e3bc02a2587fdf315f9011bbe7c9d3a58c4 Mon Sep 17 00:00:00 2001
From: scwf <wangfei1@huawei.com>
Date: Tue, 19 May 2015 17:36:00 -0700
Subject: [PATCH 082/525] [SPARK-7656] [SQL] use CatalystConf in
 FunctionRegistry

follow up for #5806

Author: scwf <wangfei1@huawei.com>

Closes #6164 from scwf/FunctionRegistry and squashes the following commits:

15e6697 [scwf] use catalogconf in FunctionRegistry
---
 .../sql/catalyst/analysis/FunctionRegistry.scala     | 12 +++++++-----
 .../main/scala/org/apache/spark/sql/SQLContext.scala |  2 +-
 .../org/apache/spark/sql/hive/HiveContext.scala      |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 16ca5bcd57a72..0849faa9bfa7b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import org.apache.spark.sql.catalyst.CatalystConf
 import org.apache.spark.sql.catalyst.expressions.Expression
 import scala.collection.mutable
 
@@ -28,12 +29,12 @@ trait FunctionRegistry {
 
   def lookupFunction(name: String, children: Seq[Expression]): Expression
 
-  def caseSensitive: Boolean
+  def conf: CatalystConf
 }
 
 trait OverrideFunctionRegistry extends FunctionRegistry {
 
-  val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive)
+  val functionBuilders = StringKeyHashMap[FunctionBuilder](conf.caseSensitiveAnalysis)
 
   override def registerFunction(name: String, builder: FunctionBuilder): Unit = {
     functionBuilders.put(name, builder)
@@ -44,8 +45,9 @@ trait OverrideFunctionRegistry extends FunctionRegistry {
   }
 }
 
-class SimpleFunctionRegistry(val caseSensitive: Boolean) extends FunctionRegistry {
-  val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive)
+class SimpleFunctionRegistry(val conf: CatalystConf) extends FunctionRegistry {
+
+  val functionBuilders = StringKeyHashMap[FunctionBuilder](conf.caseSensitiveAnalysis)
 
   override def registerFunction(name: String, builder: FunctionBuilder): Unit = {
     functionBuilders.put(name, builder)
@@ -69,7 +71,7 @@ object EmptyFunctionRegistry extends FunctionRegistry {
     throw new UnsupportedOperationException
   }
 
-  override def caseSensitive: Boolean = throw new UnsupportedOperationException
+  override def conf: CatalystConf = throw new UnsupportedOperationException
 }
 
 /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 316ef7d58809d..304e958192bb9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -121,7 +121,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   // TODO how to handle the temp function per user session?
   @transient
-  protected[sql] lazy val functionRegistry: FunctionRegistry = new SimpleFunctionRegistry(true)
+  protected[sql] lazy val functionRegistry: FunctionRegistry = new SimpleFunctionRegistry(conf)
 
   @transient
   protected[sql] lazy val analyzer: Analyzer =
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 2733ebdb95bca..863a5db1bf98c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -357,7 +357,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
   @transient
   override protected[sql] lazy val functionRegistry =
     new HiveFunctionRegistry with OverrideFunctionRegistry {
-      def caseSensitive: Boolean = false
+      override def conf: CatalystConf = currentSession().conf
     }
 
   /* An analyzer that uses the Hive metastore. */

From b3abf0b8d9bca13840eb759953d76905c2ba9b8a Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Wed, 20 May 2015 10:41:18 +0100
Subject: [PATCH 083/525] [SPARK-7663] [MLLIB] Add requirement for word2vec
 model

JIRA issue [link](https://issues.apache.org/jira/browse/SPARK-7663).

We should check the model size of word2vec, to prevent the unexpected empty.

CC srowen.

Author: Xusen Yin <yinxusen@gmail.com>

Closes #6228 from yinxusen/SPARK-7663 and squashes the following commits:

21770c5 [Xusen Yin] check the vocab size
54ae63e [Xusen Yin] add requirement for word2vec model
---
 .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala   | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 731f7576c2335..f65f78299d182 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -158,6 +158,9 @@ class Word2Vec extends Serializable with Logging {
       .sortWith((a, b) => a.cn > b.cn)
     
     vocabSize = vocab.length
+    require(vocabSize > 0, "The vocabulary size should be > 0. You may need to check " +
+      "the setting of minCount, which could be large enough to remove all your words in sentences.")
+
     var a = 0
     while (a < vocabSize) {
       vocabHash += vocab(a).word -> a

From 09265ad7c85c6de6b568ec329daad632d4a79fa3 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Wed, 20 May 2015 19:09:47 +0800
Subject: [PATCH 084/525] [SPARK-7320] [SQL] Add Cube / Rollup for dataframe

Add `cube` & `rollup` for DataFrame
For example:
```scala
testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b"))
testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b"))
```

Author: Cheng Hao <hao.cheng@intel.com>

Closes #6257 from chenghao-intel/rollup and squashes the following commits:

7302319 [Cheng Hao] cancel the implicit keyword
a66e38f [Cheng Hao] remove the unnecessary code changes
a2869d4 [Cheng Hao] update the code as comments
c441777 [Cheng Hao] update the code as suggested
84c9564 [Cheng Hao] Remove the CubedData & RollupedData
279584c [Cheng Hao] hiden the CubedData & RollupedData
ef357e1 [Cheng Hao] Add Cube / Rollup for dataframe
---
 .../org/apache/spark/sql/DataFrame.scala      | 104 +++++++++++++++++-
 .../org/apache/spark/sql/GroupedData.scala    |  92 +++++++++++-----
 .../hive/HiveDataFrameAnalyticsSuite.scala    |  62 +++++++++++
 3 files changed, 230 insertions(+), 28 deletions(-)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index adad85806d1ea..d78b4c2f8909c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -685,7 +685,53 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def groupBy(cols: Column*): GroupedData = new GroupedData(this, cols.map(_.expr))
+  def groupBy(cols: Column*): GroupedData = {
+    GroupedData(this, cols.map(_.expr), GroupedData.GroupByType)
+  }
+
+  /**
+   * Create a multi-dimensional rollup for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
+   * See [[GroupedData]] for all the available aggregate functions.
+   *
+   * {{{
+   *   // Compute the average for all numeric columns rolluped by department and group.
+   *   df.rollup($"department", $"group").avg()
+   *
+   *   // Compute the max age and average salary, rolluped by department and gender.
+   *   df.rollup($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def rollup(cols: Column*): GroupedData = {
+    GroupedData(this, cols.map(_.expr), GroupedData.RollupType)
+  }
+
+  /**
+   * Create a multi-dimensional cube for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
+   * See [[GroupedData]] for all the available aggregate functions.
+   *
+   * {{{
+   *   // Compute the average for all numeric columns cubed by department and group.
+   *   df.cube($"department", $"group").avg()
+   *
+   *   // Compute the max age and average salary, cubed by department and gender.
+   *   df.cube($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def cube(cols: Column*): GroupedData = GroupedData(this, cols.map(_.expr), GroupedData.CubeType)
 
   /**
    * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them.
@@ -710,7 +756,61 @@ class DataFrame private[sql](
   @scala.annotation.varargs
   def groupBy(col1: String, cols: String*): GroupedData = {
     val colNames: Seq[String] = col1 +: cols
-    new GroupedData(this, colNames.map(colName => resolve(colName)))
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.GroupByType)
+  }
+
+  /**
+   * Create a multi-dimensional rollup for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
+   * See [[GroupedData]] for all the available aggregate functions.
+   *
+   * This is a variant of rollup that can only group by existing columns using column names
+   * (i.e. cannot construct expressions).
+   *
+   * {{{
+   *   // Compute the average for all numeric columns rolluped by department and group.
+   *   df.rollup("department", "group").avg()
+   *
+   *   // Compute the max age and average salary, rolluped by department and gender.
+   *   df.rollup($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def rollup(col1: String, cols: String*): GroupedData = {
+    val colNames: Seq[String] = col1 +: cols
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.RollupType)
+  }
+
+  /**
+   * Create a multi-dimensional cube for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
+   * See [[GroupedData]] for all the available aggregate functions.
+   *
+   * This is a variant of cube that can only group by existing columns using column names
+   * (i.e. cannot construct expressions).
+   *
+   * {{{
+   *   // Compute the average for all numeric columns cubed by department and group.
+   *   df.cube("department", "group").avg()
+   *
+   *   // Compute the max age and average salary, cubed by department and gender.
+   *   df.cube($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def cube(col1: String, cols: String*): GroupedData = {
+    val colNames: Seq[String] = col1 +: cols
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.CubeType)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index 1381b9f1a6080..f730e4ae00e2b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -23,9 +23,40 @@ import scala.language.implicitConversions
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.Star
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.Aggregate
+import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate}
 import org.apache.spark.sql.types.NumericType
 
+/**
+ * Companion object for GroupedData
+ */
+private[sql] object GroupedData {
+  def apply(
+      df: DataFrame,
+      groupingExprs: Seq[Expression],
+      groupType: GroupType): GroupedData = {
+    new GroupedData(df, groupingExprs, groupType: GroupType)
+  }
+
+  /**
+   * The Grouping Type
+   */
+  trait GroupType
+
+  /**
+   * To indicate it's the GroupBy
+   */
+  object GroupByType extends GroupType
+
+  /**
+   * To indicate it's the CUBE
+   */
+  object CubeType extends GroupType
+
+  /**
+   * To indicate it's the ROLLUP
+   */
+  object RollupType extends GroupType
+}
 
 /**
  * :: Experimental ::
@@ -34,19 +65,37 @@ import org.apache.spark.sql.types.NumericType
  * @since 1.3.0
  */
 @Experimental
-class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression]) {
+class GroupedData protected[sql](
+    df: DataFrame,
+    groupingExprs: Seq[Expression],
+    private val groupType: GroupedData.GroupType) {
 
-  private[sql] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
-    val namedGroupingExprs = groupingExprs.map {
-      case expr: NamedExpression => expr
-      case expr: Expression => Alias(expr, expr.prettyString)()
+  private[this] def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
+    val aggregates = if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
+        val retainedExprs = groupingExprs.map {
+          case expr: NamedExpression => expr
+          case expr: Expression => Alias(expr, expr.prettyString)()
+        }
+        retainedExprs ++ aggExprs
+      } else {
+        aggExprs
+      }
+
+    groupType match {
+      case GroupedData.GroupByType =>
+        DataFrame(
+          df.sqlContext, Aggregate(groupingExprs, aggregates, df.logicalPlan))
+      case GroupedData.RollupType =>
+        DataFrame(
+          df.sqlContext, Rollup(groupingExprs, df.logicalPlan, aggregates))
+      case GroupedData.CubeType =>
+        DataFrame(
+          df.sqlContext, Cube(groupingExprs, df.logicalPlan, aggregates))
     }
-    DataFrame(
-      df.sqlContext, Aggregate(groupingExprs, namedGroupingExprs ++ aggExprs, df.logicalPlan))
   }
 
   private[this] def aggregateNumericColumns(colNames: String*)(f: Expression => Expression)
-    : Seq[NamedExpression] = {
+    : DataFrame = {
 
     val columnExprs = if (colNames.isEmpty) {
       // No columns specified. Use all numeric columns.
@@ -63,10 +112,10 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
         namedExpr
       }
     }
-    columnExprs.map { c =>
+    toDF(columnExprs.map { c =>
       val a = f(c)
       Alias(a, a.prettyString)()
-    }
+    })
   }
 
   private[this] def strToExpr(expr: String): (Expression => Expression) = {
@@ -119,10 +168,10 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
    * @since 1.3.0
    */
   def agg(exprs: Map[String, String]): DataFrame = {
-    exprs.map { case (colName, expr) =>
+    toDF(exprs.map { case (colName, expr) =>
       val a = strToExpr(expr)(df(colName).expr)
       Alias(a, a.prettyString)()
-    }.toSeq
+    }.toSeq)
   }
 
   /**
@@ -175,19 +224,10 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
    */
   @scala.annotation.varargs
   def agg(expr: Column, exprs: Column*): DataFrame = {
-    val aggExprs = (expr +: exprs).map(_.expr).map {
+    toDF((expr +: exprs).map(_.expr).map {
       case expr: NamedExpression => expr
       case expr: Expression => Alias(expr, expr.prettyString)()
-    }
-    if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
-      val retainedExprs = groupingExprs.map {
-        case expr: NamedExpression => expr
-        case expr: Expression => Alias(expr, expr.prettyString)()
-      }
-      DataFrame(df.sqlContext, Aggregate(groupingExprs, retainedExprs ++ aggExprs, df.logicalPlan))
-    } else {
-      DataFrame(df.sqlContext, Aggregate(groupingExprs, aggExprs, df.logicalPlan))
-    }
+    })
   }
 
   /**
@@ -196,7 +236,7 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
    *
    * @since 1.3.0
    */
-  def count(): DataFrame = Seq(Alias(Count(Literal(1)), "count")())
+  def count(): DataFrame = toDF(Seq(Alias(Count(Literal(1)), "count")()))
 
   /**
    * Compute the average value for each numeric columns for each group. This is an alias for `avg`.
@@ -256,5 +296,5 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
   @scala.annotation.varargs
   def sum(colNames: String*): DataFrame = {
     aggregateNumericColumns(colNames:_*)(Sum)
-  }    
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
new file mode 100644
index 0000000000000..3ad05f482504c
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.hive.test.TestHive._
+import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+case class TestData2Int(a: Int, b: Int)
+
+// TODO ideally we should put the test suite into the package `sql`, as
+// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
+// support the `cube` or `rollup` yet.
+class HiveDataFrameAnalyticsSuite extends QueryTest {
+  val testData =
+    TestHive.sparkContext.parallelize(
+      TestData2Int(1, 2) ::
+        TestData2Int(2, 4) :: Nil).toDF()
+
+  testData.registerTempTable("mytable")
+
+  test("rollup") {
+    checkAnswer(
+      testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
+      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with rollup").collect()
+    )
+
+    checkAnswer(
+      testData.rollup("a", "b").agg(sum("b")),
+      sql("select a, b, sum(b) from mytable group by a, b with rollup").collect()
+    )
+  }
+
+  test("cube") {
+    checkAnswer(
+      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
+      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
+    )
+
+    checkAnswer(
+      testData.cube("a", "b").agg(sum("b")),
+      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
+    )
+  }
+}

From 3ddf051ee7256f642f8a17768d161c7b5f55c7e1 Mon Sep 17 00:00:00 2001
From: ehnalis <zoltan.zvara@gmail.com>
Date: Wed, 20 May 2015 08:27:39 -0500
Subject: [PATCH 085/525] [SPARK-7533] [YARN] Decrease spacing between AM-RM
 heartbeats.

Added faster RM-heartbeats on pending container allocations with multiplicative back-off.
Also updated related documentations.

Author: ehnalis <zoltan.zvara@gmail.com>

Closes #6082 from ehnalis/yarn and squashes the following commits:

a1d2101 [ehnalis] MIss-spell fixed.
90f8ba4 [ehnalis] Changed default HB values.
6120295 [ehnalis] Removed the bug, when allocation heartbeat would not start from initial value.
08bac63 [ehnalis] Refined style, grammar, removed duplicated code.
073d283 [ehnalis] [SPARK-7533] [YARN] Decrease spacing between AM-RM heartbeats.
d4408c9 [ehnalis] [SPARK-7533] [YARN] Decrease spacing between AM-RM heartbeats.
---
 docs/running-on-yarn.md                       | 15 +++++++-
 .../spark/deploy/yarn/ApplicationMaster.scala | 34 ++++++++++++++-----
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index 51c1339165024..9d55f435e80ad 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -71,9 +71,22 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
 </tr>
 <tr>
   <td><code>spark.yarn.scheduler.heartbeat.interval-ms</code></td>
-  <td>5000</td>
+  <td>3000</td>
   <td>
     The interval in ms in which the Spark application master heartbeats into the YARN ResourceManager.
+    The value is capped at half the value of YARN's configuration for the expiry interval
+    (<code>yarn.am.liveness-monitor.expiry-interval-ms</code>).
+  </td>
+</tr>
+<tr>
+  <td><code>spark.yarn.scheduler.initial-allocation.interval</code></td>
+  <td>200ms</td>
+  <td>
+    The initial interval in which the Spark application master eagerly heartbeats to the YARN ResourceManager
+    when there are pending container allocation requests. It should be no larger than
+    <code>spark.yarn.scheduler.heartbeat.interval-ms</code>. The allocation interval will doubled on
+    successive eager heartbeats if pending containers still exist, until
+    <code>spark.yarn.scheduler.heartbeat.interval-ms</code> is reached.
   </td>
 </tr>
 <tr>
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 29752969e6152..63a6f2e9472c1 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -300,11 +300,14 @@ private[spark] class ApplicationMaster(
     val expiryInterval = yarnConf.getInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 120000)
 
     // we want to be reasonably responsive without causing too many requests to RM.
-    val schedulerInterval =
-      sparkConf.getTimeAsMs("spark.yarn.scheduler.heartbeat.interval-ms", "5s")
+    val heartbeatInterval = math.max(0, math.min(expiryInterval / 2,
+      sparkConf.getTimeAsMs("spark.yarn.scheduler.heartbeat.interval-ms", "3s")))
 
-    // must be <= expiryInterval / 2.
-    val interval = math.max(0, math.min(expiryInterval / 2, schedulerInterval))
+    // we want to check more frequently for pending containers
+    val initialAllocationInterval = math.min(heartbeatInterval,
+      sparkConf.getTimeAsMs("spark.yarn.scheduler.initial-allocation.interval", "200ms"))
+
+    var nextAllocationInterval = initialAllocationInterval
 
     // The number of failures in a row until Reporter thread give up
     val reporterMaxFailures = sparkConf.getInt("spark.yarn.scheduler.reporterThread.maxFailures", 5)
@@ -330,15 +333,27 @@ private[spark] class ApplicationMaster(
               if (!NonFatal(e) || failureCount >= reporterMaxFailures) {
                 finish(FinalApplicationStatus.FAILED,
                   ApplicationMaster.EXIT_REPORTER_FAILURE, "Exception was thrown " +
-                    s"${failureCount} time(s) from Reporter thread.")
-
+                    s"$failureCount time(s) from Reporter thread.")
               } else {
-                logWarning(s"Reporter thread fails ${failureCount} time(s) in a row.", e)
+                logWarning(s"Reporter thread fails $failureCount time(s) in a row.", e)
               }
             }
           }
           try {
-            Thread.sleep(interval)
+            val numPendingAllocate = allocator.getNumPendingAllocate
+            val sleepInterval =
+              if (numPendingAllocate > 0) {
+                val currentAllocationInterval =
+                  math.min(heartbeatInterval, nextAllocationInterval)
+                nextAllocationInterval *= 2
+                currentAllocationInterval
+              } else {
+                nextAllocationInterval = initialAllocationInterval
+                heartbeatInterval
+              }
+            logDebug(s"Number of pending allocations is $numPendingAllocate. " +
+                     s"Sleeping for $sleepInterval.")
+            Thread.sleep(sleepInterval)
           } catch {
             case e: InterruptedException =>
           }
@@ -349,7 +364,8 @@ private[spark] class ApplicationMaster(
     t.setDaemon(true)
     t.setName("Reporter")
     t.start()
-    logInfo("Started progress reporter thread - sleep time : " + interval)
+    logInfo(s"Started progress reporter thread with (heartbeat : $heartbeatInterval, " +
+            s"initial allocation : $initialAllocationInterval) intervals")
     t
   }
 

From 589b12f8e62ec5d10713ce057756ebc791e7ddc6 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 20 May 2015 07:46:17 -0700
Subject: [PATCH 086/525] [SPARK-7654] [MLLIB] Migrate MLlib to the DataFrame
 reader/writer API

parquetFile -> read.parquet rxin

Author: Xiangrui Meng <meng@databricks.com>

Closes #6281 from mengxr/SPARK-7654 and squashes the following commits:

a79b612 [Xiangrui Meng] parquetFile -> read.parquet
---
 .../org/apache/spark/mllib/classification/NaiveBayes.scala    | 4 ++--
 .../mllib/classification/impl/GLMClassificationModel.scala    | 2 +-
 .../apache/spark/mllib/clustering/GaussianMixtureModel.scala  | 2 +-
 .../scala/org/apache/spark/mllib/clustering/KMeansModel.scala | 2 +-
 .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala  | 2 +-
 .../spark/mllib/recommendation/MatrixFactorizationModel.scala | 4 ++--
 .../apache/spark/mllib/regression/IsotonicRegression.scala    | 2 +-
 .../spark/mllib/regression/impl/GLMRegressionModel.scala      | 2 +-
 .../org/apache/spark/mllib/tree/model/DecisionTreeModel.scala | 2 +-
 .../apache/spark/mllib/tree/model/treeEnsembleModels.scala    | 2 +-
 10 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 53fb2cba03cbf..cffe9ef1e0b2a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -153,7 +153,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
     def load(sc: SparkContext, path: String): NaiveBayesModel = {
       val sqlContext = new SQLContext(sc)
       // Load Parquet data.
-      val dataRDD = sqlContext.parquetFile(dataPath(path))
+      val dataRDD = sqlContext.read.parquet(dataPath(path))
       // Check schema explicitly since erasure makes it hard to use match-case for checking.
       checkSchema[Data](dataRDD.schema)
       val dataArray = dataRDD.select("labels", "pi", "theta", "modelType").take(1)
@@ -199,7 +199,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
     def load(sc: SparkContext, path: String): NaiveBayesModel = {
       val sqlContext = new SQLContext(sc)
       // Load Parquet data.
-      val dataRDD = sqlContext.parquetFile(dataPath(path))
+      val dataRDD = sqlContext.read.parquet(dataPath(path))
       // Check schema explicitly since erasure makes it hard to use match-case for checking.
       checkSchema[Data](dataRDD.schema)
       val dataArray = dataRDD.select("labels", "pi", "theta").take(1)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala
index d842ec57b2f52..fe09f6b75d28b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala
@@ -75,7 +75,7 @@ private[classification] object GLMClassificationModel {
     def loadData(sc: SparkContext, path: String, modelClass: String): Data = {
       val datapath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
-      val dataRDD = sqlContext.parquetFile(datapath)
+      val dataRDD = sqlContext.read.parquet(datapath)
       val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1)
       assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath")
       val data = dataArray(0)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 731b43a1be574..86353aed81156 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -132,7 +132,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
     def load(sc: SparkContext, path: String): GaussianMixtureModel = {
       val dataPath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
-      val dataFrame = sqlContext.parquetFile(dataPath)
+      val dataFrame = sqlContext.read.parquet(dataPath)
       val dataArray = dataFrame.select("weight", "mu", "sigma").collect()
 
       // Check schema explicitly since erasure makes it hard to use match-case for checking.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index 252e166e85cef..8ecb3df11d95e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -120,7 +120,7 @@ object KMeansModel extends Loader[KMeansModel] {
       assert(className == thisClassName)
       assert(formatVersion == thisFormatVersion)
       val k = (metadata \ "k").extract[Int]
-      val centriods = sqlContext.parquetFile(Loader.dataPath(path))
+      val centriods = sqlContext.read.parquet(Loader.dataPath(path))
       Loader.checkSchema[Cluster](centriods.schema)
       val localCentriods = centriods.map(Cluster.apply).collect()
       assert(k == localCentriods.size)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index f65f78299d182..9106b73dfcd76 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -559,7 +559,7 @@ object Word2VecModel extends Loader[Word2VecModel] {
     def load(sc: SparkContext, path: String): Word2VecModel = {
       val dataPath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
-      val dataFrame = sqlContext.parquetFile(dataPath)
+      val dataFrame = sqlContext.read.parquet(dataPath)
 
       val dataArray = dataFrame.select("word", "vector").collect()
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
index b960fbc5bf5f5..93aa41e49961e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -292,11 +292,11 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] {
       assert(className == thisClassName)
       assert(formatVersion == thisFormatVersion)
       val rank = (metadata \ "rank").extract[Int]
-      val userFeatures = sqlContext.parquetFile(userPath(path))
+      val userFeatures = sqlContext.read.parquet(userPath(path))
         .map { case Row(id: Int, features: Seq[_]) =>
           (id, features.asInstanceOf[Seq[Double]].toArray)
         }
-      val productFeatures = sqlContext.parquetFile(productPath(path))
+      val productFeatures = sqlContext.read.parquet(productPath(path))
         .map { case Row(id: Int, features: Seq[_]) =>
         (id, features.asInstanceOf[Seq[Double]].toArray)
       }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 22b9b22a871f0..3ea63dd8c0acd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -189,7 +189,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
 
     def load(sc: SparkContext, path: String): (Array[Double], Array[Double]) = {
       val sqlContext = new SQLContext(sc)
-      val dataRDD = sqlContext.parquetFile(dataPath(path))
+      val dataRDD = sqlContext.read.parquet(dataPath(path))
 
       checkSchema[Data](dataRDD.schema)
       val dataArray = dataRDD.select("boundary", "prediction").collect()
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala
index 2aa0e9ef96d48..317d3a5702636 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala
@@ -72,7 +72,7 @@ private[regression] object GLMRegressionModel {
     def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = {
       val datapath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
-      val dataRDD = sqlContext.parquetFile(datapath)
+      val dataRDD = sqlContext.read.parquet(datapath)
       val dataArray = dataRDD.select("weights", "intercept").take(1)
       assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath")
       val data = dataArray(0)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
index a558f84c8d506..25bb1453db404 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
@@ -230,7 +230,7 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging {
       val datapath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
       // Load Parquet data.
-      val dataRDD = sqlContext.parquetFile(datapath)
+      val dataRDD = sqlContext.read.parquet(datapath)
       // Check schema explicitly since erasure makes it hard to use match-case for checking.
       Loader.checkSchema[NodeData](dataRDD.schema)
       val nodes = dataRDD.map(NodeData.apply)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
index f9cd0140fe63f..1e3333d8d81d0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
@@ -437,7 +437,7 @@ private[tree] object TreeEnsembleModel extends Logging {
         treeAlgo: String): Array[DecisionTreeModel] = {
       val datapath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
-      val nodes = sqlContext.parquetFile(datapath).map(NodeData.apply)
+      val nodes = sqlContext.read.parquet(datapath).map(NodeData.apply)
       val trees = constructTrees(nodes)
       trees.map(new DecisionTreeModel(_, Algo.fromString(treeAlgo)))
     }

From 98a46f9dffec294386f6c39acafa7f11adb87a8f Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 20 May 2015 07:55:51 -0700
Subject: [PATCH 087/525] [SPARK-6094] [MLLIB] Add MultilabelMetrics in
 PySpark/MLlib

Add MultilabelMetrics in PySpark/MLlib

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6276 from yanboliang/spark-6094 and squashes the following commits:

b8e3343 [Yanbo Liang] Add MultilabelMetrics in PySpark/MLlib
---
 .../mllib/evaluation/MultilabelMetrics.scala  |   8 ++
 python/pyspark/mllib/evaluation.py            | 117 ++++++++++++++++++
 2 files changed, 125 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
index a8378a76d20ae..bf6eb1d5bd2ab 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.evaluation
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext._
+import org.apache.spark.sql.DataFrame
 
 /**
  * Evaluator for multilabel classification.
@@ -27,6 +28,13 @@ import org.apache.spark.SparkContext._
  */
 class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]) {
 
+  /**
+   * An auxiliary constructor taking a DataFrame.
+   * @param predictionAndLabels a DataFrame with two double array columns: prediction and label
+   */
+  private[mllib] def this(predictionAndLabels: DataFrame) =
+    this(predictionAndLabels.map(r => (r.getSeq[Double](0).toArray, r.getSeq[Double](1).toArray)))
+
   private lazy val numDocs: Long = predictionAndLabels.count()
 
   private lazy val numLabels: Long = predictionAndLabels.flatMap { case (_, labels) =>
diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index a5e5ddc8fe506..aab5e5f4b77b5 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -343,6 +343,123 @@ def ndcgAt(self, k):
         return self.call("ndcgAt", int(k))
 
 
+class MultilabelMetrics(JavaModelWrapper):
+    """
+    Evaluator for multilabel classification.
+
+    >>> predictionAndLabels = sc.parallelize([([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]),
+    ...     ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]),
+    ...     ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])])
+    >>> metrics = MultilabelMetrics(predictionAndLabels)
+    >>> metrics.precision(0.0)
+    1.0
+    >>> metrics.recall(1.0)
+    0.66...
+    >>> metrics.f1Measure(2.0)
+    0.5
+    >>> metrics.precision()
+    0.66...
+    >>> metrics.recall()
+    0.64...
+    >>> metrics.f1Measure()
+    0.63...
+    >>> metrics.microPrecision
+    0.72...
+    >>> metrics.microRecall
+    0.66...
+    >>> metrics.microF1Measure
+    0.69...
+    >>> metrics.hammingLoss
+    0.33...
+    >>> metrics.subsetAccuracy
+    0.28...
+    >>> metrics.accuracy
+    0.54...
+    """
+
+    def __init__(self, predictionAndLabels):
+        sc = predictionAndLabels.ctx
+        sql_ctx = SQLContext(sc)
+        df = sql_ctx.createDataFrame(predictionAndLabels,
+                                     schema=sql_ctx._inferSchema(predictionAndLabels))
+        java_class = sc._jvm.org.apache.spark.mllib.evaluation.MultilabelMetrics
+        java_model = java_class(df._jdf)
+        super(MultilabelMetrics, self).__init__(java_model)
+
+    def precision(self, label=None):
+        """
+        Returns precision or precision for a given label (category) if specified.
+        """
+        if label is None:
+            return self.call("precision")
+        else:
+            return self.call("precision", float(label))
+
+    def recall(self, label=None):
+        """
+        Returns recall or recall for a given label (category) if specified.
+        """
+        if label is None:
+            return self.call("recall")
+        else:
+            return self.call("recall", float(label))
+
+    def f1Measure(self, label=None):
+        """
+        Returns f1Measure or f1Measure for a given label (category) if specified.
+        """
+        if label is None:
+            return self.call("f1Measure")
+        else:
+            return self.call("f1Measure", float(label))
+
+    @property
+    def microPrecision(self):
+        """
+        Returns micro-averaged label-based precision.
+        (equals to micro-averaged document-based precision)
+        """
+        return self.call("microPrecision")
+
+    @property
+    def microRecall(self):
+        """
+        Returns micro-averaged label-based recall.
+        (equals to micro-averaged document-based recall)
+        """
+        return self.call("microRecall")
+
+    @property
+    def microF1Measure(self):
+        """
+        Returns micro-averaged label-based f1-measure.
+        (equals to micro-averaged document-based f1-measure)
+        """
+        return self.call("microF1Measure")
+
+    @property
+    def hammingLoss(self):
+        """
+        Returns Hamming-loss.
+        """
+        return self.call("hammingLoss")
+
+    @property
+    def subsetAccuracy(self):
+        """
+        Returns subset accuracy.
+        (for equal sets of labels)
+        """
+        return self.call("subsetAccuracy")
+
+    @property
+    def accuracy(self):
+        """
+        Returns accuracy.
+        """
+        return self.call("accuracy")
+
+
 def _test():
     import doctest
     from pyspark import SparkContext

From b631bf73b9f288f37c98b806be430b22485880e5 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Wed, 20 May 2015 11:23:40 -0700
Subject: [PATCH 088/525] [SPARK-7713] [SQL] Use shared broadcast hadoop conf
 for partitioned table scan.

https://issues.apache.org/jira/browse/SPARK-7713

I tested the performance with the following code:
```scala
import sqlContext._
import sqlContext.implicits._

(1 to 5000).foreach { i =>
  val df = (1 to 1000).map(j => (j, s"str$j")).toDF("a", "b").save(s"/tmp/partitioned/i=$i")
}

sqlContext.sql("""
CREATE TEMPORARY TABLE partitionedParquet
USING org.apache.spark.sql.parquet
OPTIONS (
  path '/tmp/partitioned'
)""")

table("partitionedParquet").explain(true)
```

In our master `explain` takes 40s in my laptop. With this PR, `explain` takes 14s.

Author: Yin Huai <yhuai@databricks.com>

Closes #6252 from yhuai/broadcastHadoopConf and squashes the following commits:

6fa73df [Yin Huai] Address comments of Josh and Andrew.
807fbf9 [Yin Huai] Make the new buildScan and SqlNewHadoopRDD private sql.
e393555 [Yin Huai] Cheng's comments.
2eb53bb [Yin Huai] Use a shared broadcast Hadoop Configuration for partitioned HadoopFsRelations.
---
 .../apache/spark/sql/parquet/newParquet.scala | 113 +++++---
 .../sql/sources/DataSourceStrategy.scala      |  19 +-
 .../spark/sql/sources/SqlNewHadoopRDD.scala   | 268 ++++++++++++++++++
 .../apache/spark/sql/sources/interfaces.scala |  35 ++-
 4 files changed, 387 insertions(+), 48 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 7ca44f7b81a2d..c35b7eff82af5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -23,6 +23,7 @@ import scala.collection.JavaConversions._
 import scala.util.Try
 
 import com.google.common.base.Objects
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
@@ -32,13 +33,14 @@ import parquet.hadoop._
 import parquet.hadoop.metadata.CompressionCodecName
 import parquet.hadoop.util.ContextUtil
 
+import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD._
-import org.apache.spark.rdd.{NewHadoopPartition, NewHadoopRDD, RDD}
+import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.sql.{Row, SQLConf, SQLContext}
-import org.apache.spark.{Logging, Partition => SparkPartition, SparkException}
+import org.apache.spark.{Partition => SparkPartition, SparkEnv, SerializableWritable, Logging, SparkException}
 
 private[sql] class DefaultSource extends HadoopFsRelationProvider {
   override def createRelation(
@@ -233,40 +235,20 @@ private[sql] class ParquetRelation2(
   override def buildScan(
       requiredColumns: Array[String],
       filters: Array[Filter],
-      inputFiles: Array[FileStatus]): RDD[Row] = {
-
-    val job = new Job(SparkHadoopUtil.get.conf)
-    val conf = ContextUtil.getConfiguration(job)
-
-    ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport])
-
-    if (inputFiles.nonEmpty) {
-      FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*)
-    }
-
-    // Try to push down filters when filter push-down is enabled.
-    if (sqlContext.conf.parquetFilterPushDown) {
-      filters
-        // Collects all converted Parquet filter predicates. Notice that not all predicates can be
-        // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
-        // is used here.
-        .flatMap(ParquetFilters.createFilter(dataSchema, _))
-        .reduceOption(FilterApi.and)
-        .foreach(ParquetInputFormat.setFilterPredicate(conf, _))
-    }
-
-    conf.set(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, {
-      val requestedSchema = StructType(requiredColumns.map(dataSchema(_)))
-      ParquetTypesConverter.convertToString(requestedSchema.toAttributes)
-    })
-
-    conf.set(
-      RowWriteSupport.SPARK_ROW_SCHEMA,
-      ParquetTypesConverter.convertToString(dataSchema.toAttributes))
-
-    // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
+      inputFiles: Array[FileStatus],
+      broadcastedConf: Broadcast[SerializableWritable[Configuration]]): RDD[Row] = {
     val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean
-    conf.set(SQLConf.PARQUET_CACHE_METADATA, useMetadataCache.toString)
+    val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown
+    // Create the function to set variable Parquet confs at both driver and executor side.
+    val initLocalJobFuncOpt =
+      ParquetRelation2.initializeLocalJobFunc(
+        requiredColumns,
+        filters,
+        dataSchema,
+        useMetadataCache,
+        parquetFilterPushDown) _
+    // Create the function to set input paths at the driver side.
+    val setInputPaths = ParquetRelation2.initializeDriverSideJobFunc(inputFiles) _
 
     val footers = inputFiles.map(f => metadataCache.footers(f.getPath))
 
@@ -274,12 +256,14 @@ private[sql] class ParquetRelation2(
     // After upgrading to Parquet 1.6.0, we should be able to stop caching `FileStatus` objects and
     // footers.  Especially when a global arbitrative schema (either from metastore or data source
     // DDL) is available.
-    new NewHadoopRDD(
-      sqlContext.sparkContext,
-      classOf[FilteringParquetRowInputFormat],
-      classOf[Void],
-      classOf[Row],
-      conf) {
+    new SqlNewHadoopRDD(
+      sc = sqlContext.sparkContext,
+      broadcastedConf = broadcastedConf,
+      initDriverSideJobFuncOpt = Some(setInputPaths),
+      initLocalJobFuncOpt = Some(initLocalJobFuncOpt),
+      inputFormatClass = classOf[FilteringParquetRowInputFormat],
+      keyClass = classOf[Void],
+      valueClass = classOf[Row]) {
 
       val cacheMetadata = useMetadataCache
 
@@ -311,11 +295,11 @@ private[sql] class ParquetRelation2(
           new FilteringParquetRowInputFormat
         }
 
-        val jobContext = newJobContext(getConf, jobId)
+        val jobContext = newJobContext(getConf(isDriverSide = true), jobId)
         val rawSplits = inputFormat.getSplits(jobContext)
 
         Array.tabulate[SparkPartition](rawSplits.size) { i =>
-          new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
+          new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
         }
       }
     }.values
@@ -452,6 +436,49 @@ private[sql] object ParquetRelation2 extends Logging {
   // internally.
   private[sql] val METASTORE_SCHEMA = "metastoreSchema"
 
+  /** This closure sets various Parquet configurations at both driver side and executor side. */
+  private[parquet] def initializeLocalJobFunc(
+      requiredColumns: Array[String],
+      filters: Array[Filter],
+      dataSchema: StructType,
+      useMetadataCache: Boolean,
+      parquetFilterPushDown: Boolean)(job: Job): Unit = {
+    val conf = job.getConfiguration
+    conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[RowReadSupport].getName())
+
+    // Try to push down filters when filter push-down is enabled.
+    if (parquetFilterPushDown) {
+      filters
+        // Collects all converted Parquet filter predicates. Notice that not all predicates can be
+        // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
+        // is used here.
+        .flatMap(ParquetFilters.createFilter(dataSchema, _))
+        .reduceOption(FilterApi.and)
+        .foreach(ParquetInputFormat.setFilterPredicate(conf, _))
+    }
+
+    conf.set(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, {
+      val requestedSchema = StructType(requiredColumns.map(dataSchema(_)))
+      ParquetTypesConverter.convertToString(requestedSchema.toAttributes)
+    })
+
+    conf.set(
+      RowWriteSupport.SPARK_ROW_SCHEMA,
+      ParquetTypesConverter.convertToString(dataSchema.toAttributes))
+
+    // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
+    conf.set(SQLConf.PARQUET_CACHE_METADATA, useMetadataCache.toString)
+  }
+
+  /** This closure sets input paths at the driver side. */
+  private[parquet] def initializeDriverSideJobFunc(
+      inputFiles: Array[FileStatus])(job: Job): Unit = {
+    // We side the input paths at the driver side.
+    if (inputFiles.nonEmpty) {
+      FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*)
+    }
+  }
+
   private[parquet] def readSchema(
       footers: Seq[Footer], sqlContext: SQLContext): Option[StructType] = {
     footers.map { footer =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index 1615a6dcbdb2a..550090d22d551 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -17,7 +17,8 @@
 
 package org.apache.spark.sql.sources
 
-import org.apache.spark.Logging
+import org.apache.spark.{SerializableWritable, Logging}
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{RDD, UnionRDD}
 import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
@@ -84,11 +85,16 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
 
     // Scanning non-partitioned HadoopFsRelation
     case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: HadoopFsRelation)) =>
+      // See buildPartitionedTableScan for the reason that we need to create a shard
+      // broadcast HadoopConf.
+      val sharedHadoopConf = SparkHadoopUtil.get.conf
+      val confBroadcast =
+        t.sqlContext.sparkContext.broadcast(new SerializableWritable(sharedHadoopConf))
       pruneFilterProject(
         l,
         projectList,
         filters,
-        (a, f) => t.buildScan(a, f, t.paths)) :: Nil
+        (a, f) => t.buildScan(a, f, t.paths, confBroadcast)) :: Nil
 
     case l @ LogicalRelation(t: TableScan) =>
       createPhysicalRDD(l.relation, l.output, t.buildScan()) :: Nil
@@ -115,6 +121,12 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
     val output = projections.map(_.toAttribute)
     val relation = logicalRelation.relation.asInstanceOf[HadoopFsRelation]
 
+    // Because we are creating one RDD per partition, we need to have a shared HadoopConf.
+    // Otherwise, the cost of broadcasting HadoopConf in every RDD will be high.
+    val sharedHadoopConf = SparkHadoopUtil.get.conf
+    val confBroadcast =
+      relation.sqlContext.sparkContext.broadcast(new SerializableWritable(sharedHadoopConf))
+
     // Builds RDD[Row]s for each selected partition.
     val perPartitionRows = partitions.map { case Partition(partitionValues, dir) =>
       // The table scan operator (PhysicalRDD) which retrieves required columns from data files.
@@ -132,7 +144,8 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
             // assuming partition columns data stored in data files are always consistent with those
             // partition values encoded in partition directory paths.
             val nonPartitionColumns = requiredColumns.filterNot(partitionColNames.contains)
-            val dataRows = relation.buildScan(nonPartitionColumns, filters, Array(dir))
+            val dataRows =
+              relation.buildScan(nonPartitionColumns, filters, Array(dir), confBroadcast)
 
             // Merges data values with partition values.
             mergeWithPartitionValues(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
new file mode 100644
index 0000000000000..0c7bb6e50cd98
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources
+
+import java.text.SimpleDateFormat
+import java.util.Date
+
+import org.apache.hadoop.conf.{Configurable, Configuration}
+import org.apache.hadoop.io.Writable
+import org.apache.hadoop.mapreduce._
+import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileSplit}
+import org.apache.spark.broadcast.Broadcast
+
+import org.apache.spark.{Partition => SparkPartition, _}
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.executor.DataReadMethod
+import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
+import org.apache.spark.rdd.{RDD, HadoopRDD}
+import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.util.Utils
+
+import scala.reflect.ClassTag
+
+private[spark] class SqlNewHadoopPartition(
+    rddId: Int,
+    val index: Int,
+    @transient rawSplit: InputSplit with Writable)
+  extends SparkPartition {
+
+  val serializableHadoopSplit = new SerializableWritable(rawSplit)
+
+  override def hashCode(): Int = 41 * (41 + rddId) + index
+}
+
+/**
+ * An RDD that provides core functionality for reading data stored in Hadoop (e.g., files in HDFS,
+ * sources in HBase, or S3), using the new MapReduce API (`org.apache.hadoop.mapreduce`).
+ * It is based on [[org.apache.spark.rdd.NewHadoopRDD]]. It has three additions.
+ * 1. A shared broadcast Hadoop Configuration.
+ * 2. An optional closure `initDriverSideJobFuncOpt` that set configurations at the driver side
+ *    to the shared Hadoop Configuration.
+ * 3. An optional closure `initLocalJobFuncOpt` that set configurations at both the driver side
+ *    and the executor side to the shared Hadoop Configuration.
+ *
+ * Note: This is RDD is basically a cloned version of [[org.apache.spark.rdd.NewHadoopRDD]] with
+ * changes based on [[org.apache.spark.rdd.HadoopRDD]]. In future, this functionality will be
+ * folded into core.
+ */
+private[sql] class SqlNewHadoopRDD[K, V](
+    @transient sc : SparkContext,
+    broadcastedConf: Broadcast[SerializableWritable[Configuration]],
+    @transient initDriverSideJobFuncOpt: Option[Job => Unit],
+    initLocalJobFuncOpt: Option[Job => Unit],
+    inputFormatClass: Class[_ <: InputFormat[K, V]],
+    keyClass: Class[K],
+    valueClass: Class[V])
+  extends RDD[(K, V)](sc, Nil)
+  with SparkHadoopMapReduceUtil
+  with Logging {
+
+  if (initLocalJobFuncOpt.isDefined) {
+    sc.clean(initLocalJobFuncOpt.get)
+  }
+
+  protected def getJob(): Job = {
+    val conf: Configuration = broadcastedConf.value.value
+    // "new Job" will make a copy of the conf. Then, it is
+    // safe to mutate conf properties with initLocalJobFuncOpt
+    // and initDriverSideJobFuncOpt.
+    val newJob = new Job(conf)
+    initLocalJobFuncOpt.map(f => f(newJob))
+    newJob
+  }
+
+  def getConf(isDriverSide: Boolean): Configuration = {
+    val job = getJob()
+    if (isDriverSide) {
+      initDriverSideJobFuncOpt.map(f => f(job))
+    }
+    job.getConfiguration
+  }
+
+  private val jobTrackerId: String = {
+    val formatter = new SimpleDateFormat("yyyyMMddHHmm")
+    formatter.format(new Date())
+  }
+
+  @transient protected val jobId = new JobID(jobTrackerId, id)
+
+  override def getPartitions: Array[SparkPartition] = {
+    val conf = getConf(isDriverSide = true)
+    val inputFormat = inputFormatClass.newInstance
+    inputFormat match {
+      case configurable: Configurable =>
+        configurable.setConf(conf)
+      case _ =>
+    }
+    val jobContext = newJobContext(conf, jobId)
+    val rawSplits = inputFormat.getSplits(jobContext).toArray
+    val result = new Array[SparkPartition](rawSplits.size)
+    for (i <- 0 until rawSplits.size) {
+      result(i) =
+        new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
+    }
+    result
+  }
+
+  override def compute(
+      theSplit: SparkPartition,
+      context: TaskContext): InterruptibleIterator[(K, V)] = {
+    val iter = new Iterator[(K, V)] {
+      val split = theSplit.asInstanceOf[SqlNewHadoopPartition]
+      logInfo("Input split: " + split.serializableHadoopSplit)
+      val conf = getConf(isDriverSide = false)
+
+      val inputMetrics = context.taskMetrics
+        .getInputMetricsForReadMethod(DataReadMethod.Hadoop)
+
+      // Find a function that will return the FileSystem bytes read by this thread. Do this before
+      // creating RecordReader, because RecordReader's constructor might read some bytes
+      val bytesReadCallback = inputMetrics.bytesReadCallback.orElse {
+        split.serializableHadoopSplit.value match {
+          case _: FileSplit | _: CombineFileSplit =>
+            SparkHadoopUtil.get.getFSBytesReadOnThreadCallback()
+          case _ => None
+        }
+      }
+      inputMetrics.setBytesReadCallback(bytesReadCallback)
+
+      val attemptId = newTaskAttemptID(jobTrackerId, id, isMap = true, split.index, 0)
+      val hadoopAttemptContext = newTaskAttemptContext(conf, attemptId)
+      val format = inputFormatClass.newInstance
+      format match {
+        case configurable: Configurable =>
+          configurable.setConf(conf)
+        case _ =>
+      }
+      val reader = format.createRecordReader(
+        split.serializableHadoopSplit.value, hadoopAttemptContext)
+      reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext)
+
+      // Register an on-task-completion callback to close the input stream.
+      context.addTaskCompletionListener(context => close())
+      var havePair = false
+      var finished = false
+      var recordsSinceMetricsUpdate = 0
+
+      override def hasNext: Boolean = {
+        if (!finished && !havePair) {
+          finished = !reader.nextKeyValue
+          havePair = !finished
+        }
+        !finished
+      }
+
+      override def next(): (K, V) = {
+        if (!hasNext) {
+          throw new java.util.NoSuchElementException("End of stream")
+        }
+        havePair = false
+        if (!finished) {
+          inputMetrics.incRecordsRead(1)
+        }
+        (reader.getCurrentKey, reader.getCurrentValue)
+      }
+
+      private def close() {
+        try {
+          reader.close()
+          if (bytesReadCallback.isDefined) {
+            inputMetrics.updateBytesRead()
+          } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
+                     split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) {
+            // If we can't get the bytes read from the FS stats, fall back to the split size,
+            // which may be inaccurate.
+            try {
+              inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength)
+            } catch {
+              case e: java.io.IOException =>
+                logWarning("Unable to get input size to set InputMetrics for task", e)
+            }
+          }
+        } catch {
+          case e: Exception => {
+            if (!Utils.inShutdown()) {
+              logWarning("Exception in RecordReader.close()", e)
+            }
+          }
+        }
+      }
+    }
+    new InterruptibleIterator(context, iter)
+  }
+
+  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
+  @DeveloperApi
+  def mapPartitionsWithInputSplit[U: ClassTag](
+      f: (InputSplit, Iterator[(K, V)]) => Iterator[U],
+      preservesPartitioning: Boolean = false): RDD[U] = {
+    new NewHadoopMapPartitionsWithSplitRDD(this, f, preservesPartitioning)
+  }
+
+  override def getPreferredLocations(hsplit: SparkPartition): Seq[String] = {
+    val split = hsplit.asInstanceOf[SqlNewHadoopPartition].serializableHadoopSplit.value
+    val locs = HadoopRDD.SPLIT_INFO_REFLECTIONS match {
+      case Some(c) => 
+        try {
+          val infos = c.newGetLocationInfo.invoke(split).asInstanceOf[Array[AnyRef]]
+          Some(HadoopRDD.convertSplitLocationInfo(infos))
+        } catch {
+          case e : Exception =>
+            logDebug("Failed to use InputSplit#getLocationInfo.", e)
+            None
+        }
+      case None => None
+    }
+    locs.getOrElse(split.getLocations.filter(_ != "localhost"))
+  }
+
+  override def persist(storageLevel: StorageLevel): this.type = {
+    if (storageLevel.deserialized) {
+      logWarning("Caching NewHadoopRDDs as deserialized objects usually leads to undesired" +
+        " behavior because Hadoop's RecordReader reuses the same Writable object for all records." +
+        " Use a map transformation to make copies of the records.")
+    }
+    super.persist(storageLevel)
+  }
+}
+
+private[spark] object SqlNewHadoopRDD {
+  /**
+   * Analogous to [[org.apache.spark.rdd.MapPartitionsRDD]], but passes in an InputSplit to
+   * the given function rather than the index of the partition.
+   */
+  private[spark] class NewHadoopMapPartitionsWithSplitRDD[U: ClassTag, T: ClassTag](
+      prev: RDD[T],
+      f: (InputSplit, Iterator[T]) => Iterator[U],
+      preservesPartitioning: Boolean = false)
+    extends RDD[U](prev) {
+
+    override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None
+
+    override def getPartitions: Array[SparkPartition] = firstParent[T].partitions
+
+    override def compute(split: SparkPartition, context: TaskContext): Iterator[U] = {
+      val partition = split.asInstanceOf[SqlNewHadoopPartition]
+      val inputSplit = partition.serializableHadoopSplit.value
+      f(inputSplit, firstParent[T].iterator(split, context))
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 9b52d1be3df2d..6a917bf38b139 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -25,7 +25,9 @@ import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
 import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
+import org.apache.spark.SerializableWritable
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
@@ -484,7 +486,8 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   private[sources] final def buildScan(
       requiredColumns: Array[String],
       filters: Array[Filter],
-      inputPaths: Array[String]): RDD[Row] = {
+      inputPaths: Array[String],
+      broadcastedConf: Broadcast[SerializableWritable[Configuration]]): RDD[Row] = {
     val inputStatuses = inputPaths.flatMap { input =>
       val path = new Path(input)
 
@@ -499,7 +502,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
       }
     }
 
-    buildScan(requiredColumns, filters, inputStatuses)
+    buildScan(requiredColumns, filters, inputStatuses, broadcastedConf)
   }
 
   /**
@@ -583,6 +586,34 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
     buildScan(requiredColumns, inputFiles)
   }
 
+  /**
+   * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within
+   * this relation. For partitioned relations, this method is called for each selected partition,
+   * and builds an `RDD[Row]` containing all rows within that single partition.
+   *
+   * Note: This interface is subject to change in future.
+   *
+   * @param requiredColumns Required columns.
+   * @param filters Candidate filters to be pushed down. The actual filter should be the conjunction
+   *        of all `filters`.  The pushed down filters are currently purely an optimization as they
+   *        will all be evaluated again. This means it is safe to use them with methods that produce
+   *        false positives such as filtering partitions based on a bloom filter.
+   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
+   *        relation. For a partitioned relation, it contains paths of all data files in a single
+   *        selected partition.
+   * @param broadcastedConf A shared broadcast Hadoop Configuration, which can be used to reduce the
+   *                        overhead of broadcasting the Configuration for every Hadoop RDD.
+   *
+   * @since 1.4.0
+   */
+  private[sql] def buildScan(
+      requiredColumns: Array[String],
+      filters: Array[Filter],
+      inputFiles: Array[FileStatus],
+      broadcastedConf: Broadcast[SerializableWritable[Configuration]]): RDD[Row] = {
+    buildScan(requiredColumns, filters, inputFiles)
+  }
+
   /**
    * Prepares a write job and returns an [[OutputWriterFactory]].  Client side job preparation can
    * be put here.  For example, user defined output committer can be configured here

From 2ad4837cfa66fcedc96b0819a8c2f4c3d70b0aaa Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 20 May 2015 12:50:06 -0700
Subject: [PATCH 089/525] [SPARK-7537] [MLLIB] spark.mllib API updates

Minor updates to the spark.mllib APIs:

1. Add `DeveloperApi` to `PMMLExportable` and add `Experimental` to `toPMML` methods.
2. Mention `RankingMetrics.of` in the `RankingMetrics` constructor.

Author: Xiangrui Meng <meng@databricks.com>

Closes #6280 from mengxr/SPARK-7537 and squashes the following commits:

1bd2583 [Xiangrui Meng] organize imports
94afa7a [Xiangrui Meng] mark all toPMML methods experimental
4c40da1 [Xiangrui Meng] mention the factory method for RankingMetrics for Java users
88c62d0 [Xiangrui Meng] add DeveloperApi to PMMLExportable
---
 .../spark/mllib/evaluation/RankingMetrics.scala       |  2 ++
 .../org/apache/spark/mllib/pmml/PMMLExportable.scala  | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
index b9b54b93c27fa..5b5a2a1450f7f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
@@ -31,6 +31,8 @@ import org.apache.spark.rdd.RDD
  * ::Experimental::
  * Evaluator for ranking algorithms.
  *
+ * Java users should use [[RankingMetrics$.of]] to create a [[RankingMetrics]] instance.
+ *
  * @param predictionAndLabels an RDD of (predicted ranking, ground truth set) pairs.
  */
 @Experimental
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
index 354e90f3eeaa6..5e882d4ebb10b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
@@ -23,13 +23,16 @@ import javax.xml.transform.stream.StreamResult
 import org.jpmml.model.JAXBUtil
 
 import org.apache.spark.SparkContext
+import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
 
 /**
+ * :: DeveloperApi ::
  * Export model to the PMML format
  * Predictive Model Markup Language (PMML) is an XML-based file format
  * developed by the Data Mining Group (www.dmg.org).
  */
+@DeveloperApi
 trait PMMLExportable {
 
   /**
@@ -41,30 +44,38 @@ trait PMMLExportable {
   }
 
   /**
+   * :: Experimental ::
    * Export the model to a local file in PMML format
    */
+  @Experimental
   def toPMML(localPath: String): Unit = {
     toPMML(new StreamResult(new File(localPath)))
   }
 
   /**
+   * :: Experimental ::
    * Export the model to a directory on a distributed file system in PMML format
    */
+  @Experimental
   def toPMML(sc: SparkContext, path: String): Unit = {
     val pmml = toPMML()
     sc.parallelize(Array(pmml), 1).saveAsTextFile(path)
   }
 
   /**
+   * :: Experimental ::
    * Export the model to the OutputStream in PMML format
    */
+  @Experimental
   def toPMML(outputStream: OutputStream): Unit = {
     toPMML(new StreamResult(outputStream))
   }
 
   /**
+   * :: Experimental ::
    * Export the model to a String in PMML format
    */
+  @Experimental
   def toPMML(): String = {
     val writer = new StringWriter
     toPMML(new StreamResult(writer))

From 829f1d95bac9153e7b646fbc0d55566ecf896200 Mon Sep 17 00:00:00 2001
From: Sandy Ryza <sandy@cloudera.com>
Date: Wed, 20 May 2015 13:10:30 -0700
Subject: [PATCH 090/525] [SPARK-7579] [ML] [DOC] User guide update for
 OneHotEncoder

Author: Sandy Ryza <sandy@cloudera.com>

Closes #6126 from sryza/sandy-spark-7579 and squashes the following commits:

5af803d [Sandy Ryza] SPARK-7579 [MLLIB] User guide update for OneHotEncoder
---
 docs/ml-features.md | 95 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 63ea3e5db7ac9..235029d71fadd 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -440,5 +440,100 @@ for expanded in polyDF.select("polyFeatures").take(3):
 </div>
 </div>
 
+## OneHotEncoder
+
+[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features 
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
+
+val df = sqlContext.createDataFrame(Seq(
+  (0, "a"),
+  (1, "b"),
+  (2, "c"),
+  (3, "a"),
+  (4, "a"),
+  (5, "c")
+)).toDF("id", "category")
+
+val indexer = new StringIndexer()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex")
+  .fit(df)
+val indexed = indexer.transform(df)
+
+val encoder = new OneHotEncoder().setInputCol("categoryIndex").
+  setOutputCol("categoryVec")
+val encoded = encoder.transform(indexed)
+encoded.select("id", "categoryVec").foreach(println)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.OneHotEncoder;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.ml.feature.StringIndexerModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+    RowFactory.create(0, "a"),
+    RowFactory.create(1, "b"),
+    RowFactory.create(2, "c"),
+    RowFactory.create(3, "a"),
+    RowFactory.create(4, "a"),
+    RowFactory.create(5, "c")
+));
+StructType schema = new StructType(new StructField[]{
+    new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
+    new StructField("category", DataTypes.StringType, false, Metadata.empty())
+});
+DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+StringIndexerModel indexer = new StringIndexer()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex")
+  .fit(df);
+DataFrame indexed = indexer.transform(df);
+
+OneHotEncoder encoder = new OneHotEncoder()
+  .setInputCol("categoryIndex")
+  .setOutputCol("categoryVec");
+DataFrame encoded = encoder.transform(indexed);
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.ml.feature import OneHotEncoder, StringIndexer
+
+df = sqlContext.createDataFrame([
+  (0, "a"),
+  (1, "b"),
+  (2, "c"),
+  (3, "a"),
+  (4, "a"),
+  (5, "c")
+], ["id", "category"])
+
+stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+model = stringIndexer.fit(df)
+indexed = model.transform(df)
+encoder = OneHotEncoder(includeFirst=False, inputCol="categoryIndex", outputCol="categoryVec")
+encoded = encoder.transform(indexed)
+{% endhighlight %}
+</div>
+</div>
+
 # Feature Selectors
 

From 6338c40da61de045485c51aa11a5b1e425d22144 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Wed, 20 May 2015 13:39:04 -0700
Subject: [PATCH 091/525] Revert "[SPARK-7320] [SQL] Add Cube / Rollup for
 dataframe"

This reverts commit 10698e1131f665addb454cd498669920699a91b2.
---
 .../org/apache/spark/sql/DataFrame.scala      | 104 +-----------------
 .../org/apache/spark/sql/GroupedData.scala    |  92 +++++-----------
 .../hive/HiveDataFrameAnalyticsSuite.scala    |  62 -----------
 3 files changed, 28 insertions(+), 230 deletions(-)
 delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index d78b4c2f8909c..adad85806d1ea 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -685,53 +685,7 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def groupBy(cols: Column*): GroupedData = {
-    GroupedData(this, cols.map(_.expr), GroupedData.GroupByType)
-  }
-
-  /**
-   * Create a multi-dimensional rollup for the current [[DataFrame]] using the specified columns,
-   * so we can run aggregation on them.
-   * See [[GroupedData]] for all the available aggregate functions.
-   *
-   * {{{
-   *   // Compute the average for all numeric columns rolluped by department and group.
-   *   df.rollup($"department", $"group").avg()
-   *
-   *   // Compute the max age and average salary, rolluped by department and gender.
-   *   df.rollup($"department", $"gender").agg(Map(
-   *     "salary" -> "avg",
-   *     "age" -> "max"
-   *   ))
-   * }}}
-   * @group dfops
-   * @since 1.4.0
-   */
-  @scala.annotation.varargs
-  def rollup(cols: Column*): GroupedData = {
-    GroupedData(this, cols.map(_.expr), GroupedData.RollupType)
-  }
-
-  /**
-   * Create a multi-dimensional cube for the current [[DataFrame]] using the specified columns,
-   * so we can run aggregation on them.
-   * See [[GroupedData]] for all the available aggregate functions.
-   *
-   * {{{
-   *   // Compute the average for all numeric columns cubed by department and group.
-   *   df.cube($"department", $"group").avg()
-   *
-   *   // Compute the max age and average salary, cubed by department and gender.
-   *   df.cube($"department", $"gender").agg(Map(
-   *     "salary" -> "avg",
-   *     "age" -> "max"
-   *   ))
-   * }}}
-   * @group dfops
-   * @since 1.4.0
-   */
-  @scala.annotation.varargs
-  def cube(cols: Column*): GroupedData = GroupedData(this, cols.map(_.expr), GroupedData.CubeType)
+  def groupBy(cols: Column*): GroupedData = new GroupedData(this, cols.map(_.expr))
 
   /**
    * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them.
@@ -756,61 +710,7 @@ class DataFrame private[sql](
   @scala.annotation.varargs
   def groupBy(col1: String, cols: String*): GroupedData = {
     val colNames: Seq[String] = col1 +: cols
-    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.GroupByType)
-  }
-
-  /**
-   * Create a multi-dimensional rollup for the current [[DataFrame]] using the specified columns,
-   * so we can run aggregation on them.
-   * See [[GroupedData]] for all the available aggregate functions.
-   *
-   * This is a variant of rollup that can only group by existing columns using column names
-   * (i.e. cannot construct expressions).
-   *
-   * {{{
-   *   // Compute the average for all numeric columns rolluped by department and group.
-   *   df.rollup("department", "group").avg()
-   *
-   *   // Compute the max age and average salary, rolluped by department and gender.
-   *   df.rollup($"department", $"gender").agg(Map(
-   *     "salary" -> "avg",
-   *     "age" -> "max"
-   *   ))
-   * }}}
-   * @group dfops
-   * @since 1.4.0
-   */
-  @scala.annotation.varargs
-  def rollup(col1: String, cols: String*): GroupedData = {
-    val colNames: Seq[String] = col1 +: cols
-    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.RollupType)
-  }
-
-  /**
-   * Create a multi-dimensional cube for the current [[DataFrame]] using the specified columns,
-   * so we can run aggregation on them.
-   * See [[GroupedData]] for all the available aggregate functions.
-   *
-   * This is a variant of cube that can only group by existing columns using column names
-   * (i.e. cannot construct expressions).
-   *
-   * {{{
-   *   // Compute the average for all numeric columns cubed by department and group.
-   *   df.cube("department", "group").avg()
-   *
-   *   // Compute the max age and average salary, cubed by department and gender.
-   *   df.cube($"department", $"gender").agg(Map(
-   *     "salary" -> "avg",
-   *     "age" -> "max"
-   *   ))
-   * }}}
-   * @group dfops
-   * @since 1.4.0
-   */
-  @scala.annotation.varargs
-  def cube(col1: String, cols: String*): GroupedData = {
-    val colNames: Seq[String] = col1 +: cols
-    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.CubeType)
+    new GroupedData(this, colNames.map(colName => resolve(colName)))
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index f730e4ae00e2b..1381b9f1a6080 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -23,40 +23,9 @@ import scala.language.implicitConversions
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.Star
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate}
+import org.apache.spark.sql.catalyst.plans.logical.Aggregate
 import org.apache.spark.sql.types.NumericType
 
-/**
- * Companion object for GroupedData
- */
-private[sql] object GroupedData {
-  def apply(
-      df: DataFrame,
-      groupingExprs: Seq[Expression],
-      groupType: GroupType): GroupedData = {
-    new GroupedData(df, groupingExprs, groupType: GroupType)
-  }
-
-  /**
-   * The Grouping Type
-   */
-  trait GroupType
-
-  /**
-   * To indicate it's the GroupBy
-   */
-  object GroupByType extends GroupType
-
-  /**
-   * To indicate it's the CUBE
-   */
-  object CubeType extends GroupType
-
-  /**
-   * To indicate it's the ROLLUP
-   */
-  object RollupType extends GroupType
-}
 
 /**
  * :: Experimental ::
@@ -65,37 +34,19 @@ private[sql] object GroupedData {
  * @since 1.3.0
  */
 @Experimental
-class GroupedData protected[sql](
-    df: DataFrame,
-    groupingExprs: Seq[Expression],
-    private val groupType: GroupedData.GroupType) {
+class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression]) {
 
-  private[this] def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
-    val aggregates = if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
-        val retainedExprs = groupingExprs.map {
-          case expr: NamedExpression => expr
-          case expr: Expression => Alias(expr, expr.prettyString)()
-        }
-        retainedExprs ++ aggExprs
-      } else {
-        aggExprs
-      }
-
-    groupType match {
-      case GroupedData.GroupByType =>
-        DataFrame(
-          df.sqlContext, Aggregate(groupingExprs, aggregates, df.logicalPlan))
-      case GroupedData.RollupType =>
-        DataFrame(
-          df.sqlContext, Rollup(groupingExprs, df.logicalPlan, aggregates))
-      case GroupedData.CubeType =>
-        DataFrame(
-          df.sqlContext, Cube(groupingExprs, df.logicalPlan, aggregates))
+  private[sql] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
+    val namedGroupingExprs = groupingExprs.map {
+      case expr: NamedExpression => expr
+      case expr: Expression => Alias(expr, expr.prettyString)()
     }
+    DataFrame(
+      df.sqlContext, Aggregate(groupingExprs, namedGroupingExprs ++ aggExprs, df.logicalPlan))
   }
 
   private[this] def aggregateNumericColumns(colNames: String*)(f: Expression => Expression)
-    : DataFrame = {
+    : Seq[NamedExpression] = {
 
     val columnExprs = if (colNames.isEmpty) {
       // No columns specified. Use all numeric columns.
@@ -112,10 +63,10 @@ class GroupedData protected[sql](
         namedExpr
       }
     }
-    toDF(columnExprs.map { c =>
+    columnExprs.map { c =>
       val a = f(c)
       Alias(a, a.prettyString)()
-    })
+    }
   }
 
   private[this] def strToExpr(expr: String): (Expression => Expression) = {
@@ -168,10 +119,10 @@ class GroupedData protected[sql](
    * @since 1.3.0
    */
   def agg(exprs: Map[String, String]): DataFrame = {
-    toDF(exprs.map { case (colName, expr) =>
+    exprs.map { case (colName, expr) =>
       val a = strToExpr(expr)(df(colName).expr)
       Alias(a, a.prettyString)()
-    }.toSeq)
+    }.toSeq
   }
 
   /**
@@ -224,10 +175,19 @@ class GroupedData protected[sql](
    */
   @scala.annotation.varargs
   def agg(expr: Column, exprs: Column*): DataFrame = {
-    toDF((expr +: exprs).map(_.expr).map {
+    val aggExprs = (expr +: exprs).map(_.expr).map {
       case expr: NamedExpression => expr
       case expr: Expression => Alias(expr, expr.prettyString)()
-    })
+    }
+    if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
+      val retainedExprs = groupingExprs.map {
+        case expr: NamedExpression => expr
+        case expr: Expression => Alias(expr, expr.prettyString)()
+      }
+      DataFrame(df.sqlContext, Aggregate(groupingExprs, retainedExprs ++ aggExprs, df.logicalPlan))
+    } else {
+      DataFrame(df.sqlContext, Aggregate(groupingExprs, aggExprs, df.logicalPlan))
+    }
   }
 
   /**
@@ -236,7 +196,7 @@ class GroupedData protected[sql](
    *
    * @since 1.3.0
    */
-  def count(): DataFrame = toDF(Seq(Alias(Count(Literal(1)), "count")()))
+  def count(): DataFrame = Seq(Alias(Count(Literal(1)), "count")())
 
   /**
    * Compute the average value for each numeric columns for each group. This is an alias for `avg`.
@@ -296,5 +256,5 @@ class GroupedData protected[sql](
   @scala.annotation.varargs
   def sum(colNames: String*): DataFrame = {
     aggregateNumericColumns(colNames:_*)(Sum)
-  }
+  }    
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
deleted file mode 100644
index 3ad05f482504c..0000000000000
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive
-
-import org.apache.spark.sql.QueryTest
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.hive.test.TestHive
-import org.apache.spark.sql.hive.test.TestHive._
-import org.apache.spark.sql.hive.test.TestHive.implicits._
-
-case class TestData2Int(a: Int, b: Int)
-
-// TODO ideally we should put the test suite into the package `sql`, as
-// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
-// support the `cube` or `rollup` yet.
-class HiveDataFrameAnalyticsSuite extends QueryTest {
-  val testData =
-    TestHive.sparkContext.parallelize(
-      TestData2Int(1, 2) ::
-        TestData2Int(2, 4) :: Nil).toDF()
-
-  testData.registerTempTable("mytable")
-
-  test("rollup") {
-    checkAnswer(
-      testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
-      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with rollup").collect()
-    )
-
-    checkAnswer(
-      testData.rollup("a", "b").agg(sum("b")),
-      sql("select a, b, sum(b) from mytable group by a, b with rollup").collect()
-    )
-  }
-
-  test("cube") {
-    checkAnswer(
-      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
-      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
-    )
-
-    checkAnswer(
-      testData.cube("a", "b").agg(sum("b")),
-      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
-    )
-  }
-}

From 191ee474527530246ac3164ae9631e01bdd1e647 Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Wed, 20 May 2015 15:16:12 -0700
Subject: [PATCH 092/525] [SPARK-7511] [MLLIB] pyspark ml seed param should be
 random by default or 42 is quite funny but not very random

Author: Holden Karau <holden@pigscanfly.ca>

Closes #6139 from holdenk/SPARK-7511-pyspark-ml-seed-param-should-be-random-by-default-or-42-is-quite-funny-but-not-very-random and squashes the following commits:

591f8e5 [Holden Karau] specify old seed for doc tests
2470004 [Holden Karau] Fix a bunch of seeds with default values to have None as the default which will then result in using the hash of the class name
cbad96d [Holden Karau] Add the setParams function that is used in the real code
423b8d7 [Holden Karau] Switch the test code to behave slightly more like production code. also don't check the param map value only check for key existence
140d25d [Holden Karau] remove extra space
926165a [Holden Karau] Add some missing newlines for pep8 style
8616751 [Holden Karau] merge in master
58532e6 [Holden Karau] its the __name__ method, also treat None values as not set
56ef24a [Holden Karau] fix test and regenerate base
afdaa5c [Holden Karau] make sure different classes have different results
68eb528 [Holden Karau] switch default seed to hash of type of self
89c4611 [Holden Karau] Merge branch 'master' into SPARK-7511-pyspark-ml-seed-param-should-be-random-by-default-or-42-is-quite-funny-but-not-very-random
31cd96f [Holden Karau] specify the seed to randomforestregressor test
e1b947f [Holden Karau] Style fixes
ce90ec8 [Holden Karau] merge in master
bcdf3c9 [Holden Karau] update docstring seeds to none and some other default seeds from 42
65eba21 [Holden Karau] pep8 fixes
0e3797e [Holden Karau] Make seed default to random in more places
213a543 [Holden Karau] Simplify the generated code to only include set default if there is a default rather than having None is note None in the generated code
1ff17c2 [Holden Karau] Make the seed random for HasSeed in python
---
 python/pyspark/ml/classification.py           | 12 ++--
 python/pyspark/ml/feature.py                  | 10 +--
 python/pyspark/ml/param/__init__.py           |  2 +-
 .../ml/param/_shared_params_code_gen.py       |  9 +--
 python/pyspark/ml/param/shared.py             | 37 ++--------
 python/pyspark/ml/recommendation.py           | 10 +--
 python/pyspark/ml/regression.py               | 13 ++--
 python/pyspark/ml/tests.py                    | 67 +++++++++++++++++--
 8 files changed, 96 insertions(+), 64 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 4e645519c47c7..7abbde8b260eb 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -292,7 +292,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
     >>> si_model = stringIndexer.fit(df)
     >>> td = si_model.transform(df)
-    >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed")
+    >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
     >>> model = rf.fit(td)
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction
@@ -319,12 +319,12 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini",
-                 numTrees=20, featureSubsetStrategy="auto", seed=42):
+                 numTrees=20, featureSubsetStrategy="auto", seed=None):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
-                 numTrees=20, featureSubsetStrategy="auto", seed=42)
+                 numTrees=20, featureSubsetStrategy="auto", seed=None)
         """
         super(RandomForestClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -347,7 +347,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                   "The number of features to consider for splits at each tree node. Supported " +
                   "options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
-                         maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+                         maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
                          impurity="gini", numTrees=20, featureSubsetStrategy="auto")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -355,12 +355,12 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
     @keyword_only
     def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
-                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
                   impurity="gini", numTrees=20, featureSubsetStrategy="auto"):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
-                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42, \
+                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
                   impurity="gini", numTrees=20, featureSubsetStrategy="auto")
         Sets params for linear classification.
         """
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index c8115cb5bcf63..5511dceb70419 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -876,10 +876,10 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
 
     @keyword_only
     def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
-                 seed=42, inputCol=None, outputCol=None):
+                 seed=None, inputCol=None, outputCol=None):
         """
         __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \
-                 seed=42, inputCol=None, outputCol=None)
+                 seed=None, inputCol=None, outputCol=None)
         """
         super(Word2Vec, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
@@ -891,15 +891,15 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
                               "the minimum number of times a token must appear to be included " +
                               "in the word2vec model's vocabulary")
         self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
-                         seed=42)
+                         seed=None)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
-                  seed=42, inputCol=None, outputCol=None):
+                  seed=None, inputCol=None, outputCol=None):
         """
-        setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=42, \
+        setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, \
                  inputCol=None, outputCol=None)
         Sets params for this Word2Vec.
         """
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index 67fb6e3dc74fb..7845536161e07 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -147,7 +147,7 @@ def hasParam(self, paramName):
     def getOrDefault(self, param):
         """
         Gets the value of a param in the user-supplied param map or its
-        default value. Raises an error if either is set.
+        default value. Raises an error if neither is set.
         """
         param = self._resolveParam(param)
         if param in self._paramMap:
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 91e45ec373518..ccb929af184b8 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -56,9 +56,10 @@ def _gen_param_header(name, doc, defaultValueStr):
     def __init__(self):
         super(Has$Name, self).__init__()
         #: param for $doc
-        self.$name = Param(self, "$name", "$doc")
-        if $defaultValueStr is not None:
-            self._setDefault($name=$defaultValueStr)'''
+        self.$name = Param(self, "$name", "$doc")'''
+    if defaultValueStr is not None:
+        template += '''
+        self._setDefault($name=$defaultValueStr)'''
 
     Name = name[0].upper() + name[1:]
     return template \
@@ -118,7 +119,7 @@ def get$Name(self):
         ("outputCol", "output column name", None),
         ("numFeatures", "number of features", None),
         ("checkpointInterval", "checkpoint interval (>= 1)", None),
-        ("seed", "random seed", None),
+        ("seed", "random seed", "hash(type(self).__name__)"),
         ("tol", "the convergence tolerance for iterative algorithms", None),
         ("stepSize", "Step size to be used for each iteration of optimization.", None)]
     code = []
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index a5dc9b7ef29ed..0b93788899124 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -32,8 +32,6 @@ def __init__(self):
         super(HasMaxIter, self).__init__()
         #: param for max number of iterations (>= 0)
         self.maxIter = Param(self, "maxIter", "max number of iterations (>= 0)")
-        if None is not None:
-            self._setDefault(maxIter=None)
 
     def setMaxIter(self, value):
         """
@@ -61,8 +59,6 @@ def __init__(self):
         super(HasRegParam, self).__init__()
         #: param for regularization parameter (>= 0)
         self.regParam = Param(self, "regParam", "regularization parameter (>= 0)")
-        if None is not None:
-            self._setDefault(regParam=None)
 
     def setRegParam(self, value):
         """
@@ -90,8 +86,7 @@ def __init__(self):
         super(HasFeaturesCol, self).__init__()
         #: param for features column name
         self.featuresCol = Param(self, "featuresCol", "features column name")
-        if 'features' is not None:
-            self._setDefault(featuresCol='features')
+        self._setDefault(featuresCol='features')
 
     def setFeaturesCol(self, value):
         """
@@ -119,8 +114,7 @@ def __init__(self):
         super(HasLabelCol, self).__init__()
         #: param for label column name
         self.labelCol = Param(self, "labelCol", "label column name")
-        if 'label' is not None:
-            self._setDefault(labelCol='label')
+        self._setDefault(labelCol='label')
 
     def setLabelCol(self, value):
         """
@@ -148,8 +142,7 @@ def __init__(self):
         super(HasPredictionCol, self).__init__()
         #: param for prediction column name
         self.predictionCol = Param(self, "predictionCol", "prediction column name")
-        if 'prediction' is not None:
-            self._setDefault(predictionCol='prediction')
+        self._setDefault(predictionCol='prediction')
 
     def setPredictionCol(self, value):
         """
@@ -177,8 +170,7 @@ def __init__(self):
         super(HasProbabilityCol, self).__init__()
         #: param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
         self.probabilityCol = Param(self, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.")
-        if 'probability' is not None:
-            self._setDefault(probabilityCol='probability')
+        self._setDefault(probabilityCol='probability')
 
     def setProbabilityCol(self, value):
         """
@@ -206,8 +198,7 @@ def __init__(self):
         super(HasRawPredictionCol, self).__init__()
         #: param for raw prediction (a.k.a. confidence) column name
         self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name")
-        if 'rawPrediction' is not None:
-            self._setDefault(rawPredictionCol='rawPrediction')
+        self._setDefault(rawPredictionCol='rawPrediction')
 
     def setRawPredictionCol(self, value):
         """
@@ -235,8 +226,6 @@ def __init__(self):
         super(HasInputCol, self).__init__()
         #: param for input column name
         self.inputCol = Param(self, "inputCol", "input column name")
-        if None is not None:
-            self._setDefault(inputCol=None)
 
     def setInputCol(self, value):
         """
@@ -264,8 +253,6 @@ def __init__(self):
         super(HasInputCols, self).__init__()
         #: param for input column names
         self.inputCols = Param(self, "inputCols", "input column names")
-        if None is not None:
-            self._setDefault(inputCols=None)
 
     def setInputCols(self, value):
         """
@@ -293,8 +280,6 @@ def __init__(self):
         super(HasOutputCol, self).__init__()
         #: param for output column name
         self.outputCol = Param(self, "outputCol", "output column name")
-        if None is not None:
-            self._setDefault(outputCol=None)
 
     def setOutputCol(self, value):
         """
@@ -322,8 +307,6 @@ def __init__(self):
         super(HasNumFeatures, self).__init__()
         #: param for number of features
         self.numFeatures = Param(self, "numFeatures", "number of features")
-        if None is not None:
-            self._setDefault(numFeatures=None)
 
     def setNumFeatures(self, value):
         """
@@ -351,8 +334,6 @@ def __init__(self):
         super(HasCheckpointInterval, self).__init__()
         #: param for checkpoint interval (>= 1)
         self.checkpointInterval = Param(self, "checkpointInterval", "checkpoint interval (>= 1)")
-        if None is not None:
-            self._setDefault(checkpointInterval=None)
 
     def setCheckpointInterval(self, value):
         """
@@ -380,8 +361,7 @@ def __init__(self):
         super(HasSeed, self).__init__()
         #: param for random seed
         self.seed = Param(self, "seed", "random seed")
-        if None is not None:
-            self._setDefault(seed=None)
+        self._setDefault(seed=hash(type(self).__name__))
 
     def setSeed(self, value):
         """
@@ -409,8 +389,6 @@ def __init__(self):
         super(HasTol, self).__init__()
         #: param for the convergence tolerance for iterative algorithms
         self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms")
-        if None is not None:
-            self._setDefault(tol=None)
 
     def setTol(self, value):
         """
@@ -438,8 +416,6 @@ def __init__(self):
         super(HasStepSize, self).__init__()
         #: param for Step size to be used for each iteration of optimization.
         self.stepSize = Param(self, "stepSize", "Step size to be used for each iteration of optimization.")
-        if None is not None:
-            self._setDefault(stepSize=None)
 
     def setStepSize(self, value):
         """
@@ -467,6 +443,7 @@ class DecisionTreeParams(Params):
     minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
     maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
     cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
+    
 
     def __init__(self):
         super(DecisionTreeParams, self).__init__()
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index 39c2527543774..b3e0dd7abf681 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -89,11 +89,11 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
 
     @keyword_only
     def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
-                 implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
+                 implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
                  ratingCol="rating", nonnegative=False, checkpointInterval=10):
         """
         __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, \
-                 implicitPrefs=false, alpha=1.0, userCol="user", itemCol="item", seed=0, \
+                 implicitPrefs=false, alpha=1.0, userCol="user", itemCol="item", seed=None, \
                  ratingCol="rating", nonnegative=false, checkpointInterval=10)
         """
         super(ALS, self).__init__()
@@ -109,18 +109,18 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB
         self.nonnegative = Param(self, "nonnegative",
                                  "whether to use nonnegative constraint for least squares")
         self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
-                         implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
+                         implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
                          ratingCol="rating", nonnegative=False, checkpointInterval=10)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     def setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
-                  implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
+                  implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
                   ratingCol="rating", nonnegative=False, checkpointInterval=10):
         """
         setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, \
-                 implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0, \
+                 implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, \
                  ratingCol="rating", nonnegative=False, checkpointInterval=10)
         Sets params for ALS.
         """
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index ff809cdafdf51..b139e27372d80 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -257,7 +257,7 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
     ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
-    >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2)
+    >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
     >>> model = rf.fit(df)
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction
@@ -284,12 +284,13 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance",
-                 numTrees=20, featureSubsetStrategy="auto", seed=42):
+                 numTrees=20, featureSubsetStrategy="auto", seed=None):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
-                 impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=42)
+                 impurity="variance", numTrees=20, \
+                 featureSubsetStrategy="auto", seed=None)
         """
         super(RandomForestRegressor, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -312,7 +313,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                   "The number of features to consider for splits at each tree node. Supported " +
                   "options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
-                         maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+                         maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
                          impurity="variance", numTrees=20, featureSubsetStrategy="auto")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -320,12 +321,12 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
     @keyword_only
     def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
-                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
                   impurity="variance", numTrees=20, featureSubsetStrategy="auto"):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
-                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42, \
+                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
                   impurity="variance", numTrees=20, featureSubsetStrategy="auto")
         Sets params for linear regression.
         """
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 10fe0ef8db38f..6adbf166f34a8 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -33,7 +33,8 @@
 from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
 from pyspark.sql import DataFrame, SQLContext
 from pyspark.ml.param import Param, Params
-from pyspark.ml.param.shared import HasMaxIter, HasInputCol
+from pyspark.ml.param.shared import HasMaxIter, HasInputCol, HasSeed
+from pyspark.ml.util import keyword_only
 from pyspark.ml import Estimator, Model, Pipeline, Transformer
 from pyspark.ml.feature import *
 from pyspark.mllib.linalg import DenseVector
@@ -111,14 +112,46 @@ def test_pipeline(self):
         self.assertEqual(6, dataset.index)
 
 
-class TestParams(HasMaxIter, HasInputCol):
+class TestParams(HasMaxIter, HasInputCol, HasSeed):
     """
-    A subclass of Params mixed with HasMaxIter and HasInputCol.
+    A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed.
     """
-
-    def __init__(self):
+    @keyword_only
+    def __init__(self, seed=None):
         super(TestParams, self).__init__()
         self._setDefault(maxIter=10)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, seed=None):
+        """
+        setParams(self, seed=None)
+        Sets params for this test.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+
+class OtherTestParams(HasMaxIter, HasInputCol, HasSeed):
+    """
+    A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed.
+    """
+    @keyword_only
+    def __init__(self, seed=None):
+        super(OtherTestParams, self).__init__()
+        self._setDefault(maxIter=10)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, seed=None):
+        """
+        setParams(self, seed=None)
+        Sets params for this test.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
 
 
 class ParamTests(PySparkTestCase):
@@ -134,9 +167,10 @@ def test_params(self):
         testParams = TestParams()
         maxIter = testParams.maxIter
         inputCol = testParams.inputCol
+        seed = testParams.seed
 
         params = testParams.params
-        self.assertEqual(params, [inputCol, maxIter])
+        self.assertEqual(params, [inputCol, maxIter, seed])
 
         self.assertTrue(testParams.hasParam(maxIter))
         self.assertTrue(testParams.hasDefault(maxIter))
@@ -154,10 +188,29 @@ def test_params(self):
         with self.assertRaises(KeyError):
             testParams.getInputCol()
 
+        # Since the default is normally random, set it to a known number for debug str
+        testParams._setDefault(seed=41)
+        testParams.setSeed(43)
+
         self.assertEquals(
             testParams.explainParams(),
             "\n".join(["inputCol: input column name (undefined)",
-                       "maxIter: max number of iterations (>= 0) (default: 10, current: 100)"]))
+                       "maxIter: max number of iterations (>= 0) (default: 10, current: 100)",
+                       "seed: random seed (default: 41, current: 43)"]))
+
+    def test_hasseed(self):
+        noSeedSpecd = TestParams()
+        withSeedSpecd = TestParams(seed=42)
+        other = OtherTestParams()
+        # Check that we no longer use 42 as the magic number
+        self.assertNotEqual(noSeedSpecd.getSeed(), 42)
+        origSeed = noSeedSpecd.getSeed()
+        # Check that we only compute the seed once
+        self.assertEqual(noSeedSpecd.getSeed(), origSeed)
+        # Check that a specified seed is honored
+        self.assertEqual(withSeedSpecd.getSeed(), 42)
+        # Check that a different class has a different seed
+        self.assertNotEqual(other.getSeed(), noSeedSpecd.getSeed())
 
 
 class FeatureTests(PySparkTestCase):

From 9b84443dd43777e25b0b00468c61814fe6d26c23 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Wed, 20 May 2015 15:39:32 -0700
Subject: [PATCH 093/525] [SPARK-7237] [SPARK-7741] [CORE] [STREAMING] Clean
 more closures that need cleaning

SPARK-7741 is the equivalent of SPARK-7237 in streaming. This is an alternative to #6268.

Author: Andrew Or <andrew@databricks.com>

Closes #6269 from andrewor14/clean-moar and squashes the following commits:

c51c9ab [Andrew Or] Add periods (trivial)
6c686ac [Andrew Or] Merge branch 'master' of github.com:apache/spark into clean-moar
79a435b [Andrew Or] Fix tests
d18c9f9 [Andrew Or] Merge branch 'master' of github.com:apache/spark into clean-moar
65ef07b [Andrew Or] Fix tests?
4b487a3 [Andrew Or] Add tests for closures passed to DStream operations
328139b [Andrew Or] Do not forget foreachRDD
5431f61 [Andrew Or] Clean streaming closures
72b7b73 [Andrew Or] Clean core closures
---
 .../scala/org/apache/spark/SparkContext.scala |   4 +-
 .../apache/spark/rdd/PairRDDFunctions.scala   |   5 +-
 .../spark/util/ClosureCleanerSuite.scala      |   4 +
 .../spark/streaming/StreamingContext.scala    |   2 +-
 .../spark/streaming/dstream/DStream.scala     |   5 +-
 .../dstream/PairDStreamFunctions.scala        |  30 ++-
 .../spark/streaming/DStreamClosureSuite.scala | 196 ++++++++++++++++++
 .../spark/streaming/DStreamScopeSuite.scala   |  22 +-
 .../spark/streaming/TestSuiteBase.scala       |  18 ++
 9 files changed, 249 insertions(+), 37 deletions(-)
 create mode 100644 streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 3fe3dc5e300e8..cf3820fcb6a35 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1159,8 +1159,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
         kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]): RDD[(K, V)] = {
     withScope {
       assertNotStopped()
-      val kc = kcf()
-      val vc = vcf()
+      val kc = clean(kcf)()
+      val vc = clean(vcf)()
       val format = classOf[SequenceFileInputFormat[Writable, Writable]]
       val writables = hadoopFile(path, format,
         kc.writableClass(km).asInstanceOf[Class[Writable]],
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index a6d5d2c94e17f..8653cdee1adee 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -296,6 +296,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
    * before sending results to a reducer, similarly to a "combiner" in MapReduce.
    */
   def reduceByKeyLocally(func: (V, V) => V): Map[K, V] = self.withScope {
+    val cleanedF = self.sparkContext.clean(func)
 
     if (keyClass.isArray) {
       throw new SparkException("reduceByKeyLocally() does not support array keys")
@@ -305,7 +306,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       val map = new JHashMap[K, V]
       iter.foreach { pair =>
         val old = map.get(pair._1)
-        map.put(pair._1, if (old == null) pair._2 else func(old, pair._2))
+        map.put(pair._1, if (old == null) pair._2 else cleanedF(old, pair._2))
       }
       Iterator(map)
     } : Iterator[JHashMap[K, V]]
@@ -313,7 +314,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     val mergeMaps = (m1: JHashMap[K, V], m2: JHashMap[K, V]) => {
       m2.foreach { pair =>
         val old = m1.get(pair._1)
-        m1.put(pair._1, if (old == null) pair._2 else func(old, pair._2))
+        m1.put(pair._1, if (old == null) pair._2 else cleanedF(old, pair._2))
       }
       m1
     } : JHashMap[K, V]
diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
index e41f6ee27764e..7b165fe28bdd3 100644
--- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
@@ -112,6 +112,7 @@ class ClosureCleanerSuite extends FunSuite {
       expectCorrectException { TestUserClosuresActuallyCleaned.testAggregateByKey(pairRdd) }
       expectCorrectException { TestUserClosuresActuallyCleaned.testFoldByKey(pairRdd) }
       expectCorrectException { TestUserClosuresActuallyCleaned.testReduceByKey(pairRdd) }
+      expectCorrectException { TestUserClosuresActuallyCleaned.testReduceByKeyLocally(pairRdd) }
       expectCorrectException { TestUserClosuresActuallyCleaned.testMapValues(pairRdd) }
       expectCorrectException { TestUserClosuresActuallyCleaned.testFlatMapValues(pairRdd) }
       expectCorrectException { TestUserClosuresActuallyCleaned.testForeachAsync(rdd) }
@@ -315,6 +316,9 @@ private object TestUserClosuresActuallyCleaned {
   }
   def testFoldByKey(rdd: RDD[(Int, Int)]): Unit = { rdd.foldByKey(0) { case (_, _) => return; 1 } }
   def testReduceByKey(rdd: RDD[(Int, Int)]): Unit = { rdd.reduceByKey { case (_, _) => return; 1 } }
+  def testReduceByKeyLocally(rdd: RDD[(Int, Int)]): Unit = {
+    rdd.reduceByKeyLocally { case (_, _) => return; 1 }
+  }
   def testMapValues(rdd: RDD[(Int, Int)]): Unit = { rdd.mapValues { _ => return; 1 } }
   def testFlatMapValues(rdd: RDD[(Int, Int)]): Unit = { rdd.flatMapValues { _ => return; Seq() } }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 7f181bcecd4bf..fe614c4be590f 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -255,7 +255,7 @@ class StreamingContext private[streaming] (
    *
    * Note: Return statements are NOT allowed in the given body.
    */
-  private[streaming] def withNamedScope[U](name: String)(body: => U): U = {
+  private def withNamedScope[U](name: String)(body: => U): U = {
     RDDOperationScope.withScope(sc, name, allowNesting = false, ignoreParent = false)(body)
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
index 5977481e1f081..7c50a766a9bad 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
@@ -539,7 +539,7 @@ abstract class DStream[T: ClassTag] (
 
   /** Return a new DStream containing only the elements that satisfy a predicate. */
   def filter(filterFunc: T => Boolean): DStream[T] = ssc.withScope {
-    new FilteredDStream(this, filterFunc)
+    new FilteredDStream(this, context.sparkContext.clean(filterFunc))
   }
 
   /**
@@ -624,7 +624,8 @@ abstract class DStream[T: ClassTag] (
    * 'this' DStream will be registered as an output stream and therefore materialized.
    */
   def foreachRDD(foreachFunc: RDD[T] => Unit): Unit = ssc.withScope {
-    this.foreachRDD((r: RDD[T], t: Time) => foreachFunc(r))
+    val cleanedF = context.sparkContext.clean(foreachFunc, false)
+    this.foreachRDD((r: RDD[T], t: Time) => cleanedF(r))
   }
 
   /**
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
index 884a8e8b52289..fda22eb6ec42e 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
@@ -38,6 +38,8 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
 {
   private[streaming] def ssc = self.ssc
 
+  private[streaming] def sparkContext = self.context.sparkContext
+
   private[streaming] def defaultPartitioner(numPartitions: Int = self.ssc.sc.defaultParallelism) = {
     new HashPartitioner(numPartitions)
   }
@@ -98,8 +100,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def reduceByKey(
       reduceFunc: (V, V) => V,
       partitioner: Partitioner): DStream[(K, V)] = ssc.withScope {
-    val cleanedReduceFunc = ssc.sc.clean(reduceFunc)
-    combineByKey((v: V) => v, cleanedReduceFunc, cleanedReduceFunc, partitioner)
+    combineByKey((v: V) => v, reduceFunc, reduceFunc, partitioner)
   }
 
   /**
@@ -113,7 +114,15 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       mergeCombiner: (C, C) => C,
       partitioner: Partitioner,
       mapSideCombine: Boolean = true): DStream[(K, C)] = ssc.withScope {
-    new ShuffledDStream[K, V, C](self, createCombiner, mergeValue, mergeCombiner, partitioner,
+    val cleanedCreateCombiner = sparkContext.clean(createCombiner)
+    val cleanedMergeValue = sparkContext.clean(mergeValue)
+    val cleanedMergeCombiner = sparkContext.clean(mergeCombiner)
+    new ShuffledDStream[K, V, C](
+      self,
+      cleanedCreateCombiner,
+      cleanedMergeValue,
+      cleanedMergeCombiner,
+      partitioner,
       mapSideCombine)
   }
 
@@ -264,10 +273,9 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       slideDuration: Duration,
       partitioner: Partitioner
     ): DStream[(K, V)] = ssc.withScope {
-    val cleanedReduceFunc = ssc.sc.clean(reduceFunc)
-    self.reduceByKey(cleanedReduceFunc, partitioner)
+    self.reduceByKey(reduceFunc, partitioner)
         .window(windowDuration, slideDuration)
-        .reduceByKey(cleanedReduceFunc, partitioner)
+        .reduceByKey(reduceFunc, partitioner)
   }
 
   /**
@@ -385,8 +393,9 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       updateFunc: (Seq[V], Option[S]) => Option[S],
       partitioner: Partitioner
     ): DStream[(K, S)] = ssc.withScope {
+    val cleanedUpdateF = sparkContext.clean(updateFunc)
     val newUpdateFunc = (iterator: Iterator[(K, Seq[V], Option[S])]) => {
-      iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s)))
+      iterator.flatMap(t => cleanedUpdateF(t._2, t._3).map(s => (t._1, s)))
     }
     updateStateByKey(newUpdateFunc, partitioner, true)
   }
@@ -428,8 +437,9 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
       partitioner: Partitioner,
       initialRDD: RDD[(K, S)]
     ): DStream[(K, S)] = ssc.withScope {
+    val cleanedUpdateF = sparkContext.clean(updateFunc)
     val newUpdateFunc = (iterator: Iterator[(K, Seq[V], Option[S])]) => {
-      iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s)))
+      iterator.flatMap(t => cleanedUpdateF(t._2, t._3).map(s => (t._1, s)))
     }
     updateStateByKey(newUpdateFunc, partitioner, true, initialRDD)
   }
@@ -463,7 +473,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
    * 'this' DStream without changing the key.
    */
   def mapValues[U: ClassTag](mapValuesFunc: V => U): DStream[(K, U)] = ssc.withScope {
-    new MapValuedDStream[K, V, U](self, mapValuesFunc)
+    new MapValuedDStream[K, V, U](self, sparkContext.clean(mapValuesFunc))
   }
 
   /**
@@ -473,7 +483,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)])
   def flatMapValues[U: ClassTag](
       flatMapValuesFunc: V => TraversableOnce[U]
     ): DStream[(K, U)] = ssc.withScope {
-    new FlatMapValuedDStream[K, V, U](self, flatMapValuesFunc)
+    new FlatMapValuedDStream[K, V, U](self, sparkContext.clean(flatMapValuesFunc))
   }
 
   /**
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala
new file mode 100644
index 0000000000000..6a1dd6949b204
--- /dev/null
+++ b/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming
+
+import java.io.NotSerializableException
+
+import org.scalatest.{BeforeAndAfterAll, FunSuite}
+
+import org.apache.spark.{HashPartitioner, SparkContext, SparkException}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.streaming.dstream.DStream
+import org.apache.spark.util.ReturnStatementInClosureException
+
+/**
+ * Test that closures passed to DStream operations are actually cleaned.
+ */
+class DStreamClosureSuite extends FunSuite with BeforeAndAfterAll {
+  private var ssc: StreamingContext = null
+
+  override def beforeAll(): Unit = {
+    val sc = new SparkContext("local", "test")
+    ssc = new StreamingContext(sc, Seconds(1))
+  }
+
+  override def afterAll(): Unit = {
+    ssc.stop(stopSparkContext = true)
+    ssc = null
+  }
+
+  test("user provided closures are actually cleaned") {
+    val dstream = new DummyInputDStream(ssc)
+    val pairDstream = dstream.map { i => (i, i) }
+    // DStream
+    testMap(dstream)
+    testFlatMap(dstream)
+    testFilter(dstream)
+    testMapPartitions(dstream)
+    testReduce(dstream)
+    testForeach(dstream)
+    testForeachRDD(dstream)
+    testTransform(dstream)
+    testTransformWith(dstream)
+    testReduceByWindow(dstream)
+    // PairDStreamFunctions
+    testReduceByKey(pairDstream)
+    testCombineByKey(pairDstream)
+    testReduceByKeyAndWindow(pairDstream)
+    testUpdateStateByKey(pairDstream)
+    testMapValues(pairDstream)
+    testFlatMapValues(pairDstream)
+    // StreamingContext
+    testTransform2(ssc, dstream)
+  }
+
+  /**
+   * Verify that the expected exception is thrown.
+   *
+   * We use return statements as an indication that a closure is actually being cleaned.
+   * We expect closure cleaner to find the return statements in the user provided closures.
+   */
+  private def expectCorrectException(body: => Unit): Unit = {
+    try {
+      body
+    } catch {
+      case rse: ReturnStatementInClosureException => // Success!
+      case e @ (_: NotSerializableException | _: SparkException) =>
+        throw new TestException(
+          s"Expected ReturnStatementInClosureException, but got $e.\n" +
+          "This means the closure provided by user is not actually cleaned.")
+    }
+  }
+
+  // DStream operations
+  private def testMap(ds: DStream[Int]): Unit = expectCorrectException {
+    ds.map { _ => return; 1 }
+  }
+  private def testFlatMap(ds: DStream[Int]): Unit = expectCorrectException {
+    ds.flatMap { _ => return; Seq.empty }
+  }
+  private def testFilter(ds: DStream[Int]): Unit = expectCorrectException {
+    ds.filter { _ => return; true }
+  }
+  private def testMapPartitions(ds: DStream[Int]): Unit = expectCorrectException {
+    ds.mapPartitions { _ => return; Seq.empty.toIterator }
+  }
+  private def testReduce(ds: DStream[Int]): Unit = expectCorrectException {
+    ds.reduce { case (_, _) => return; 1 }
+  }
+  private def testForeach(ds: DStream[Int]): Unit = {
+    val foreachF1 = (rdd: RDD[Int], t: Time) => return
+    val foreachF2 = (rdd: RDD[Int]) => return
+    expectCorrectException { ds.foreach(foreachF1) }
+    expectCorrectException { ds.foreach(foreachF2) }
+  }
+  private def testForeachRDD(ds: DStream[Int]): Unit = {
+    val foreachRDDF1 = (rdd: RDD[Int], t: Time) => return
+    val foreachRDDF2 = (rdd: RDD[Int]) => return
+    expectCorrectException { ds.foreachRDD(foreachRDDF1) }
+    expectCorrectException { ds.foreachRDD(foreachRDDF2) }
+  }
+  private def testTransform(ds: DStream[Int]): Unit = {
+    val transformF1 = (rdd: RDD[Int]) => { return; rdd }
+    val transformF2 = (rdd: RDD[Int], time: Time) => { return; rdd }
+    expectCorrectException { ds.transform(transformF1) }
+    expectCorrectException { ds.transform(transformF2) }
+  }
+  private def testTransformWith(ds: DStream[Int]): Unit = {
+    val transformF1 = (rdd1: RDD[Int], rdd2: RDD[Int]) => { return; rdd1 }
+    val transformF2 = (rdd1: RDD[Int], rdd2: RDD[Int], time: Time) => { return; rdd2 }
+    expectCorrectException { ds.transformWith(ds, transformF1) }
+    expectCorrectException { ds.transformWith(ds, transformF2) }
+  }
+  private def testReduceByWindow(ds: DStream[Int]): Unit = {
+    val reduceF = (_: Int, _: Int) => { return; 1 }
+    expectCorrectException { ds.reduceByWindow(reduceF, Seconds(1), Seconds(2)) }
+    expectCorrectException { ds.reduceByWindow(reduceF, reduceF, Seconds(1), Seconds(2)) }
+  }
+
+  // PairDStreamFunctions operations
+  private def testReduceByKey(ds: DStream[(Int, Int)]): Unit = {
+    val reduceF = (_: Int, _: Int) => { return; 1 }
+    expectCorrectException { ds.reduceByKey(reduceF) }
+    expectCorrectException { ds.reduceByKey(reduceF, 5) }
+    expectCorrectException { ds.reduceByKey(reduceF, new HashPartitioner(5)) }
+  }
+  private def testCombineByKey(ds: DStream[(Int, Int)]): Unit = {
+    expectCorrectException {
+      ds.combineByKey[Int](
+        { _: Int => return; 1 },
+        { case (_: Int, _: Int) => return; 1 },
+        { case (_: Int, _: Int) => return; 1 },
+        new HashPartitioner(5)
+      )
+    }
+  }
+  private def testReduceByKeyAndWindow(ds: DStream[(Int, Int)]): Unit = {
+    val reduceF = (_: Int, _: Int) => { return; 1 }
+    val filterF = (_: (Int, Int)) => { return; false }
+    expectCorrectException { ds.reduceByKeyAndWindow(reduceF, Seconds(1)) }
+    expectCorrectException { ds.reduceByKeyAndWindow(reduceF, Seconds(1), Seconds(2)) }
+    expectCorrectException { ds.reduceByKeyAndWindow(reduceF, Seconds(1), Seconds(2), 5) }
+    expectCorrectException {
+      ds.reduceByKeyAndWindow(reduceF, Seconds(1), Seconds(2), new HashPartitioner(5))
+    }
+    expectCorrectException { ds.reduceByKeyAndWindow(reduceF, reduceF, Seconds(2)) }
+    expectCorrectException {
+      ds.reduceByKeyAndWindow(
+        reduceF, reduceF, Seconds(2), Seconds(3), new HashPartitioner(5), filterF)
+    }
+  }
+  private def testUpdateStateByKey(ds: DStream[(Int, Int)]): Unit = {
+    val updateF1 = (_: Seq[Int], _: Option[Int]) => { return; Some(1) }
+    val updateF2 = (_: Iterator[(Int, Seq[Int], Option[Int])]) => { return; Seq((1, 1)).toIterator }
+    val initialRDD = ds.ssc.sparkContext.emptyRDD[Int].map { i => (i, i) }
+    expectCorrectException { ds.updateStateByKey(updateF1) }
+    expectCorrectException { ds.updateStateByKey(updateF1, 5) }
+    expectCorrectException { ds.updateStateByKey(updateF1, new HashPartitioner(5)) }
+    expectCorrectException {
+      ds.updateStateByKey(updateF1, new HashPartitioner(5), initialRDD)
+    }
+    expectCorrectException {
+      ds.updateStateByKey(updateF2, new HashPartitioner(5), true)
+    }
+    expectCorrectException {
+      ds.updateStateByKey(updateF2, new HashPartitioner(5), true, initialRDD)
+    }
+  }
+  private def testMapValues(ds: DStream[(Int, Int)]): Unit = expectCorrectException {
+    ds.mapValues { _ => return; 1 }
+  }
+  private def testFlatMapValues(ds: DStream[(Int, Int)]): Unit = expectCorrectException {
+    ds.flatMapValues { _ => return; Seq.empty }
+  }
+
+  // StreamingContext operations
+  private def testTransform2(ssc: StreamingContext, ds: DStream[Int]): Unit = {
+    val transformF = (rdds: Seq[RDD[_]], time: Time) => { return; ssc.sparkContext.emptyRDD[Int] }
+    expectCorrectException { ssc.transform(Seq(ds), transformF) }
+  }
+
+}
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala
index 392933102097e..e3fb2ef130859 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala
@@ -20,8 +20,8 @@ package org.apache.spark.streaming
 import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
 
 import org.apache.spark.SparkContext
-import org.apache.spark.rdd.{RDD, RDDOperationScope}
-import org.apache.spark.streaming.dstream.{DStream, InputDStream}
+import org.apache.spark.rdd.RDDOperationScope
+import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.ui.UIUtils
 
 /**
@@ -170,21 +170,3 @@ class DStreamScopeSuite extends FunSuite with BeforeAndAfter with BeforeAndAfter
   }
 
 }
-
-/**
- * A dummy stream that does absolutely nothing.
- */
-private class DummyDStream(ssc: StreamingContext) extends DStream[Int](ssc) {
-  override def dependencies: List[DStream[Int]] = List.empty
-  override def slideDuration: Duration = Seconds(1)
-  override def compute(time: Time): Option[RDD[Int]] = Some(ssc.sc.emptyRDD[Int])
-}
-
-/**
- * A dummy input stream that does absolutely nothing.
- */
-private class DummyInputDStream(ssc: StreamingContext) extends InputDStream[Int](ssc) {
-  override def start(): Unit = { }
-  override def stop(): Unit = { }
-  override def compute(time: Time): Option[RDD[Int]] = Some(ssc.sc.emptyRDD[Int])
-}
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
index 4f70ae7f1f187..554cd30223f44 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
@@ -35,6 +35,24 @@ import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream
 import org.apache.spark.streaming.scheduler._
 import org.apache.spark.util.{ManualClock, Utils}
 
+/**
+ * A dummy stream that does absolutely nothing.
+ */
+private[streaming] class DummyDStream(ssc: StreamingContext) extends DStream[Int](ssc) {
+  override def dependencies: List[DStream[Int]] = List.empty
+  override def slideDuration: Duration = Seconds(1)
+  override def compute(time: Time): Option[RDD[Int]] = Some(ssc.sc.emptyRDD[Int])
+}
+
+/**
+ * A dummy input stream that does absolutely nothing.
+ */
+private[streaming] class DummyInputDStream(ssc: StreamingContext) extends InputDStream[Int](ssc) {
+  override def start(): Unit = { }
+  override def stop(): Unit = { }
+  override def compute(time: Time): Option[RDD[Int]] = Some(ssc.sc.emptyRDD[Int])
+}
+
 /**
  * This is a input stream just for the testsuites. This is equivalent to a checkpointable,
  * replayable, reliable message queue like Kafka. It requires a sequence as input, and

From 3c434cbfd0d6821e5bcf572be792b787a514018b Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Wed, 20 May 2015 16:21:23 -0700
Subject: [PATCH 094/525] [SPARK-7767] [STREAMING] Added test for checkpoint
 serialization in StreamingContext.start()

Currently, the background checkpointing thread fails silently if the checkpoint is not serializable. It is hard to debug and therefore its best to fail fast at `start()` when checkpointing is enabled and the checkpoint is not serializable.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6292 from tdas/SPARK-7767 and squashes the following commits:

51304e6 [Tathagata Das] Addressed comments.
c35237b [Tathagata Das] Added test for checkpoint serialization in StreamingContext.start()
---
 .../serializer/SerializationDebugger.scala    |  2 +-
 .../apache/spark/streaming/Checkpoint.scala   | 70 +++++++++++--------
 .../spark/streaming/StreamingContext.scala    | 26 ++++++-
 .../streaming/StreamingContextSuite.scala     | 27 +++++--
 4 files changed, 89 insertions(+), 36 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala
index 5abfa467c0ec8..bb5db545531d2 100644
--- a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala
@@ -27,7 +27,7 @@ import scala.util.control.NonFatal
 
 import org.apache.spark.Logging
 
-private[serializer] object SerializationDebugger extends Logging {
+private[spark] object SerializationDebugger extends Logging {
 
   /**
    * Improve the given NotSerializableException with the serialization path leading from the given
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
index 7bfae253c3a0c..d8dc4e4101664 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
@@ -102,6 +102,44 @@ object Checkpoint extends Logging {
       Seq.empty
     }
   }
+
+  /** Serialize the checkpoint, or throw any exception that occurs */
+  def serialize(checkpoint: Checkpoint, conf: SparkConf): Array[Byte] = {
+    val compressionCodec = CompressionCodec.createCodec(conf)
+    val bos = new ByteArrayOutputStream()
+    val zos = compressionCodec.compressedOutputStream(bos)
+    val oos = new ObjectOutputStream(zos)
+    Utils.tryWithSafeFinally {
+      oos.writeObject(checkpoint)
+    } {
+      oos.close()
+    }
+    bos.toByteArray
+  }
+
+  /** Deserialize a checkpoint from the input stream, or throw any exception that occurs */
+  def deserialize(inputStream: InputStream, conf: SparkConf): Checkpoint = {
+    val compressionCodec = CompressionCodec.createCodec(conf)
+    var ois: ObjectInputStreamWithLoader = null
+    Utils.tryWithSafeFinally {
+
+      // ObjectInputStream uses the last defined user-defined class loader in the stack
+      // to find classes, which maybe the wrong class loader. Hence, a inherited version
+      // of ObjectInputStream is used to explicitly use the current thread's default class
+      // loader to find and load classes. This is a well know Java issue and has popped up
+      // in other places (e.g., http://jira.codehaus.org/browse/GROOVY-1627)
+      val zis = compressionCodec.compressedInputStream(inputStream)
+      ois = new ObjectInputStreamWithLoader(zis,
+        Thread.currentThread().getContextClassLoader)
+      val cp = ois.readObject.asInstanceOf[Checkpoint]
+      cp.validate()
+      cp
+    } {
+      if (ois != null) {
+        ois.close()
+      }
+    }
+  }
 }
 
 
@@ -189,17 +227,10 @@ class CheckpointWriter(
   }
 
   def write(checkpoint: Checkpoint, clearCheckpointDataLater: Boolean) {
-    val bos = new ByteArrayOutputStream()
-    val zos = compressionCodec.compressedOutputStream(bos)
-    val oos = new ObjectOutputStream(zos)
-    Utils.tryWithSafeFinally {
-      oos.writeObject(checkpoint)
-    } {
-      oos.close()
-    }
     try {
+      val bytes = Checkpoint.serialize(checkpoint, conf)
       executor.execute(new CheckpointWriteHandler(
-        checkpoint.checkpointTime, bos.toByteArray, clearCheckpointDataLater))
+        checkpoint.checkpointTime, bytes, clearCheckpointDataLater))
       logDebug("Submitted checkpoint of time " + checkpoint.checkpointTime + " writer queue")
     } catch {
       case rej: RejectedExecutionException =>
@@ -264,25 +295,8 @@ object CheckpointReader extends Logging {
     checkpointFiles.foreach(file => {
       logInfo("Attempting to load checkpoint from file " + file)
       try {
-        var ois: ObjectInputStreamWithLoader = null
-        var cp: Checkpoint = null
-        Utils.tryWithSafeFinally {
-          val fis = fs.open(file)
-          // ObjectInputStream uses the last defined user-defined class loader in the stack
-          // to find classes, which maybe the wrong class loader. Hence, a inherited version
-          // of ObjectInputStream is used to explicitly use the current thread's default class
-          // loader to find and load classes. This is a well know Java issue and has popped up
-          // in other places (e.g., http://jira.codehaus.org/browse/GROOVY-1627)
-          val zis = compressionCodec.compressedInputStream(fis)
-          ois = new ObjectInputStreamWithLoader(zis,
-            Thread.currentThread().getContextClassLoader)
-          cp = ois.readObject.asInstanceOf[Checkpoint]
-        } {
-          if (ois != null) {
-            ois.close()
-          }
-        }
-        cp.validate()
+        val fis = fs.open(file)
+        val cp = Checkpoint.deserialize(fis, conf)
         logInfo("Checkpoint successfully loaded from file " + file)
         logInfo("Checkpoint was generated at time " + cp.checkpointTime)
         return Some(cp)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index fe614c4be590f..95063692e1146 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.streaming
 
-import java.io.InputStream
+import java.io.{InputStream, NotSerializableException}
 import java.util.concurrent.atomic.{AtomicInteger, AtomicReference}
 
 import scala.collection.Map
@@ -35,6 +35,7 @@ import org.apache.spark._
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.input.FixedLengthBinaryInputFormat
 import org.apache.spark.rdd.{RDD, RDDOperationScope}
+import org.apache.spark.serializer.SerializationDebugger
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.StreamingContextState._
 import org.apache.spark.streaming.dstream._
@@ -235,6 +236,10 @@ class StreamingContext private[streaming] (
     }
   }
 
+  private[streaming] def isCheckpointingEnabled: Boolean = {
+    checkpointDir != null
+  }
+
   private[streaming] def initialCheckpoint: Checkpoint = {
     if (isCheckpointPresent) cp_ else null
   }
@@ -523,11 +528,26 @@ class StreamingContext private[streaming] (
     assert(graph != null, "Graph is null")
     graph.validate()
 
-    assert(
-      checkpointDir == null || checkpointDuration != null,
+    require(
+      !isCheckpointingEnabled || checkpointDuration != null,
       "Checkpoint directory has been set, but the graph checkpointing interval has " +
         "not been set. Please use StreamingContext.checkpoint() to set the interval."
     )
+
+    // Verify whether the DStream checkpoint is serializable
+    if (isCheckpointingEnabled) {
+      val checkpoint = new Checkpoint(this, Time.apply(0))
+      try {
+        Checkpoint.serialize(checkpoint, conf)
+      } catch {
+        case e: NotSerializableException =>
+          throw new NotSerializableException(
+            "DStream checkpointing has been enabled but the DStreams with their functions " +
+              "are not serializable\nSerialization stack:\n" +
+              SerializationDebugger.find(checkpoint).map("\t- " + _).mkString("\n")
+          )
+      }
+    }
   }
 
   /**
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index 4b12affbb0ddd..3a958bf3a3c19 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -17,21 +17,21 @@
 
 package org.apache.spark.streaming
 
-import java.io.File
+import java.io.{File, NotSerializableException}
 import java.util.concurrent.atomic.AtomicInteger
 
 import org.apache.commons.io.FileUtils
-import org.scalatest.{Assertions, BeforeAndAfter, FunSuite}
-import org.scalatest.concurrent.Timeouts
 import org.scalatest.concurrent.Eventually._
+import org.scalatest.concurrent.Timeouts
 import org.scalatest.exceptions.TestFailedDueToTimeoutException
 import org.scalatest.time.SpanSugar._
+import org.scalatest.{Assertions, BeforeAndAfter, FunSuite}
 
-import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.receiver.Receiver
 import org.apache.spark.util.Utils
+import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException}
 
 
 class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts with Logging {
@@ -132,6 +132,25 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w
     }
   }
 
+  test("start with non-seriazable DStream checkpoints") {
+    val checkpointDir = Utils.createTempDir()
+    ssc = new StreamingContext(conf, batchDuration)
+    ssc.checkpoint(checkpointDir.getAbsolutePath)
+    addInputStream(ssc).foreachRDD { rdd =>
+      // Refer to this.appName from inside closure so that this closure refers to
+      // the instance of StreamingContextSuite, and is therefore not serializable
+      rdd.count() + appName
+    }
+
+    // Test whether start() fails early when checkpointing is enabled
+    val exception = intercept[NotSerializableException] {
+      ssc.start()
+    }
+    assert(exception.getMessage().contains("DStreams with their functions are not serializable"))
+    assert(ssc.getState() !== StreamingContextState.ACTIVE)
+    assert(StreamingContext.getActive().isEmpty)
+  }
+
   test("start multiple times") {
     ssc = new StreamingContext(master, appName, batchDuration)
     addInputStream(ssc).register()

From 7956dd7ab03e1542d89dd94c043f1e5131684199 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Wed, 20 May 2015 16:37:11 -0700
Subject: [PATCH 095/525] [SPARK-7698] Cache and reuse buffers in
 ExecutorMemoryAllocator when using heap allocation

When on-heap memory allocation is used, ExecutorMemoryManager should maintain a cache / pool of buffers for re-use by tasks. This will significantly improve the performance of the new Tungsten's sort-shuffle for jobs with many short-lived tasks by eliminating a major source of GC.

This pull request is a minimum-viable-implementation of this idea.  In its current form, this patch significantly improves performance on a stress test which launches huge numbers of short-lived shuffle map tasks back-to-back in the same JVM.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6227 from JoshRosen/SPARK-7698 and squashes the following commits:

fd6cb55 [Josh Rosen] SoftReference -> WeakReference
b154e86 [Josh Rosen] WIP sketch of pooling in ExecutorMemoryManager
---
 .../unsafe/memory/ExecutorMemoryManager.java  | 57 ++++++++++++++++++-
 1 file changed, 55 insertions(+), 2 deletions(-)

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/ExecutorMemoryManager.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/ExecutorMemoryManager.java
index 62c29c8cc1e4d..cbbe8594627a5 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/ExecutorMemoryManager.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/ExecutorMemoryManager.java
@@ -17,6 +17,12 @@
 
 package org.apache.spark.unsafe.memory;
 
+import java.lang.ref.WeakReference;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+import javax.annotation.concurrent.GuardedBy;
+
 /**
  * Manages memory for an executor. Individual operators / tasks allocate memory through
  * {@link TaskMemoryManager} objects, which obtain their memory from ExecutorMemoryManager.
@@ -33,6 +39,12 @@ public class ExecutorMemoryManager {
    */
   final boolean inHeap;
 
+  @GuardedBy("this")
+  private final Map<Long, LinkedList<WeakReference<MemoryBlock>>> bufferPoolsBySize =
+    new HashMap<Long, LinkedList<WeakReference<MemoryBlock>>>();
+
+  private static final int POOLING_THRESHOLD_BYTES = 1024 * 1024;
+
   /**
    * Construct a new ExecutorMemoryManager.
    *
@@ -43,16 +55,57 @@ public ExecutorMemoryManager(MemoryAllocator allocator) {
     this.allocator = allocator;
   }
 
+  /**
+   * Returns true if allocations of the given size should go through the pooling mechanism and
+   * false otherwise.
+   */
+  private boolean shouldPool(long size) {
+    // Very small allocations are less likely to benefit from pooling.
+    // At some point, we should explore supporting pooling for off-heap memory, but for now we'll
+    // ignore that case in the interest of simplicity.
+    return size >= POOLING_THRESHOLD_BYTES && allocator instanceof HeapMemoryAllocator;
+  }
+
   /**
    * Allocates a contiguous block of memory. Note that the allocated memory is not guaranteed
    * to be zeroed out (call `zero()` on the result if this is necessary).
    */
   MemoryBlock allocate(long size) throws OutOfMemoryError {
-    return allocator.allocate(size);
+    if (shouldPool(size)) {
+      synchronized (this) {
+        final LinkedList<WeakReference<MemoryBlock>> pool = bufferPoolsBySize.get(size);
+        if (pool != null) {
+          while (!pool.isEmpty()) {
+            final WeakReference<MemoryBlock> blockReference = pool.pop();
+            final MemoryBlock memory = blockReference.get();
+            if (memory != null) {
+              assert (memory.size() == size);
+              return memory;
+            }
+          }
+          bufferPoolsBySize.remove(size);
+        }
+      }
+      return allocator.allocate(size);
+    } else {
+      return allocator.allocate(size);
+    }
   }
 
   void free(MemoryBlock memory) {
-    allocator.free(memory);
+    final long size = memory.size();
+    if (shouldPool(size)) {
+      synchronized (this) {
+        LinkedList<WeakReference<MemoryBlock>> pool = bufferPoolsBySize.get(size);
+        if (pool == null) {
+          pool = new LinkedList<WeakReference<MemoryBlock>>();
+          bufferPoolsBySize.put(size, pool);
+        }
+        pool.add(new WeakReference<MemoryBlock>(memory));
+      }
+    } else {
+      allocator.free(memory);
+    }
   }
 
 }

From f2faa7af30662e3bdf15780f8719c71108f8e30b Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Wed, 20 May 2015 16:42:49 -0700
Subject: [PATCH 096/525] [SPARK-7251] Perform sequential scan when iterating
 over BytesToBytesMap

This patch modifies `BytesToBytesMap.iterator()` to iterate through records in the order that they appear in the data pages rather than iterating through the hashtable pointer arrays. This results in fewer random memory accesses, significantly improving performance for scan-and-copy operations.

This is possible because our data pages are laid out as sequences of `[keyLength][data][valueLength][data]` entries.  In order to mark the end of a partially-filled data page, we write `-1` as a special end-of-page length (BytesToByesMap supports empty/zero-length keys and values, which is why we had to use a negative length).

This patch incorporates / closes #5836.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6159 from JoshRosen/SPARK-7251 and squashes the following commits:

05bd90a [Josh Rosen] Compare capacity, not size, to MAX_CAPACITY
2a20d71 [Josh Rosen] Fix maximum BytesToBytesMap capacity
bc4854b [Josh Rosen] Guard against overflow when growing BytesToBytesMap
f5feadf [Josh Rosen] Add test for iterating over an empty map
273b842 [Josh Rosen] [SPARK-7251] Perform sequential scan when iterating over entries in BytesToBytesMap
---
 unsafe/pom.xml                                |   5 +
 .../spark/unsafe/map/BytesToBytesMap.java     | 151 ++++++++++++----
 .../unsafe/map/HashMapGrowthStrategy.java     |   4 +-
 .../unsafe/memory/TaskMemoryManager.java      |   2 +-
 .../map/AbstractBytesToBytesMapSuite.java     | 165 ++++++++++++++++--
 5 files changed, 274 insertions(+), 53 deletions(-)

diff --git a/unsafe/pom.xml b/unsafe/pom.xml
index 9e151fc7a9141..2fd17267ac427 100644
--- a/unsafe/pom.xml
+++ b/unsafe/pom.xml
@@ -65,6 +65,11 @@
       <artifactId>junit-interface</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-all</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index 19d6a169fd2ad..bd4ca74cc7764 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -23,6 +23,8 @@
 import java.util.LinkedList;
 import java.util.List;
 
+import com.google.common.annotations.VisibleForTesting;
+
 import org.apache.spark.unsafe.*;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.array.LongArray;
@@ -36,9 +38,8 @@
  * This is backed by a power-of-2-sized hash table, using quadratic probing with triangular numbers,
  * which is guaranteed to exhaust the space.
  * <p>
- * The map can support up to 2^31 keys because we use 32 bit MurmurHash. If the key cardinality is
- * higher than this, you should probably be using sorting instead of hashing for better cache
- * locality.
+ * The map can support up to 2^29 keys. If the key cardinality is higher than this, you should
+ * probably be using sorting instead of hashing for better cache locality.
  * <p>
  * This class is not thread safe.
  */
@@ -48,6 +49,11 @@ public final class BytesToBytesMap {
 
   private static final HashMapGrowthStrategy growthStrategy = HashMapGrowthStrategy.DOUBLING;
 
+  /**
+   * Special record length that is placed after the last record in a data page.
+   */
+  private static final int END_OF_PAGE_MARKER = -1;
+
   private final TaskMemoryManager memoryManager;
 
   /**
@@ -64,7 +70,7 @@ public final class BytesToBytesMap {
 
   /**
    * Offset into `currentDataPage` that points to the location where new data can be inserted into
-   * the page.
+   * the page. This does not incorporate the page's base offset.
    */
   private long pageCursor = 0;
 
@@ -74,6 +80,15 @@ public final class BytesToBytesMap {
    */
   private static final long PAGE_SIZE_BYTES = 1L << 26; // 64 megabytes
 
+  /**
+   * The maximum number of keys that BytesToBytesMap supports. The hash table has to be
+   * power-of-2-sized and its backing Java array can contain at most (1 << 30) elements, since
+   * that's the largest power-of-2 that's less than Integer.MAX_VALUE. We need two long array
+   * entries per key, giving us a maximum capacity of (1 << 29).
+   */
+  @VisibleForTesting
+  static final int MAX_CAPACITY = (1 << 29);
+
   // This choice of page table size and page size means that we can address up to 500 gigabytes
   // of memory.
 
@@ -143,6 +158,13 @@ public BytesToBytesMap(
     this.loadFactor = loadFactor;
     this.loc = new Location();
     this.enablePerfMetrics = enablePerfMetrics;
+    if (initialCapacity <= 0) {
+      throw new IllegalArgumentException("Initial capacity must be greater than 0");
+    }
+    if (initialCapacity > MAX_CAPACITY) {
+      throw new IllegalArgumentException(
+        "Initial capacity " + initialCapacity + " exceeds maximum capacity of " + MAX_CAPACITY);
+    }
     allocate(initialCapacity);
   }
 
@@ -162,6 +184,55 @@ public BytesToBytesMap(
    */
   public int size() { return size; }
 
+  private static final class BytesToBytesMapIterator implements Iterator<Location> {
+
+    private final int numRecords;
+    private final Iterator<MemoryBlock> dataPagesIterator;
+    private final Location loc;
+
+    private int currentRecordNumber = 0;
+    private Object pageBaseObject;
+    private long offsetInPage;
+
+    BytesToBytesMapIterator(int numRecords, Iterator<MemoryBlock> dataPagesIterator, Location loc) {
+      this.numRecords = numRecords;
+      this.dataPagesIterator = dataPagesIterator;
+      this.loc = loc;
+      if (dataPagesIterator.hasNext()) {
+        advanceToNextPage();
+      }
+    }
+
+    private void advanceToNextPage() {
+      final MemoryBlock currentPage = dataPagesIterator.next();
+      pageBaseObject = currentPage.getBaseObject();
+      offsetInPage = currentPage.getBaseOffset();
+    }
+
+    @Override
+    public boolean hasNext() {
+      return currentRecordNumber != numRecords;
+    }
+
+    @Override
+    public Location next() {
+      int keyLength = (int) PlatformDependent.UNSAFE.getLong(pageBaseObject, offsetInPage);
+      if (keyLength == END_OF_PAGE_MARKER) {
+        advanceToNextPage();
+        keyLength = (int) PlatformDependent.UNSAFE.getLong(pageBaseObject, offsetInPage);
+      }
+      loc.with(pageBaseObject, offsetInPage);
+      offsetInPage += 8 + 8 + keyLength + loc.getValueLength();
+      currentRecordNumber++;
+      return loc;
+    }
+
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
   /**
    * Returns an iterator for iterating over the entries of this map.
    *
@@ -171,27 +242,7 @@ public BytesToBytesMap(
    * `lookup()`, the behavior of the returned iterator is undefined.
    */
   public Iterator<Location> iterator() {
-    return new Iterator<Location>() {
-
-      private int nextPos = bitset.nextSetBit(0);
-
-      @Override
-      public boolean hasNext() {
-        return nextPos != -1;
-      }
-
-      @Override
-      public Location next() {
-        final int pos = nextPos;
-        nextPos = bitset.nextSetBit(nextPos + 1);
-        return loc.with(pos, 0, true);
-      }
-
-      @Override
-      public void remove() {
-        throw new UnsupportedOperationException();
-      }
-    };
+    return new BytesToBytesMapIterator(size, dataPages.iterator(), loc);
   }
 
   /**
@@ -268,8 +319,11 @@ public final class Location {
     private int valueLength;
 
     private void updateAddressesAndSizes(long fullKeyAddress) {
-        final Object page = memoryManager.getPage(fullKeyAddress);
-        final long keyOffsetInPage = memoryManager.getOffsetInPage(fullKeyAddress);
+      updateAddressesAndSizes(
+        memoryManager.getPage(fullKeyAddress), memoryManager.getOffsetInPage(fullKeyAddress));
+    }
+
+    private void updateAddressesAndSizes(Object page, long keyOffsetInPage) {
         long position = keyOffsetInPage;
         keyLength = (int) PlatformDependent.UNSAFE.getLong(page, position);
         position += 8; // word used to store the key size
@@ -291,6 +345,12 @@ Location with(int pos, int keyHashcode, boolean isDefined) {
       return this;
     }
 
+    Location with(Object page, long keyOffsetInPage) {
+      this.isDefined = true;
+      updateAddressesAndSizes(page, keyOffsetInPage);
+      return this;
+    }
+
     /**
      * Returns true if the key is defined at this position, and false otherwise.
      */
@@ -345,6 +405,8 @@ public int getValueLength() {
      * <p>
      * It is only valid to call this method immediately after calling `lookup()` using the same key.
      * <p>
+     * The key and value must be word-aligned (that is, their sizes must multiples of 8).
+     * <p>
      * After calling this method, calls to `get[Key|Value]Address()` and `get[Key|Value]Length`
      * will return information on the data stored by this `putNewKey` call.
      * <p>
@@ -370,17 +432,27 @@ public void putNewKey(
       isDefined = true;
       assert (keyLengthBytes % 8 == 0);
       assert (valueLengthBytes % 8 == 0);
+      if (size == MAX_CAPACITY) {
+        throw new IllegalStateException("BytesToBytesMap has reached maximum capacity");
+      }
       // Here, we'll copy the data into our data pages. Because we only store a relative offset from
       // the key address instead of storing the absolute address of the value, the key and value
       // must be stored in the same memory page.
       // (8 byte key length) (key) (8 byte value length) (value)
       final long requiredSize = 8 + keyLengthBytes + 8 + valueLengthBytes;
-      assert(requiredSize <= PAGE_SIZE_BYTES);
+      assert (requiredSize <= PAGE_SIZE_BYTES - 8); // Reserve 8 bytes for the end-of-page marker.
       size++;
       bitset.set(pos);
 
-      // If there's not enough space in the current page, allocate a new page:
-      if (currentDataPage == null || PAGE_SIZE_BYTES - pageCursor < requiredSize) {
+      // If there's not enough space in the current page, allocate a new page (8 bytes are reserved
+      // for the end-of-page marker).
+      if (currentDataPage == null || PAGE_SIZE_BYTES - 8 - pageCursor < requiredSize) {
+        if (currentDataPage != null) {
+          // There wasn't enough space in the current page, so write an end-of-page marker:
+          final Object pageBaseObject = currentDataPage.getBaseObject();
+          final long lengthOffsetInPage = currentDataPage.getBaseOffset() + pageCursor;
+          PlatformDependent.UNSAFE.putLong(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER);
+        }
         MemoryBlock newPage = memoryManager.allocatePage(PAGE_SIZE_BYTES);
         dataPages.add(newPage);
         pageCursor = 0;
@@ -414,7 +486,7 @@ public void putNewKey(
       longArray.set(pos * 2 + 1, keyHashcode);
       updateAddressesAndSizes(storedKeyAddress);
       isDefined = true;
-      if (size > growthThreshold) {
+      if (size > growthThreshold && longArray.size() < MAX_CAPACITY) {
         growAndRehash();
       }
     }
@@ -427,8 +499,11 @@ public void putNewKey(
    * @param capacity the new map capacity
    */
   private void allocate(int capacity) {
-    capacity = Math.max((int) Math.min(Integer.MAX_VALUE, nextPowerOf2(capacity)), 64);
-    longArray = new LongArray(memoryManager.allocate(capacity * 8 * 2));
+    assert (capacity >= 0);
+    // The capacity needs to be divisible by 64 so that our bit set can be sized properly
+    capacity = Math.max((int) Math.min(MAX_CAPACITY, nextPowerOf2(capacity)), 64);
+    assert (capacity <= MAX_CAPACITY);
+    longArray = new LongArray(memoryManager.allocate(capacity * 8L * 2));
     bitset = new BitSet(MemoryBlock.fromLongArray(new long[capacity / 64]));
 
     this.growthThreshold = (int) (capacity * loadFactor);
@@ -494,10 +569,16 @@ public long getNumHashCollisions() {
     return numHashCollisions;
   }
 
+  @VisibleForTesting
+  int getNumDataPages() {
+    return dataPages.size();
+  }
+
   /**
    * Grows the size of the hash table and re-hash everything.
    */
-  private void growAndRehash() {
+  @VisibleForTesting
+  void growAndRehash() {
     long resizeStartTime = -1;
     if (enablePerfMetrics) {
       resizeStartTime = System.nanoTime();
@@ -508,7 +589,7 @@ private void growAndRehash() {
     final int oldCapacity = (int) oldBitSet.capacity();
 
     // Allocate the new data structures
-    allocate(Math.min(Integer.MAX_VALUE, growthStrategy.nextCapacity(oldCapacity)));
+    allocate(Math.min(growthStrategy.nextCapacity(oldCapacity), MAX_CAPACITY));
 
     // Re-mask (we don't recompute the hashcode because we stored all 32 bits of it)
     for (int pos = oldBitSet.nextSetBit(0); pos >= 0; pos = oldBitSet.nextSetBit(pos + 1)) {
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java b/unsafe/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java
index 7c321baffe82d..20654e4eeaa02 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java
@@ -32,7 +32,9 @@ public interface HashMapGrowthStrategy {
   class Doubling implements HashMapGrowthStrategy {
     @Override
     public int nextCapacity(int currentCapacity) {
-      return currentCapacity * 2;
+      assert (currentCapacity > 0);
+      // Guard against overflow
+      return (currentCapacity * 2 > 0) ? (currentCapacity * 2) : Integer.MAX_VALUE;
     }
   }
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
index 2906ac8abad1a..10881969dbc78 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
@@ -44,7 +44,7 @@
  * maximum size of a long[] array, allowing us to address 8192 * 2^32 * 8 bytes, which is
  * approximately 35 terabytes of memory.
  */
-public final class TaskMemoryManager {
+public class TaskMemoryManager {
 
   private final Logger logger = LoggerFactory.getLogger(TaskMemoryManager.class);
 
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
index 7a5c0622d1ffb..81315f7c94645 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
@@ -25,24 +25,40 @@
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
+import static org.mockito.AdditionalMatchers.geq;
+import static org.mockito.Mockito.*;
 
 import org.apache.spark.unsafe.array.ByteArrayMethods;
+import org.apache.spark.unsafe.memory.*;
 import org.apache.spark.unsafe.PlatformDependent;
 import static org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET;
-import org.apache.spark.unsafe.memory.ExecutorMemoryManager;
-import org.apache.spark.unsafe.memory.MemoryAllocator;
-import org.apache.spark.unsafe.memory.MemoryLocation;
-import org.apache.spark.unsafe.memory.TaskMemoryManager;
+import static org.apache.spark.unsafe.PlatformDependent.LONG_ARRAY_OFFSET;
+
 
 public abstract class AbstractBytesToBytesMapSuite {
 
   private final Random rand = new Random(42);
 
   private TaskMemoryManager memoryManager;
+  private TaskMemoryManager sizeLimitedMemoryManager;
 
   @Before
   public void setup() {
     memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(getMemoryAllocator()));
+    // Mocked memory manager for tests that check the maximum array size, since actually allocating
+    // such large arrays will cause us to run out of memory in our tests.
+    sizeLimitedMemoryManager = spy(memoryManager);
+    when(sizeLimitedMemoryManager.allocate(geq(1L << 20))).thenAnswer(new Answer<MemoryBlock>() {
+      @Override
+      public MemoryBlock answer(InvocationOnMock invocation) throws Throwable {
+        if (((Long) invocation.getArguments()[0] / 8) > Integer.MAX_VALUE) {
+          throw new OutOfMemoryError("Requested array size exceeds VM limit");
+        }
+        return memoryManager.allocate(1L << 20);
+      }
+    });
   }
 
   @After
@@ -101,6 +117,7 @@ public void emptyMap() {
       final int keyLengthInBytes = keyLengthInWords * 8;
       final byte[] key = getRandomByteArray(keyLengthInWords);
       Assert.assertFalse(map.lookup(key, BYTE_ARRAY_OFFSET, keyLengthInBytes).isDefined());
+      Assert.assertFalse(map.iterator().hasNext());
     } finally {
       map.free();
     }
@@ -159,7 +176,7 @@ public void setAndRetrieveAKey() {
 
   @Test
   public void iteratorTest() throws Exception {
-    final int size = 128;
+    final int size = 4096;
     BytesToBytesMap map = new BytesToBytesMap(memoryManager, size / 2);
     try {
       for (long i = 0; i < size; i++) {
@@ -167,14 +184,26 @@ public void iteratorTest() throws Exception {
         final BytesToBytesMap.Location loc =
           map.lookup(value, PlatformDependent.LONG_ARRAY_OFFSET, 8);
         Assert.assertFalse(loc.isDefined());
-        loc.putNewKey(
-          value,
-          PlatformDependent.LONG_ARRAY_OFFSET,
-          8,
-          value,
-          PlatformDependent.LONG_ARRAY_OFFSET,
-          8
-        );
+        // Ensure that we store some zero-length keys
+        if (i % 5 == 0) {
+          loc.putNewKey(
+            null,
+            PlatformDependent.LONG_ARRAY_OFFSET,
+            0,
+            value,
+            PlatformDependent.LONG_ARRAY_OFFSET,
+            8
+          );
+        } else {
+          loc.putNewKey(
+            value,
+            PlatformDependent.LONG_ARRAY_OFFSET,
+            8,
+            value,
+            PlatformDependent.LONG_ARRAY_OFFSET,
+            8
+          );
+        }
       }
       final java.util.BitSet valuesSeen = new java.util.BitSet(size);
       final Iterator<BytesToBytesMap.Location> iter = map.iterator();
@@ -183,11 +212,16 @@ public void iteratorTest() throws Exception {
         Assert.assertTrue(loc.isDefined());
         final MemoryLocation keyAddress = loc.getKeyAddress();
         final MemoryLocation valueAddress = loc.getValueAddress();
-        final long key =  PlatformDependent.UNSAFE.getLong(
-          keyAddress.getBaseObject(), keyAddress.getBaseOffset());
         final long value = PlatformDependent.UNSAFE.getLong(
           valueAddress.getBaseObject(), valueAddress.getBaseOffset());
-        Assert.assertEquals(key, value);
+        final long keyLength = loc.getKeyLength();
+        if (keyLength == 0) {
+          Assert.assertTrue("value " + value + " was not divisible by 5", value % 5 == 0);
+        } else {
+        final long key = PlatformDependent.UNSAFE.getLong(
+          keyAddress.getBaseObject(), keyAddress.getBaseOffset());
+          Assert.assertEquals(value, key);
+        }
         valuesSeen.set((int) value);
       }
       Assert.assertEquals(size, valuesSeen.cardinality());
@@ -196,6 +230,74 @@ public void iteratorTest() throws Exception {
     }
   }
 
+  @Test
+  public void iteratingOverDataPagesWithWastedSpace() throws Exception {
+    final int NUM_ENTRIES = 1000 * 1000;
+    final int KEY_LENGTH = 16;
+    final int VALUE_LENGTH = 40;
+    final BytesToBytesMap map = new BytesToBytesMap(memoryManager, NUM_ENTRIES);
+    // Each record will take 8 + 8 + 16 + 40 = 72 bytes of space in the data page. Our 64-megabyte
+    // pages won't be evenly-divisible by records of this size, which will cause us to waste some
+    // space at the end of the page. This is necessary in order for us to take the end-of-record
+    // handling branch in iterator().
+    try {
+      for (int i = 0; i < NUM_ENTRIES; i++) {
+        final long[] key = new long[] { i, i };  // 2 * 8 = 16 bytes
+        final long[] value = new long[] { i, i, i, i, i }; // 5 * 8 = 40 bytes
+        final BytesToBytesMap.Location loc = map.lookup(
+          key,
+          LONG_ARRAY_OFFSET,
+          KEY_LENGTH
+        );
+        Assert.assertFalse(loc.isDefined());
+        loc.putNewKey(
+          key,
+          LONG_ARRAY_OFFSET,
+          KEY_LENGTH,
+          value,
+          LONG_ARRAY_OFFSET,
+          VALUE_LENGTH
+        );
+      }
+      Assert.assertEquals(2, map.getNumDataPages());
+
+      final java.util.BitSet valuesSeen = new java.util.BitSet(NUM_ENTRIES);
+      final Iterator<BytesToBytesMap.Location> iter = map.iterator();
+      final long key[] = new long[KEY_LENGTH / 8];
+      final long value[] = new long[VALUE_LENGTH / 8];
+      while (iter.hasNext()) {
+        final BytesToBytesMap.Location loc = iter.next();
+        Assert.assertTrue(loc.isDefined());
+        Assert.assertEquals(KEY_LENGTH, loc.getKeyLength());
+        Assert.assertEquals(VALUE_LENGTH, loc.getValueLength());
+        PlatformDependent.copyMemory(
+          loc.getKeyAddress().getBaseObject(),
+          loc.getKeyAddress().getBaseOffset(),
+          key,
+          LONG_ARRAY_OFFSET,
+          KEY_LENGTH
+        );
+        PlatformDependent.copyMemory(
+          loc.getValueAddress().getBaseObject(),
+          loc.getValueAddress().getBaseOffset(),
+          value,
+          LONG_ARRAY_OFFSET,
+          VALUE_LENGTH
+        );
+        for (long j : key) {
+          Assert.assertEquals(key[0], j);
+        }
+        for (long j : value) {
+          Assert.assertEquals(key[0], j);
+        }
+        valuesSeen.set((int) key[0]);
+      }
+      Assert.assertEquals(NUM_ENTRIES, valuesSeen.cardinality());
+    } finally {
+      map.free();
+    }
+  }
+
   @Test
   public void randomizedStressTest() {
     final int size = 65536;
@@ -247,4 +349,35 @@ public void randomizedStressTest() {
       map.free();
     }
   }
+
+  @Test
+  public void initialCapacityBoundsChecking() {
+    try {
+      new BytesToBytesMap(sizeLimitedMemoryManager, 0);
+      Assert.fail("Expected IllegalArgumentException to be thrown");
+    } catch (IllegalArgumentException e) {
+      // expected exception
+    }
+
+    try {
+      new BytesToBytesMap(sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY + 1);
+      Assert.fail("Expected IllegalArgumentException to be thrown");
+    } catch (IllegalArgumentException e) {
+      // expected exception
+    }
+
+   // Can allocate _at_ the max capacity
+    BytesToBytesMap map =
+      new BytesToBytesMap(sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY);
+    map.free();
+  }
+
+  @Test
+  public void resizingLargeMap() {
+    // As long as a map's capacity is below the max, we should be able to resize up to the max
+    BytesToBytesMap map =
+      new BytesToBytesMap(sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY - 64);
+    map.growAndRehash();
+    map.free();
+  }
 }

From c330e52dae6a3ec7e67ca82e2c2f4ea873976458 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 20 May 2015 17:26:26 -0700
Subject: [PATCH 097/525] [SPARK-7762] [MLLIB] set default value for outputCol

Set a default value for `outputCol` instead of forcing users to name it. This is useful for intermediate transformers in the pipeline. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6289 from mengxr/SPARK-7762 and squashes the following commits:

54edebc [Xiangrui Meng] merge master
bff8667 [Xiangrui Meng] update unit test
171246b [Xiangrui Meng] add unit test for outputCol
a4321bd [Xiangrui Meng] set default value for outputCol
---
 .../ml/param/shared/SharedParamsCodeGen.scala |  2 +-
 .../spark/ml/param/shared/sharedParams.scala  |  4 ++-
 .../ml/param/shared/SharedParamsSuite.scala   | 35 +++++++++++++++++++
 .../ml/param/_shared_params_code_gen.py       |  2 +-
 python/pyspark/ml/param/shared.py             |  3 +-
 5 files changed, 42 insertions(+), 4 deletions(-)
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/param/shared/SharedParamsSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index 8b8cb81373a65..1ffb5eddc36bd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -49,7 +49,7 @@ private[shared] object SharedParamsCodeGen {
         isValid = "ParamValidators.inRange(0, 1)"),
       ParamDesc[String]("inputCol", "input column name"),
       ParamDesc[Array[String]]("inputCols", "input column names"),
-      ParamDesc[String]("outputCol", "output column name"),
+      ParamDesc[String]("outputCol", "output column name", Some("uid + \"__output\"")),
       ParamDesc[Int]("checkpointInterval", "checkpoint interval (>= 1)",
         isValid = "ParamValidators.gtEq(1)"),
       ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")),
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index 3a4976d3ddcd1..ed08417bd4df8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -185,7 +185,7 @@ private[ml] trait HasInputCols extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param outputCol.
+ * (private[ml]) Trait for shared param outputCol (default: uid + "__output").
  */
 private[ml] trait HasOutputCol extends Params {
 
@@ -195,6 +195,8 @@ private[ml] trait HasOutputCol extends Params {
    */
   final val outputCol: Param[String] = new Param[String](this, "outputCol", "output column name")
 
+  setDefault(outputCol, uid + "__output")
+
   /** @group getParam */
   final def getOutputCol: String = $(outputCol)
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/shared/SharedParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/shared/SharedParamsSuite.scala
new file mode 100644
index 0000000000000..ca18fa1ad3c15
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/shared/SharedParamsSuite.scala
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.param.shared
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.ml.param.Params
+
+class SharedParamsSuite extends FunSuite {
+
+  test("outputCol") {
+
+    class Obj(override val uid: String) extends Params with HasOutputCol
+
+    val obj = new Obj("obj")
+
+    assert(obj.hasDefault(obj.outputCol))
+    assert(obj.getOrDefault(obj.outputCol) === "obj__output")
+  }
+}
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index ccb929af184b8..69efc424ec4ef 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -116,7 +116,7 @@ def get$Name(self):
         ("rawPredictionCol", "raw prediction (a.k.a. confidence) column name", "'rawPrediction'"),
         ("inputCol", "input column name", None),
         ("inputCols", "input column names", None),
-        ("outputCol", "output column name", None),
+        ("outputCol", "output column name", "self.uid + '__output'"),
         ("numFeatures", "number of features", None),
         ("checkpointInterval", "checkpoint interval (>= 1)", None),
         ("seed", "random seed", "hash(type(self).__name__)"),
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 0b93788899124..bc088e4c29e26 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -280,6 +280,7 @@ def __init__(self):
         super(HasOutputCol, self).__init__()
         #: param for output column name
         self.outputCol = Param(self, "outputCol", "output column name")
+        self._setDefault(outputCol=self.uid + '__output')
 
     def setOutputCol(self, value):
         """
@@ -459,7 +460,7 @@ def __init__(self):
         self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
         #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
         self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-
+        
     def setMaxDepth(self, value):
         """
         Sets the value of :py:attr:`maxDepth`.

From 5196efff53af4965ff216a9d5c0f8b2b4fc98652 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Wed, 20 May 2015 17:52:50 -0700
Subject: [PATCH 098/525] [SPARK-7719] Re-add UnsafeShuffleWriterSuite test
 that was removed for Java 6 compat

This patch re-adds a test which was removed in 9ebb44f8abb1a13f045eed60190954db904ffef7 due to a Java 6 compatibility issue.  We now use Guava's `Iterators.emptyIterator()` in place of `Collections.emptyIterator()`, which isn't present in all Java 6 versions.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6298 from JoshRosen/SPARK-7719-fix-java-6-test-code and squashes the following commits:

5c9bd85 [Josh Rosen] Re-add UnsafeShuffleWriterSuite.emptyIterator() test which was removed due to Java 6 issue
---
 .../shuffle/unsafe/UnsafeShuffleWriterSuite.java  | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 03116d8fc2b21..83d109115aa5c 100644
--- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -26,6 +26,7 @@
 import scala.reflect.ClassTag;
 import scala.runtime.AbstractFunction1;
 
+import com.google.common.collect.Iterators;
 import com.google.common.collect.HashMultiset;
 import com.google.common.io.ByteStreams;
 import org.junit.After;
@@ -252,6 +253,20 @@ public void doNotNeedToCallWriteBeforeUnsuccessfulStop() throws IOException {
     createWriter(false).stop(false);
   }
 
+  @Test
+  public void writeEmptyIterator() throws Exception {
+    final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
+    writer.write(Iterators.<Product2<Object, Object>>emptyIterator());
+    final Option<MapStatus> mapStatus = writer.stop(true);
+    assertTrue(mapStatus.isDefined());
+    assertTrue(mergedOutputFile.exists());
+    assertArrayEquals(new long[NUM_PARTITITONS], partitionSizesInMergedFile);
+    assertEquals(0, taskMetrics.shuffleWriteMetrics().get().shuffleRecordsWritten());
+    assertEquals(0, taskMetrics.shuffleWriteMetrics().get().shuffleBytesWritten());
+    assertEquals(0, taskMetrics.diskBytesSpilled());
+    assertEquals(0, taskMetrics.memoryBytesSpilled());
+  }
+
   @Test
   public void writeWithoutSpilling() throws Exception {
     // In this example, each partition should have exactly one record:

From a70bf06b790add5f279a69607df89ed36155b0e4 Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Wed, 20 May 2015 21:13:10 -0500
Subject: [PATCH 099/525] =?UTF-8?q?[SPARK-7750]=20[WEBUI]=20Rename=20endpo?=
 =?UTF-8?q?ints=20from=20`json`=20to=20`api`=20to=20allow=20fu=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…rther extension to non-json outputs too.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #6273 from harishreedharan/json-to-api and squashes the following commits:

e14b73b [Hari Shreedharan] Rename `getJsonServlet` to `getServletHandler` i
42f8acb [Hari Shreedharan] Import order fixes.
2ef852f [Hari Shreedharan] [SPARK-7750][WebUI] Rename endpoints from `json` to `api` to allow further extension to non-json outputs too.
---
 .../spark/deploy/history/HistoryServer.scala     |  5 +++--
 .../spark/deploy/master/ui/MasterWebUI.scala     |  5 +++--
 ...nRootResource.scala => ApiRootResource.scala} |  8 ++++----
 .../main/scala/org/apache/spark/ui/SparkUI.scala |  5 +++--
 .../deploy/history/HistoryServerSuite.scala      |  4 ++--
 .../org/apache/spark/ui/UISeleniumSuite.scala    | 16 ++++++++--------
 docs/monitoring.md                               | 10 +++++-----
 7 files changed, 28 insertions(+), 25 deletions(-)
 rename core/src/main/scala/org/apache/spark/status/api/v1/{JsonRootResource.scala => ApiRootResource.scala} (97%)

diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
index 517cbe5176241..5a0eb585a9049 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
@@ -25,7 +25,8 @@ import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder}
 
 import org.apache.spark.{Logging, SecurityManager, SparkConf}
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.status.api.v1.{ApplicationInfo, ApplicationsListResource, JsonRootResource, UIRoot}
+import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationInfo, ApplicationsListResource,
+  UIRoot}
 import org.apache.spark.ui.{SparkUI, UIUtils, WebUI}
 import org.apache.spark.ui.JettyUtils._
 import org.apache.spark.util.{SignalLogger, Utils}
@@ -125,7 +126,7 @@ class HistoryServer(
   def initialize() {
     attachPage(new HistoryPage(this))
 
-    attachHandler(JsonRootResource.getJsonServlet(this))
+    attachHandler(ApiRootResource.getServletHandler(this))
 
     attachHandler(createStaticHandler(SparkUI.STATIC_RESOURCE_DIR, "/static"))
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
index eb26e9f99c70b..2111a8581f2e4 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
@@ -19,7 +19,8 @@ package org.apache.spark.deploy.master.ui
 
 import org.apache.spark.Logging
 import org.apache.spark.deploy.master.Master
-import org.apache.spark.status.api.v1.{ApplicationsListResource, ApplicationInfo, JsonRootResource, UIRoot}
+import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationsListResource, ApplicationInfo,
+  UIRoot}
 import org.apache.spark.ui.{SparkUI, WebUI}
 import org.apache.spark.ui.JettyUtils._
 import org.apache.spark.util.RpcUtils
@@ -47,7 +48,7 @@ class MasterWebUI(val master: Master, requestedPort: Int)
     attachPage(new HistoryNotFoundPage(this))
     attachPage(masterPage)
     attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static"))
-    attachHandler(JsonRootResource.getJsonServlet(this))
+    attachHandler(ApiRootResource.getServletHandler(this))
     attachHandler(createRedirectHandler(
       "/app/kill", "/", masterPage.handleAppKillRequest, httpMethods = Set("POST")))
     attachHandler(createRedirectHandler(
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/JsonRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
similarity index 97%
rename from core/src/main/scala/org/apache/spark/status/api/v1/JsonRootResource.scala
rename to core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
index c3ec45f54681b..bf2cc2e72f1fe 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/JsonRootResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
@@ -39,7 +39,7 @@ import org.apache.spark.ui.SparkUI
  * HistoryServerSuite.
  */
 @Path("/v1")
-private[v1] class JsonRootResource extends UIRootFromServletContext {
+private[v1] class ApiRootResource extends UIRootFromServletContext {
 
   @Path("applications")
   def getApplicationList(): ApplicationListResource = {
@@ -166,11 +166,11 @@ private[v1] class JsonRootResource extends UIRootFromServletContext {
 
 }
 
-private[spark] object JsonRootResource {
+private[spark] object ApiRootResource {
 
-  def getJsonServlet(uiRoot: UIRoot): ServletContextHandler = {
+  def getServletHandler(uiRoot: UIRoot): ServletContextHandler = {
     val jerseyContext = new ServletContextHandler(ServletContextHandler.NO_SESSIONS)
-    jerseyContext.setContextPath("/json")
+    jerseyContext.setContextPath("/api")
     val holder:ServletHolder = new ServletHolder(classOf[ServletContainer])
     holder.setInitParameter("com.sun.jersey.config.property.resourceConfigClass",
       "com.sun.jersey.api.core.PackagesResourceConfig")
diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
index bfe4a180e8a6f..0b11e914bb251 100644
--- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
+++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
@@ -19,7 +19,8 @@ package org.apache.spark.ui
 
 import java.util.Date
 
-import org.apache.spark.status.api.v1.{ApplicationAttemptInfo, ApplicationInfo, JsonRootResource, UIRoot}
+import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationAttemptInfo, ApplicationInfo,
+  UIRoot}
 import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext}
 import org.apache.spark.scheduler._
 import org.apache.spark.storage.StorageStatusListener
@@ -64,7 +65,7 @@ private[spark] class SparkUI private (
     attachTab(new ExecutorsTab(this))
     attachHandler(createStaticHandler(SparkUI.STATIC_RESOURCE_DIR, "/static"))
     attachHandler(createRedirectHandler("/", "/jobs", basePath = basePath))
-    attachHandler(JsonRootResource.getJsonServlet(this))
+    attachHandler(ApiRootResource.getServletHandler(this))
     // This should be POST only, but, the YARN AM proxy won't proxy POSTs
     attachHandler(createRedirectHandler(
       "/stages/stage/kill", "/stages", stagesTab.handleKillRequest,
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
index 318ab5dbc4804..4adb5122bcf1a 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
@@ -198,11 +198,11 @@ class HistoryServerSuite extends FunSuite with BeforeAndAfter with Matchers with
   }
 
   def getContentAndCode(path: String, port: Int = port): (Int, Option[String], Option[String]) = {
-    HistoryServerSuite.getContentAndCode(new URL(s"http://localhost:$port/json/v1/$path"))
+    HistoryServerSuite.getContentAndCode(new URL(s"http://localhost:$port/api/v1/$path"))
   }
 
   def getUrl(path: String): String = {
-    HistoryServerSuite.getUrl(new URL(s"http://localhost:$port/json/v1/$path"))
+    HistoryServerSuite.getUrl(new URL(s"http://localhost:$port/api/v1/$path"))
   }
 
   def generateExpectation(name: String, path: String): Unit = {
diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
index 117b2c3960820..b6f5accef0cef 100644
--- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
@@ -497,7 +497,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before
       goToUi(sc, "/jobs/job/?id=7")
       find("no-info").get.text should be ("No information to display for job 7")
 
-      val badJob = HistoryServerSuite.getContentAndCode(jsonUrl(sc.ui.get, "jobs/7"))
+      val badJob = HistoryServerSuite.getContentAndCode(apiUrl(sc.ui.get, "jobs/7"))
       badJob._1 should be (HttpServletResponse.SC_NOT_FOUND)
       badJob._2 should be (None)
       badJob._3 should be (Some("unknown job: 7"))
@@ -540,18 +540,18 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before
 
       goToUi(sc, "/stages/stage/?id=12&attempt=0")
       find("no-info").get.text should be ("No information to display for Stage 12 (Attempt 0)")
-      val badStage = HistoryServerSuite.getContentAndCode(jsonUrl(sc.ui.get,"stages/12/0"))
+      val badStage = HistoryServerSuite.getContentAndCode(apiUrl(sc.ui.get,"stages/12/0"))
       badStage._1 should be (HttpServletResponse.SC_NOT_FOUND)
       badStage._2 should be (None)
       badStage._3 should be (Some("unknown stage: 12"))
 
-      val badAttempt = HistoryServerSuite.getContentAndCode(jsonUrl(sc.ui.get,"stages/19/15"))
+      val badAttempt = HistoryServerSuite.getContentAndCode(apiUrl(sc.ui.get,"stages/19/15"))
       badAttempt._1 should be (HttpServletResponse.SC_NOT_FOUND)
       badAttempt._2 should be (None)
       badAttempt._3 should be (Some("unknown attempt for stage 19.  Found attempts: [0]"))
 
       val badStageAttemptList = HistoryServerSuite.getContentAndCode(
-        jsonUrl(sc.ui.get, "stages/12"))
+        apiUrl(sc.ui.get, "stages/12"))
       badStageAttemptList._1 should be (HttpServletResponse.SC_NOT_FOUND)
       badStageAttemptList._2 should be (None)
       badStageAttemptList._3 should be (Some("unknown stage: 12"))
@@ -561,7 +561,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before
   test("live UI json application list") {
     withSpark(newSparkContext()) { sc =>
       val appListRawJson = HistoryServerSuite.getUrl(new URL(
-        sc.ui.get.appUIAddress + "/json/v1/applications"))
+        sc.ui.get.appUIAddress + "/api/v1/applications"))
       val appListJsonAst = JsonMethods.parse(appListRawJson)
       appListJsonAst.children.length should be (1)
       val attempts = (appListJsonAst \ "attempts").children
@@ -587,10 +587,10 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before
   }
 
   def getJson(ui: SparkUI, path: String): JValue = {
-    JsonMethods.parse(HistoryServerSuite.getUrl(jsonUrl(ui, path)))
+    JsonMethods.parse(HistoryServerSuite.getUrl(apiUrl(ui, path)))
   }
 
-  def jsonUrl(ui: SparkUI, path: String): URL = {
-    new URL(ui.appUIAddress + "/json/v1/applications/test/" + path)
+  def apiUrl(ui: SparkUI, path: String): URL = {
+    new URL(ui.appUIAddress + "/api/v1/applications/test/" + path)
   }
 }
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 1e0fc150862fb..e75018499003a 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -178,9 +178,9 @@ Note that the history server only displays completed Spark jobs. One way to sign
 
 In addition to viewing the metrics in the UI, they are also available as JSON.  This gives developers
 an easy way to create new visualizations and monitoring tools for Spark.  The JSON is available for
-both running applications, and in the history server.  The endpoints are mounted at `/json/v1`.  Eg.,
-for the history server, they would typically be accessible at `http://<server-url>:18080/json/v1`, and
-for a running application, at `http://localhost:4040/json/v1`.
+both running applications, and in the history server.  The endpoints are mounted at `/api/v1`.  Eg.,
+for the history server, they would typically be accessible at `http://<server-url>:18080/api/v1`, and
+for a running application, at `http://localhost:4040/api/v1`.
 
 <table class="table">
   <tr><th>Endpoint</th><th>Meaning</th></tr>
@@ -240,12 +240,12 @@ These endpoints have been strongly versioned to make it easier to develop applic
 * Individual fields will never be removed for any given endpoint
 * New endpoints may be added
 * New fields may be added to existing endpoints
-* New versions of the api may be added in the future at a separate endpoint (eg., `json/v2`).  New versions are *not* required to be backwards compatible.
+* New versions of the api may be added in the future at a separate endpoint (eg., `api/v2`).  New versions are *not* required to be backwards compatible.
 * Api versions may be dropped, but only after at least one minor release of co-existing with a new api version
 
 Note that even when examining the UI of a running applications, the `applications/[app-id]` portion is
 still required, though there is only one application available.  Eg. to see the list of jobs for the
-running app, you would go to `http://localhost:4040/json/v1/applications/[app-id]/jobs`.  This is to
+running app, you would go to `http://localhost:4040/api/v1/applications/[app-id]/jobs`.  This is to
 keep the paths consistent in both modes.
 
 # Metrics

From 895baf8f77e630ce32b0e25b00bf5ee45d17398f Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 20 May 2015 19:56:01 -0700
Subject: [PATCH 100/525] [SPARK-7777] [STREAMING] Fix the flaky test in
 org.apache.spark.streaming.BasicOperationsSuite

Just added a guard to make sure a batch has completed before moving to the next batch.

Author: zsxwing <zsxwing@gmail.com>

Closes #6306 from zsxwing/SPARK-7777 and squashes the following commits:

ecee529 [zsxwing] Fix the failure message
58634fe [zsxwing] Fix the flaky test in org.apache.spark.streaming.BasicOperationsSuite
---
 .../org/apache/spark/streaming/BasicOperationsSuite.scala  | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
index 87bc20f79c3cd..f269cb74e0c2b 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
@@ -557,6 +557,9 @@ class BasicOperationsSuite extends TestSuiteBase {
     withTestServer(new TestServer()) { testServer =>
       withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc =>
         testServer.start()
+
+        val batchCounter = new BatchCounter(ssc)
+
         // Set up the streaming context and input streams
         val networkStream =
           ssc.socketTextStream("localhost", testServer.port, StorageLevel.MEMORY_AND_DISK)
@@ -587,7 +590,11 @@ class BasicOperationsSuite extends TestSuiteBase {
         for (i <- 0 until input.size) {
           testServer.send(input(i).toString + "\n")
           Thread.sleep(200)
+          val numCompletedBatches = batchCounter.getNumCompletedBatches
           clock.advance(batchDuration.milliseconds)
+          if (!batchCounter.waitUntilBatchesCompleted(numCompletedBatches + 1, 5000)) {
+            fail("Batch took more than 5 seconds to complete")
+          }
           collectRddInfo()
         }
 

From 42c592adb381ff20832cce55e0849ed68dd7eee4 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Wed, 20 May 2015 19:58:22 -0700
Subject: [PATCH 101/525] [SPARK-7320] [SQL] Add Cube / Rollup for dataframe

This is a follow up for #6257, which broke the maven test.

Add cube & rollup for DataFrame
For example:
```scala
testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b"))
testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b"))
```

Author: Cheng Hao <hao.cheng@intel.com>

Closes #6304 from chenghao-intel/rollup and squashes the following commits:

04bb1de [Cheng Hao] move the table register/unregister into beforeAll/afterAll
a6069f1 [Cheng Hao] cancel the implicit keyword
ced4b8f [Cheng Hao] remove the unnecessary code changes
9959dfa [Cheng Hao] update the code as comments
e1d88aa [Cheng Hao] update the code as suggested
03bc3d9 [Cheng Hao] Remove the CubedData & RollupedData
5fd62d0 [Cheng Hao] hiden the CubedData & RollupedData
5ffb196 [Cheng Hao] Add Cube / Rollup for dataframe
---
 .../org/apache/spark/sql/DataFrame.scala      | 104 +++++++++++++++++-
 .../org/apache/spark/sql/GroupedData.scala    |  92 +++++++++++-----
 .../hive/HiveDataFrameAnalyticsSuite.scala    |  69 ++++++++++++
 3 files changed, 237 insertions(+), 28 deletions(-)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index adad85806d1ea..d78b4c2f8909c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -685,7 +685,53 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def groupBy(cols: Column*): GroupedData = new GroupedData(this, cols.map(_.expr))
+  def groupBy(cols: Column*): GroupedData = {
+    GroupedData(this, cols.map(_.expr), GroupedData.GroupByType)
+  }
+
+  /**
+   * Create a multi-dimensional rollup for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
+   * See [[GroupedData]] for all the available aggregate functions.
+   *
+   * {{{
+   *   // Compute the average for all numeric columns rolluped by department and group.
+   *   df.rollup($"department", $"group").avg()
+   *
+   *   // Compute the max age and average salary, rolluped by department and gender.
+   *   df.rollup($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def rollup(cols: Column*): GroupedData = {
+    GroupedData(this, cols.map(_.expr), GroupedData.RollupType)
+  }
+
+  /**
+   * Create a multi-dimensional cube for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
+   * See [[GroupedData]] for all the available aggregate functions.
+   *
+   * {{{
+   *   // Compute the average for all numeric columns cubed by department and group.
+   *   df.cube($"department", $"group").avg()
+   *
+   *   // Compute the max age and average salary, cubed by department and gender.
+   *   df.cube($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def cube(cols: Column*): GroupedData = GroupedData(this, cols.map(_.expr), GroupedData.CubeType)
 
   /**
    * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them.
@@ -710,7 +756,61 @@ class DataFrame private[sql](
   @scala.annotation.varargs
   def groupBy(col1: String, cols: String*): GroupedData = {
     val colNames: Seq[String] = col1 +: cols
-    new GroupedData(this, colNames.map(colName => resolve(colName)))
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.GroupByType)
+  }
+
+  /**
+   * Create a multi-dimensional rollup for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
+   * See [[GroupedData]] for all the available aggregate functions.
+   *
+   * This is a variant of rollup that can only group by existing columns using column names
+   * (i.e. cannot construct expressions).
+   *
+   * {{{
+   *   // Compute the average for all numeric columns rolluped by department and group.
+   *   df.rollup("department", "group").avg()
+   *
+   *   // Compute the max age and average salary, rolluped by department and gender.
+   *   df.rollup($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def rollup(col1: String, cols: String*): GroupedData = {
+    val colNames: Seq[String] = col1 +: cols
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.RollupType)
+  }
+
+  /**
+   * Create a multi-dimensional cube for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
+   * See [[GroupedData]] for all the available aggregate functions.
+   *
+   * This is a variant of cube that can only group by existing columns using column names
+   * (i.e. cannot construct expressions).
+   *
+   * {{{
+   *   // Compute the average for all numeric columns cubed by department and group.
+   *   df.cube("department", "group").avg()
+   *
+   *   // Compute the max age and average salary, cubed by department and gender.
+   *   df.cube($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def cube(col1: String, cols: String*): GroupedData = {
+    val colNames: Seq[String] = col1 +: cols
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.CubeType)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index 1381b9f1a6080..f730e4ae00e2b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -23,9 +23,40 @@ import scala.language.implicitConversions
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.Star
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.Aggregate
+import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate}
 import org.apache.spark.sql.types.NumericType
 
+/**
+ * Companion object for GroupedData
+ */
+private[sql] object GroupedData {
+  def apply(
+      df: DataFrame,
+      groupingExprs: Seq[Expression],
+      groupType: GroupType): GroupedData = {
+    new GroupedData(df, groupingExprs, groupType: GroupType)
+  }
+
+  /**
+   * The Grouping Type
+   */
+  trait GroupType
+
+  /**
+   * To indicate it's the GroupBy
+   */
+  object GroupByType extends GroupType
+
+  /**
+   * To indicate it's the CUBE
+   */
+  object CubeType extends GroupType
+
+  /**
+   * To indicate it's the ROLLUP
+   */
+  object RollupType extends GroupType
+}
 
 /**
  * :: Experimental ::
@@ -34,19 +65,37 @@ import org.apache.spark.sql.types.NumericType
  * @since 1.3.0
  */
 @Experimental
-class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression]) {
+class GroupedData protected[sql](
+    df: DataFrame,
+    groupingExprs: Seq[Expression],
+    private val groupType: GroupedData.GroupType) {
 
-  private[sql] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
-    val namedGroupingExprs = groupingExprs.map {
-      case expr: NamedExpression => expr
-      case expr: Expression => Alias(expr, expr.prettyString)()
+  private[this] def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
+    val aggregates = if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
+        val retainedExprs = groupingExprs.map {
+          case expr: NamedExpression => expr
+          case expr: Expression => Alias(expr, expr.prettyString)()
+        }
+        retainedExprs ++ aggExprs
+      } else {
+        aggExprs
+      }
+
+    groupType match {
+      case GroupedData.GroupByType =>
+        DataFrame(
+          df.sqlContext, Aggregate(groupingExprs, aggregates, df.logicalPlan))
+      case GroupedData.RollupType =>
+        DataFrame(
+          df.sqlContext, Rollup(groupingExprs, df.logicalPlan, aggregates))
+      case GroupedData.CubeType =>
+        DataFrame(
+          df.sqlContext, Cube(groupingExprs, df.logicalPlan, aggregates))
     }
-    DataFrame(
-      df.sqlContext, Aggregate(groupingExprs, namedGroupingExprs ++ aggExprs, df.logicalPlan))
   }
 
   private[this] def aggregateNumericColumns(colNames: String*)(f: Expression => Expression)
-    : Seq[NamedExpression] = {
+    : DataFrame = {
 
     val columnExprs = if (colNames.isEmpty) {
       // No columns specified. Use all numeric columns.
@@ -63,10 +112,10 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
         namedExpr
       }
     }
-    columnExprs.map { c =>
+    toDF(columnExprs.map { c =>
       val a = f(c)
       Alias(a, a.prettyString)()
-    }
+    })
   }
 
   private[this] def strToExpr(expr: String): (Expression => Expression) = {
@@ -119,10 +168,10 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
    * @since 1.3.0
    */
   def agg(exprs: Map[String, String]): DataFrame = {
-    exprs.map { case (colName, expr) =>
+    toDF(exprs.map { case (colName, expr) =>
       val a = strToExpr(expr)(df(colName).expr)
       Alias(a, a.prettyString)()
-    }.toSeq
+    }.toSeq)
   }
 
   /**
@@ -175,19 +224,10 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
    */
   @scala.annotation.varargs
   def agg(expr: Column, exprs: Column*): DataFrame = {
-    val aggExprs = (expr +: exprs).map(_.expr).map {
+    toDF((expr +: exprs).map(_.expr).map {
       case expr: NamedExpression => expr
       case expr: Expression => Alias(expr, expr.prettyString)()
-    }
-    if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
-      val retainedExprs = groupingExprs.map {
-        case expr: NamedExpression => expr
-        case expr: Expression => Alias(expr, expr.prettyString)()
-      }
-      DataFrame(df.sqlContext, Aggregate(groupingExprs, retainedExprs ++ aggExprs, df.logicalPlan))
-    } else {
-      DataFrame(df.sqlContext, Aggregate(groupingExprs, aggExprs, df.logicalPlan))
-    }
+    })
   }
 
   /**
@@ -196,7 +236,7 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
    *
    * @since 1.3.0
    */
-  def count(): DataFrame = Seq(Alias(Count(Literal(1)), "count")())
+  def count(): DataFrame = toDF(Seq(Alias(Count(Literal(1)), "count")()))
 
   /**
    * Compute the average value for each numeric columns for each group. This is an alias for `avg`.
@@ -256,5 +296,5 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
   @scala.annotation.varargs
   def sum(colNames: String*): DataFrame = {
     aggregateNumericColumns(colNames:_*)(Sum)
-  }    
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
new file mode 100644
index 0000000000000..99de14660f676
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.hive.test.TestHive._
+import org.apache.spark.sql.hive.test.TestHive.implicits._
+import org.scalatest.BeforeAndAfterAll
+
+case class TestData2Int(a: Int, b: Int)
+
+// TODO ideally we should put the test suite into the package `sql`, as
+// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
+// support the `cube` or `rollup` yet.
+class HiveDataFrameAnalyticsSuite extends QueryTest with BeforeAndAfterAll {
+  val testData =
+    TestHive.sparkContext.parallelize(
+      TestData2Int(1, 2) ::
+        TestData2Int(2, 4) :: Nil).toDF()
+
+  override def beforeAll() {
+    TestHive.registerDataFrameAsTable(testData, "mytable")
+  }
+
+  override def afterAll(): Unit = {
+    TestHive.dropTempTable("mytable")
+  }
+
+  test("rollup") {
+    checkAnswer(
+      testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
+      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with rollup").collect()
+    )
+
+    checkAnswer(
+      testData.rollup("a", "b").agg(sum("b")),
+      sql("select a, b, sum(b) from mytable group by a, b with rollup").collect()
+    )
+  }
+
+  test("cube") {
+    checkAnswer(
+      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
+      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
+    )
+
+    checkAnswer(
+      testData.cube("a", "b").agg(sum("b")),
+      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
+    )
+  }
+}

From ddec173cba63df723cd94508121d8c06d8c153c6 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 20 May 2015 20:30:39 -0700
Subject: [PATCH 102/525] [SPARK-7774] [MLLIB] add sqlContext to
 MLlibTestSparkContext

to simplify test suites that require a SQLContext.

Author: Xiangrui Meng <meng@databricks.com>

Closes #6303 from mengxr/SPARK-7774 and squashes the following commits:

0622b5a [Xiangrui Meng] update some other test suites
e1f9b8d [Xiangrui Meng] add sqlContext to MLlibTestSparkContext
---
 .../ml/classification/LogisticRegressionSuite.scala   |  4 +---
 .../spark/ml/classification/OneVsRestSuite.scala      |  7 +++----
 .../org/apache/spark/ml/feature/BinarizerSuite.scala  |  6 +-----
 .../org/apache/spark/ml/feature/BucketizerSuite.scala |  9 +--------
 .../scala/org/apache/spark/ml/feature/IDFSuite.scala  |  9 +--------
 .../apache/spark/ml/feature/OneHotEncoderSuite.scala  |  8 +-------
 .../spark/ml/feature/PolynomialExpansionSuite.scala   | 11 ++---------
 .../apache/spark/ml/feature/StringIndexerSuite.scala  |  7 -------
 .../org/apache/spark/ml/feature/TokenizerSuite.scala  |  9 +--------
 .../spark/ml/feature/VectorAssemblerSuite.scala       |  9 +--------
 .../apache/spark/ml/feature/VectorIndexerSuite.scala  |  6 +-----
 .../org/apache/spark/ml/recommendation/ALSSuite.scala |  2 --
 .../spark/ml/regression/LinearRegressionSuite.scala   |  4 +---
 .../spark/mllib/util/MLlibTestSparkContext.scala      |  8 ++++++--
 14 files changed, 20 insertions(+), 79 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 97f9749cb4a9a..9f77d5f3efc55 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -23,18 +23,16 @@ import org.apache.spark.mllib.classification.LogisticRegressionSuite._
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+import org.apache.spark.sql.{DataFrame, Row}
 
 class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
 
-  @transient var sqlContext: SQLContext = _
   @transient var dataset: DataFrame = _
   @transient var binaryDataset: DataFrame = _
   private val eps: Double = 1e-5
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    sqlContext = new SQLContext(sc)
 
     dataset = sqlContext.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42))
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 990cfb08af83b..770b56890fa45 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -21,24 +21,23 @@ import org.scalatest.FunSuite
 
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.util.MetadataUtils
-import org.apache.spark.mllib.classification.LogisticRegressionSuite._
 import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
+import org.apache.spark.mllib.classification.LogisticRegressionSuite._
 import org.apache.spark.mllib.evaluation.MulticlassMetrics
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.DataFrame
 
 class OneVsRestSuite extends FunSuite with MLlibTestSparkContext {
 
-  @transient var sqlContext: SQLContext = _
   @transient var dataset: DataFrame = _
   @transient var rdd: RDD[LabeledPoint] = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    sqlContext = new SQLContext(sc)
+
     val nPoints = 1000
 
     // The following weights and xMean/xVariance are computed from iris dataset with lambda=0.2.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
index caf1b759593f3..8f6c6b39dc93b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
@@ -20,18 +20,14 @@ package org.apache.spark.ml.feature
 import org.scalatest.FunSuite
 
 import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.mllib.util.TestingUtils._
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
-
+import org.apache.spark.sql.{DataFrame, Row}
 
 class BinarizerSuite extends FunSuite with MLlibTestSparkContext {
 
   @transient var data: Array[Double] = _
-  @transient var sqlContext: SQLContext = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    sqlContext = new SQLContext(sc)
     data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
index 20d2f3ac6696b..0391bd8427c2c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
@@ -25,17 +25,10 @@ import org.apache.spark.SparkException
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+import org.apache.spark.sql.{DataFrame, Row}
 
 class BucketizerSuite extends FunSuite with MLlibTestSparkContext {
 
-  @transient private var sqlContext: SQLContext = _
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    sqlContext = new SQLContext(sc)
-  }
-
   test("Bucket continuous features, without -inf,inf") {
     // Check a set of valid feature values.
     val splits = Array(-0.5, 0.0, 0.5)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
index eaee3443c1f23..f85e85471617a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
@@ -22,17 +22,10 @@ import org.scalatest.FunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
-import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.sql.Row
 
 class IDFSuite extends FunSuite with MLlibTestSparkContext {
 
-  @transient var sqlContext: SQLContext = _
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    sqlContext = new SQLContext(sc)
-  }
-
   def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = {
     dataSet.map {
       case data: DenseVector =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
index 92ec407b98d69..056b9eda86bba 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
@@ -21,16 +21,10 @@ import org.scalatest.FunSuite
 
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.DataFrame
 
 
 class OneHotEncoderSuite extends FunSuite with MLlibTestSparkContext {
-  private var sqlContext: SQLContext = _
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    sqlContext = new SQLContext(sc)
-  }
 
   def stringIndexed(): DataFrame = {
     val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
index c1d64fba0aa8f..aa230ca073d5b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
@@ -18,22 +18,15 @@
 package org.apache.spark.ml.feature
 
 import org.scalatest.FunSuite
+import org.scalatest.exceptions.TestFailedException
 
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
-import org.apache.spark.sql.{Row, SQLContext}
-import org.scalatest.exceptions.TestFailedException
+import org.apache.spark.sql.Row
 
 class PolynomialExpansionSuite extends FunSuite with MLlibTestSparkContext {
 
-  @transient var sqlContext: SQLContext = _
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    sqlContext = new SQLContext(sc)
-  }
-
   test("Polynomial expansion with default parameter") {
     val data = Array(
       Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
index b6939e5870410..89c2fe45573aa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -21,15 +21,8 @@ import org.scalatest.FunSuite
 
 import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.SQLContext
 
 class StringIndexerSuite extends FunSuite with MLlibTestSparkContext {
-  private var sqlContext: SQLContext = _
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    sqlContext = new SQLContext(sc)
-  }
 
   test("StringIndexer") {
     val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index d186ead8f542f..a46d08d65150f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -22,7 +22,7 @@ import scala.beans.BeanInfo
 import org.scalatest.FunSuite
 
 import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+import org.apache.spark.sql.{DataFrame, Row}
 
 @BeanInfo
 case class TokenizerTestData(rawText: String, wantedTokens: Array[String])
@@ -30,13 +30,6 @@ case class TokenizerTestData(rawText: String, wantedTokens: Array[String])
 class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
   import org.apache.spark.ml.feature.RegexTokenizerSuite._
   
-  @transient var sqlContext: SQLContext = _
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    sqlContext = new SQLContext(sc)
-  }
-
   test("RegexTokenizer") {
     val tokenizer = new RegexTokenizer()
       .setInputCol("rawText")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
index 0db27607bc274..d0cd62c5e4864 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
@@ -22,17 +22,10 @@ import org.scalatest.FunSuite
 import org.apache.spark.SparkException
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.sql.Row
 
 class VectorAssemblerSuite extends FunSuite with MLlibTestSparkContext {
 
-  @transient var sqlContext: SQLContext = _
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    sqlContext = new SQLContext(sc)
-  }
-
   test("assemble") {
     import org.apache.spark.ml.feature.VectorAssembler.assemble
     assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
index 38dc83b1241cf..b11b029c6343e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
@@ -26,15 +26,12 @@ import org.apache.spark.ml.attribute._
 import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SQLContext}
-
+import org.apache.spark.sql.DataFrame
 
 class VectorIndexerSuite extends FunSuite with MLlibTestSparkContext {
 
   import VectorIndexerSuite.FeatureData
 
-  @transient var sqlContext: SQLContext = _
-
   // identical, of length 3
   @transient var densePoints1: DataFrame = _
   @transient var sparsePoints1: DataFrame = _
@@ -86,7 +83,6 @@ class VectorIndexerSuite extends FunSuite with MLlibTestSparkContext {
     checkPair(densePoints1Seq, sparsePoints1Seq)
     checkPair(densePoints2Seq, sparsePoints2Seq)
 
-    sqlContext = new SQLContext(sc)
     densePoints1 = sqlContext.createDataFrame(sc.parallelize(densePoints1Seq, 2).map(FeatureData))
     sparsePoints1 = sqlContext.createDataFrame(sc.parallelize(sparsePoints1Seq, 2).map(FeatureData))
     densePoints2 = sqlContext.createDataFrame(sc.parallelize(densePoints2Seq, 2).map(FeatureData))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
index 6cc6ec94eb643..9a35555e52b90 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
@@ -38,14 +38,12 @@ import org.apache.spark.util.Utils
 
 class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
 
-  private var sqlContext: SQLContext = _
   private var tempDir: File = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
     tempDir = Utils.createTempDir()
     sc.setCheckpointDir(tempDir.getAbsolutePath)
-    sqlContext = new SQLContext(sc)
   }
 
   override def afterAll(): Unit = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 80323ef5201a6..50a78631fa6d6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -22,11 +22,10 @@ import org.scalatest.FunSuite
 import org.apache.spark.mllib.linalg.DenseVector
 import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
-import org.apache.spark.sql.{Row, SQLContext, DataFrame}
+import org.apache.spark.sql.{DataFrame, Row}
 
 class LinearRegressionSuite extends FunSuite with MLlibTestSparkContext {
 
-  @transient var sqlContext: SQLContext = _
   @transient var dataset: DataFrame = _
 
   /**
@@ -41,7 +40,6 @@ class LinearRegressionSuite extends FunSuite with MLlibTestSparkContext {
    */
   override def beforeAll(): Unit = {
     super.beforeAll()
-    sqlContext = new SQLContext(sc)
     dataset = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
         6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLlibTestSparkContext.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLlibTestSparkContext.scala
index b658889476d37..5d1796ef65722 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLlibTestSparkContext.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLlibTestSparkContext.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.mllib.util
 
-import org.scalatest.Suite
-import org.scalatest.BeforeAndAfterAll
+import org.scalatest.{BeforeAndAfterAll, Suite}
 
 import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.sql.SQLContext
 
 trait MLlibTestSparkContext extends BeforeAndAfterAll { self: Suite =>
   @transient var sc: SparkContext = _
+  @transient var sqlContext: SQLContext = _
 
   override def beforeAll() {
     super.beforeAll()
@@ -31,12 +32,15 @@ trait MLlibTestSparkContext extends BeforeAndAfterAll { self: Suite =>
       .setMaster("local[2]")
       .setAppName("MLlibUnitTest")
     sc = new SparkContext(conf)
+    sqlContext = new SQLContext(sc)
   }
 
   override def afterAll() {
+    sqlContext = null
     if (sc != null) {
       sc.stop()
     }
+    sc = null
     super.afterAll()
   }
 }

From d0eb9ffe978c663b7aa06e908cadee81767d23d1 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Wed, 20 May 2015 22:23:49 -0700
Subject: [PATCH 103/525] [SPARK-7746][SQL] Add FetchSize parameter for JDBC
 driver

JIRA: https://issues.apache.org/jira/browse/SPARK-7746

Looks like an easy to add parameter but can show significant performance improvement if the JDBC driver accepts it.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6283 from viirya/jdbc_fetchsize and squashes the following commits:

de47f94 [Liang-Chi Hsieh] Don't keep fetchSize as single parameter.
b7bff2f [Liang-Chi Hsieh] Add FetchSize parameter for JDBC driver.
---
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   |  8 +++--
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 33 ++++++++++++++++++-
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index f7b19096eaacb..be03a237b6c4e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -211,7 +211,8 @@ private[sql] object JDBCRDD extends Logging {
       fqTable,
       requiredColumns,
       filters,
-      parts)
+      parts,
+      properties)
   }
 }
 
@@ -227,7 +228,8 @@ private[sql] class JDBCRDD(
     fqTable: String,
     columns: Array[String],
     filters: Array[Filter],
-    partitions: Array[Partition])
+    partitions: Array[Partition],
+    properties: Properties)
   extends RDD[Row](sc, Nil) {
 
   /**
@@ -356,6 +358,8 @@ private[sql] class JDBCRDD(
     val sqlText = s"SELECT $columnList FROM $fqTable $myWhereClause"
     val stmt = conn.prepareStatement(sqlText,
         ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)
+    val fetchSize = properties.getProperty("fetchSize", "0").toInt
+    stmt.setFetchSize(fetchSize)
     val rs = stmt.executeQuery()
 
     val conversions = getConversions(schema)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index a8dddfb9b6858..347f28351fd72 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -67,7 +67,15 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
         |USING org.apache.spark.sql.jdbc
         |OPTIONS (url '$url', dbtable 'TEST.PEOPLE', user 'testUser', password 'testPass')
       """.stripMargin.replaceAll("\n", " "))
-
+ 
+    sql(
+      s"""
+        |CREATE TEMPORARY TABLE fetchtwo
+        |USING org.apache.spark.sql.jdbc
+        |OPTIONS (url '$url', dbtable 'TEST.PEOPLE', user 'testUser', password 'testPass',
+        |         fetchSize '2')
+      """.stripMargin.replaceAll("\n", " "))
+ 
     sql(
       s"""
         |CREATE TEMPORARY TABLE parts
@@ -185,6 +193,14 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
     assert(names(2).equals("mary"))
   }
 
+  test("SELECT first field when fetchSize is two") {
+    val names = sql("SELECT NAME FROM fetchtwo").collect().map(x => x.getString(0)).sortWith(_ < _)
+    assert(names.size === 3)
+    assert(names(0).equals("fred"))
+    assert(names(1).equals("joe 'foo' \"bar\""))
+    assert(names(2).equals("mary"))
+  }
+
   test("SELECT second field") {
     val ids = sql("SELECT THEID FROM foobar").collect().map(x => x.getInt(0)).sortWith(_ < _)
     assert(ids.size === 3)
@@ -192,6 +208,14 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
     assert(ids(1) === 2)
     assert(ids(2) === 3)
   }
+ 
+  test("SELECT second field when fetchSize is two") {
+    val ids = sql("SELECT THEID FROM fetchtwo").collect().map(x => x.getInt(0)).sortWith(_ < _)
+    assert(ids.size === 3)
+    assert(ids(0) === 1)
+    assert(ids(1) === 2)
+    assert(ids(2) === 3)
+  }
 
   test("SELECT * partitioned") {
     assert(sql("SELECT * FROM parts").collect().size == 3)
@@ -232,6 +256,13 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
       urlWithUserAndPass, "TEST.PEOPLE", new Properties).collect().length === 3)
   }
 
+  test("Basic API with FetchSize") {
+    val properties = new Properties
+    properties.setProperty("fetchSize", "2")
+    assert(TestSQLContext.read.jdbc(
+      urlWithUserAndPass, "TEST.PEOPLE", properties).collect().length === 3)
+  }
+
   test("Partitioning via JDBCPartitioningInfo API") {
     assert(
       TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3, new Properties)

From 04940c49755fd2e7f1ed7b875da287c946bfebeb Mon Sep 17 00:00:00 2001
From: Mingfei <mingfei.shi@intel.com>
Date: Wed, 20 May 2015 22:33:03 -0700
Subject: [PATCH 104/525] [SPARK-7389] [CORE] Tachyon integration improvement

Two main changes:

Add two functions in ExternalBlockManager, which are putValues and getValues
because the implementation may not rely on the putBytes and getBytes

improve Tachyon integration.
Currently, when putting data into Tachyon, Spark first serialize all data in one partition into a ByteBuffer, and then write into Tachyon, this will uses much memory and increase GC overhead

when get data from Tachyon, getValues depends on getBytes, which also read all data into On heap byte arry, and result in much memory usage.
This PR changes the approach of the two functions, make them read / write data by stream to reduce memory usage.

In our testing,  when data size is huge, this patch reduces about 30% GC time and 70% full GC time, and total execution time reduces about 10%

Author: Mingfei <mingfei.shi@intel.com>

Closes #5908 from shimingfei/Tachyon-integration-rebase and squashes the following commits:

033bc57 [Mingfei] modify accroding to comments
747c69a [Mingfei] modify according to comments - format changes
ce52c67 [Mingfei] put close() in a finally block
d2c60bb [Mingfei] modify according to comments, some code style change
4c11591 [Mingfei] modify according to comments split putIntoExternalBlockStore into two functions add default implementation for getValues and putValues
cc0a32e [Mingfei] Make getValues read data from Tachyon by stream Make putValues write data to Tachyon by stream
017593d [Mingfei] add getValues and putValues in ExternalBlockManager's Interface
---
 .../apache/spark/storage/BlockManager.scala   | 36 +++++---
 .../spark/storage/ExternalBlockManager.scala  | 22 ++++-
 .../spark/storage/ExternalBlockStore.scala    | 88 +++++++++++++------
 .../spark/storage/TachyonBlockManager.scala   | 51 +++++++++--
 4 files changed, 149 insertions(+), 48 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
index 16d67cbfca80b..5048c7dab240b 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.storage
 
-import java.io.{BufferedOutputStream, ByteArrayOutputStream, File, InputStream, OutputStream}
+import java.io._
 import java.nio.{ByteBuffer, MappedByteBuffer}
 
 import scala.collection.mutable.{ArrayBuffer, HashMap}
@@ -489,16 +489,17 @@ private[spark] class BlockManager(
         if (level.useOffHeap) {
           logDebug(s"Getting block $blockId from ExternalBlockStore")
           if (externalBlockStore.contains(blockId)) {
-            externalBlockStore.getBytes(blockId) match {
-              case Some(bytes) =>
-                if (!asBlockResult) {
-                  return Some(bytes)
-                } else {
-                  return Some(new BlockResult(
-                    dataDeserialize(blockId, bytes), DataReadMethod.Memory, info.size))
-                }
+            val result = if (asBlockResult) {
+              externalBlockStore.getValues(blockId)
+                .map(new BlockResult(_, DataReadMethod.Memory, info.size))
+            } else {
+              externalBlockStore.getBytes(blockId)
+            }
+            result match {
+              case Some(values) =>
+                return result
               case None =>
-                logDebug(s"Block $blockId not found in externalBlockStore")
+                logDebug(s"Block $blockId not found in ExternalBlockStore")
             }
           }
         }
@@ -1206,8 +1207,19 @@ private[spark] class BlockManager(
       bytes: ByteBuffer,
       serializer: Serializer = defaultSerializer): Iterator[Any] = {
     bytes.rewind()
-    val stream = wrapForCompression(blockId, new ByteBufferInputStream(bytes, true))
-    serializer.newInstance().deserializeStream(stream).asIterator
+    dataDeserializeStream(blockId, new ByteBufferInputStream(bytes, true), serializer)
+  }
+
+  /**
+   * Deserializes a InputStream into an iterator of values and disposes of it when the end of
+   * the iterator is reached.
+   */
+  def dataDeserializeStream(
+      blockId: BlockId,
+      inputStream: InputStream,
+      serializer: Serializer = defaultSerializer): Iterator[Any] = {
+    val stream = new BufferedInputStream(inputStream)
+    serializer.newInstance().deserializeStream(wrapForCompression(blockId, stream)).asIterator
   }
 
   def stop(): Unit = {
diff --git a/core/src/main/scala/org/apache/spark/storage/ExternalBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/ExternalBlockManager.scala
index 8964762df6af3..f39325a12d244 100644
--- a/core/src/main/scala/org/apache/spark/storage/ExternalBlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/ExternalBlockManager.scala
@@ -32,6 +32,8 @@ import java.nio.ByteBuffer
  */
 private[spark] abstract class ExternalBlockManager {
 
+  protected var blockManager: BlockManager = _
+
   override def toString: String = {"External Block Store"}
 
   /**
@@ -41,7 +43,9 @@ private[spark] abstract class ExternalBlockManager {
    *
    * @throws java.io.IOException if there is any file system failure during the initialization.
    */
-  def init(blockManager: BlockManager, executorId: String): Unit
+  def init(blockManager: BlockManager, executorId: String): Unit = {
+    this.blockManager = blockManager
+  }
 
   /**
    * Drop the block from underlying external block store, if it exists..
@@ -73,6 +77,11 @@ private[spark] abstract class ExternalBlockManager {
    */
   def putBytes(blockId: BlockId, bytes: ByteBuffer): Unit
 
+  def putValues(blockId: BlockId, values: Iterator[_]): Unit = {
+    val bytes = blockManager.dataSerialize(blockId, values)
+    putBytes(blockId, bytes)
+  }
+
   /**
    * Retrieve the block bytes.
    * @return Some(ByteBuffer) if the block bytes is successfully retrieved
@@ -82,6 +91,17 @@ private[spark] abstract class ExternalBlockManager {
    */
   def getBytes(blockId: BlockId): Option[ByteBuffer]
 
+  /**
+   * Retrieve the block data.
+   * @return Some(Iterator[Any]) if the block data is successfully retrieved
+   *         None if the block does not exist in the external block store.
+   *
+   * @throws java.io.IOException if there is any file system failure in getting the block.
+   */
+  def getValues(blockId: BlockId): Option[Iterator[_]] = {
+    getBytes(blockId).map(buffer => blockManager.dataDeserialize(blockId, buffer))
+  }
+
   /**
    * Get the size of the block saved in the underlying external block store,
    * which is saved before by putBytes.
diff --git a/core/src/main/scala/org/apache/spark/storage/ExternalBlockStore.scala b/core/src/main/scala/org/apache/spark/storage/ExternalBlockStore.scala
index 0bf770306ae9b..291394ed34816 100644
--- a/core/src/main/scala/org/apache/spark/storage/ExternalBlockStore.scala
+++ b/core/src/main/scala/org/apache/spark/storage/ExternalBlockStore.scala
@@ -18,9 +18,11 @@
 package org.apache.spark.storage
 
 import java.nio.ByteBuffer
+
+import scala.util.control.NonFatal
+
 import org.apache.spark.Logging
 import org.apache.spark.util.Utils
-import scala.util.control.NonFatal
 
 
 /**
@@ -40,7 +42,7 @@ private[spark] class ExternalBlockStore(blockManager: BlockManager, executorId:
       externalBlockManager.map(_.getSize(blockId)).getOrElse(0)
     } catch {
       case NonFatal(t) =>
-        logError(s"error in getSize from $blockId", t)
+        logError(s"Error in getSize($blockId)", t)
         0L
     }
   }
@@ -54,7 +56,7 @@ private[spark] class ExternalBlockStore(blockManager: BlockManager, executorId:
       values: Array[Any],
       level: StorageLevel,
       returnValues: Boolean): PutResult = {
-    putIterator(blockId, values.toIterator, level, returnValues)
+    putIntoExternalBlockStore(blockId, values.toIterator, returnValues)
   }
 
   override def putIterator(
@@ -62,42 +64,70 @@ private[spark] class ExternalBlockStore(blockManager: BlockManager, executorId:
       values: Iterator[Any],
       level: StorageLevel,
       returnValues: Boolean): PutResult = {
-    logDebug(s"Attempting to write values for block $blockId")
-    val bytes = blockManager.dataSerialize(blockId, values)
-    putIntoExternalBlockStore(blockId, bytes, returnValues)
+    putIntoExternalBlockStore(blockId, values, returnValues)
   }
 
   private def putIntoExternalBlockStore(
       blockId: BlockId,
-      bytes: ByteBuffer,
+      values: Iterator[_],
       returnValues: Boolean): PutResult = {
-    // So that we do not modify the input offsets !
-    // duplicate does not copy buffer, so inexpensive
-    val byteBuffer = bytes.duplicate()
-    byteBuffer.rewind()
-    logDebug(s"Attempting to put block $blockId into ExtBlk store")
+    logTrace(s"Attempting to put block $blockId into ExternalBlockStore")
     // we should never hit here if externalBlockManager is None. Handle it anyway for safety.
     try {
       val startTime = System.currentTimeMillis
       if (externalBlockManager.isDefined) {
-        externalBlockManager.get.putBytes(blockId, bytes)
+        externalBlockManager.get.putValues(blockId, values)
+        val size = getSize(blockId)
+        val data = if (returnValues) {
+          Left(getValues(blockId).get)
+        } else {
+          null
+        }
         val finishTime = System.currentTimeMillis
         logDebug("Block %s stored as %s file in ExternalBlockStore in %d ms".format(
-          blockId, Utils.bytesToString(byteBuffer.limit), finishTime - startTime))
+          blockId, Utils.bytesToString(size), finishTime - startTime))
+        PutResult(size, data)
+      } else {
+        logError(s"Error in putValues($blockId): no ExternalBlockManager has been configured")
+        PutResult(-1, null, Seq((blockId, BlockStatus.empty)))
+      }
+    } catch {
+      case NonFatal(t) =>
+        logError(s"Error in putValues($blockId)", t)
+        PutResult(-1, null, Seq((blockId, BlockStatus.empty)))
+    }
+  }
 
-        if (returnValues) {
-          PutResult(bytes.limit(), Right(bytes.duplicate()))
+  private def putIntoExternalBlockStore(
+      blockId: BlockId,
+      bytes: ByteBuffer,
+      returnValues: Boolean): PutResult = {
+    logTrace(s"Attempting to put block $blockId into ExternalBlockStore")
+    // we should never hit here if externalBlockManager is None. Handle it anyway for safety.
+    try {
+      val startTime = System.currentTimeMillis
+      if (externalBlockManager.isDefined) {
+        val byteBuffer = bytes.duplicate()
+        byteBuffer.rewind()
+        externalBlockManager.get.putBytes(blockId, byteBuffer)
+        val size = bytes.limit()
+        val data = if (returnValues) {
+          Right(bytes)
         } else {
-          PutResult(bytes.limit(), null)
+          null
         }
+        val finishTime = System.currentTimeMillis
+        logDebug("Block %s stored as %s file in ExternalBlockStore in %d ms".format(
+          blockId, Utils.bytesToString(size), finishTime - startTime))
+        PutResult(size, data)
       } else {
-        logError(s"error in putBytes $blockId")
-        PutResult(bytes.limit(), null, Seq((blockId, BlockStatus.empty)))
+        logError(s"Error in putBytes($blockId): no ExternalBlockManager has been configured")
+        PutResult(-1, null, Seq((blockId, BlockStatus.empty)))
       }
     } catch {
       case NonFatal(t) =>
-        logError(s"error in putBytes $blockId", t)
-        PutResult(bytes.limit(), null, Seq((blockId, BlockStatus.empty)))
+        logError(s"Error in putBytes($blockId)", t)
+        PutResult(-1, null, Seq((blockId, BlockStatus.empty)))
     }
   }
 
@@ -107,13 +137,19 @@ private[spark] class ExternalBlockStore(blockManager: BlockManager, executorId:
       externalBlockManager.map(_.removeBlock(blockId)).getOrElse(true)
     } catch {
       case NonFatal(t) =>
-        logError(s"error in removing $blockId", t)
+        logError(s"Error in removeBlock($blockId)", t)
         true
     }
   }
 
   override def getValues(blockId: BlockId): Option[Iterator[Any]] = {
-    getBytes(blockId).map(buffer => blockManager.dataDeserialize(blockId, buffer))
+    try {
+      externalBlockManager.flatMap(_.getValues(blockId))
+    } catch {
+      case NonFatal(t) =>
+        logError(s"Error in getValues($blockId)", t)
+        None
+    }
   }
 
   override def getBytes(blockId: BlockId): Option[ByteBuffer] = {
@@ -121,7 +157,7 @@ private[spark] class ExternalBlockStore(blockManager: BlockManager, executorId:
       externalBlockManager.flatMap(_.getBytes(blockId))
     } catch {
       case NonFatal(t) =>
-        logError(s"error in getBytes from $blockId", t)
+        logError(s"Error in getBytes($blockId)", t)
         None
     }
   }
@@ -130,13 +166,13 @@ private[spark] class ExternalBlockStore(blockManager: BlockManager, executorId:
     try {
       val ret = externalBlockManager.map(_.blockExists(blockId)).getOrElse(false)
       if (!ret) {
-        logInfo(s"remove block $blockId")
+        logInfo(s"Remove block $blockId")
         blockManager.removeBlock(blockId, true)
       }
       ret
     } catch {
       case NonFatal(t) =>
-        logError(s"error in getBytes from $blockId", t)
+        logError(s"Error in getBytes($blockId)", t)
         false
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala
index bdc6276e41915..fb4ba0eac9d9a 100644
--- a/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala
@@ -22,7 +22,10 @@ import java.nio.ByteBuffer
 import java.text.SimpleDateFormat
 import java.util.{Date, Random}
 
+import scala.util.control.NonFatal
+
 import com.google.common.io.ByteStreams
+
 import tachyon.client.{ReadType, WriteType, TachyonFS, TachyonFile}
 import tachyon.TachyonURI
 
@@ -38,7 +41,6 @@ import org.apache.spark.util.Utils
  */
 private[spark] class TachyonBlockManager() extends ExternalBlockManager with Logging {
 
-  var blockManager: BlockManager =_
   var rootDirs: String = _
   var master: String = _
   var client: tachyon.client.TachyonFS = _
@@ -52,7 +54,7 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log
 
 
   override def init(blockManager: BlockManager, executorId: String): Unit = {
-    this.blockManager = blockManager
+    super.init(blockManager, executorId)
     val storeDir = blockManager.conf.get(ExternalBlockStore.BASE_DIR, "/tmp_spark_tachyon")
     val appFolderName = blockManager.conf.get(ExternalBlockStore.FOLD_NAME)
 
@@ -95,8 +97,29 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log
   override def putBytes(blockId: BlockId, bytes: ByteBuffer): Unit = {
     val file = getFile(blockId)
     val os = file.getOutStream(WriteType.TRY_CACHE)
-    os.write(bytes.array())
-    os.close()
+    try {
+      os.write(bytes.array())
+    } catch {
+      case NonFatal(e) => 
+        logWarning(s"Failed to put bytes of block $blockId into Tachyon", e)
+        os.cancel()
+    } finally {
+      os.close()
+    }
+  }
+
+  override def putValues(blockId: BlockId, values: Iterator[_]): Unit = {
+    val file = getFile(blockId)
+    val os = file.getOutStream(WriteType.TRY_CACHE)
+    try {
+      blockManager.dataSerializeStream(blockId, os, values)
+    } catch {
+      case NonFatal(e) => 
+        logWarning(s"Failed to put values of block $blockId into Tachyon", e)
+        os.cancel()
+    } finally {
+      os.close()
+    }
   }
 
   override def getBytes(blockId: BlockId): Option[ByteBuffer] = {
@@ -105,21 +128,31 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log
       return None
     }
     val is = file.getInStream(ReadType.CACHE)
-    assert (is != null)
     try {
       val size = file.length
       val bs = new Array[Byte](size.asInstanceOf[Int])
       ByteStreams.readFully(is, bs)
       Some(ByteBuffer.wrap(bs))
     } catch {
-      case ioe: IOException =>
-        logWarning(s"Failed to fetch the block $blockId from Tachyon", ioe)
+      case NonFatal(e) =>
+        logWarning(s"Failed to get bytes of block $blockId from Tachyon", e)
         None
     } finally {
       is.close()
     }
   }
 
+  override def getValues(blockId: BlockId): Option[Iterator[_]] = {
+    val file = getFile(blockId)
+    if (file == null || file.getLocationHosts().size() == 0) {
+      return None
+    }
+    val is = file.getInStream(ReadType.CACHE)
+    Option(is).map { is =>
+      blockManager.dataDeserializeStream(blockId, is)
+    }
+  }
+
   override def getSize(blockId: BlockId): Long = {
     getFile(blockId.name).length
   }
@@ -184,7 +217,7 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log
             tachyonDir = client.getFile(path)
           }
         } catch {
-          case e: Exception =>
+          case NonFatal(e) =>
             logWarning("Attempt " + tries + " to create tachyon dir " + tachyonDir + " failed", e)
         }
       }
@@ -206,7 +239,7 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log
           Utils.deleteRecursively(tachyonDir, client)
         }
       } catch {
-        case e: Exception =>
+        case NonFatal(e) =>
           logError("Exception while deleting tachyon spark dir: " + tachyonDir, e)
       }
     }

From 8ddcb25b3990ec691463f87d4071e7425f4909a9 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Wed, 20 May 2015 23:05:54 -0700
Subject: [PATCH 105/525] [SPARK-7606] [SQL] [PySpark] add version to Python
 SQL API docs

Add version info for public Python SQL API.

cc rxin

Author: Davies Liu <davies@databricks.com>

Closes #6295 from davies/versions and squashes the following commits:

cfd91e6 [Davies Liu] add more version for DataFrame API
600834d [Davies Liu] add version to SQL API docs
---
 python/pyspark/sql/__init__.py   |  7 ++++
 python/pyspark/sql/column.py     | 12 ++++++
 python/pyspark/sql/context.py    | 29 ++++++++++++--
 python/pyspark/sql/dataframe.py  | 68 +++++++++++++++++++++++++++++++-
 python/pyspark/sql/functions.py  | 47 +++++++++++++++-------
 python/pyspark/sql/group.py      | 10 +++++
 python/pyspark/sql/readwriter.py | 15 +++++++
 7 files changed, 170 insertions(+), 18 deletions(-)

diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 634c575ecd80e..66b0bff2908b7 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -41,6 +41,13 @@
 """
 from __future__ import absolute_import
 
+
+def since(version):
+    def deco(f):
+        f.__doc__ = f.__doc__.rstrip() + "\n\n.. versionadded:: %s" % version
+        return f
+    return deco
+
 # fix the module name conflict for Python 3+
 import sys
 from . import _types as types
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index fc7ad674daa5b..d03bb6d33dd03 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -23,6 +23,7 @@
 
 from pyspark.context import SparkContext
 from pyspark.rdd import ignore_unicode_prefix
+from pyspark.sql import since
 from pyspark.sql.types import *
 
 __all__ = ["DataFrame", "Column", "SchemaRDD", "DataFrameNaFunctions",
@@ -114,6 +115,8 @@ class Column(object):
         # 2. Create from an expression
         df.colName + 1
         1 / df.colName
+
+    .. versionadded:: 1.3
     """
 
     def __init__(self, jc):
@@ -159,6 +162,7 @@ def __init__(self, jc):
     bitwiseAND = _bin_op("bitwiseAND")
     bitwiseXOR = _bin_op("bitwiseXOR")
 
+    @since(1.3)
     def getItem(self, key):
         """An expression that gets an item at position `ordinal` out of a list,
          or gets an item by key out of a dict.
@@ -179,6 +183,7 @@ def getItem(self, key):
         """
         return self[key]
 
+    @since(1.3)
     def getField(self, name):
         """An expression that gets a field by name in a StructField.
 
@@ -211,6 +216,7 @@ def __getattr__(self, item):
     endswith = _bin_op("endsWith")
 
     @ignore_unicode_prefix
+    @since(1.3)
     def substr(self, startPos, length):
         """
         Return a :class:`Column` which is a substring of the column
@@ -234,6 +240,7 @@ def substr(self, startPos, length):
     __getslice__ = substr
 
     @ignore_unicode_prefix
+    @since(1.3)
     def inSet(self, *cols):
         """ A boolean expression that is evaluated to true if the value of this
         expression is contained by the evaluated values of the arguments.
@@ -259,6 +266,7 @@ def inSet(self, *cols):
     isNull = _unary_op("isNull", "True if the current expression is null.")
     isNotNull = _unary_op("isNotNull", "True if the current expression is not null.")
 
+    @since(1.3)
     def alias(self, *alias):
         """Returns this column aliased with a new name or names (in the case of expressions that
         return more than one column, such as explode).
@@ -274,6 +282,7 @@ def alias(self, *alias):
             return Column(getattr(self._jc, "as")(_to_seq(sc, list(alias))))
 
     @ignore_unicode_prefix
+    @since(1.3)
     def cast(self, dataType):
         """ Convert the column into type `dataType`
 
@@ -294,6 +303,7 @@ def cast(self, dataType):
         return Column(jc)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def between(self, lowerBound, upperBound):
         """ A boolean expression that is evaluated to true if the value of this
         expression is between the given columns.
@@ -301,6 +311,7 @@ def between(self, lowerBound, upperBound):
         return (self >= lowerBound) & (self <= upperBound)
 
     @ignore_unicode_prefix
+    @since(1.4)
     def when(self, condition, value):
         """Evaluates a list of conditions and returns one of multiple possible result expressions.
         If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
@@ -319,6 +330,7 @@ def when(self, condition, value):
         return Column(jc)
 
     @ignore_unicode_prefix
+    @since(1.4)
     def otherwise(self, value):
         """Evaluates a list of conditions and returns one of multiple possible result expressions.
         If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 7543475014bd2..51f12c5bb4198 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -28,6 +28,7 @@
 
 from pyspark.rdd import RDD, _prepare_for_python_RDD, ignore_unicode_prefix
 from pyspark.serializers import AutoBatchedSerializer, PickleSerializer
+from pyspark.sql import since
 from pyspark.sql.types import Row, StringType, StructType, _verify_type, \
     _infer_schema, _has_nulltype, _merge_type, _create_converter, _python_to_sql_converter
 from pyspark.sql.dataframe import DataFrame
@@ -106,11 +107,13 @@ def _ssql_ctx(self):
             self._scala_SQLContext = self._jvm.SQLContext(self._jsc.sc())
         return self._scala_SQLContext
 
+    @since(1.3)
     def setConf(self, key, value):
         """Sets the given Spark SQL configuration property.
         """
         self._ssql_ctx.setConf(key, value)
 
+    @since(1.3)
     def getConf(self, key, defaultValue):
         """Returns the value of Spark SQL configuration property for the given key.
 
@@ -119,10 +122,12 @@ def getConf(self, key, defaultValue):
         return self._ssql_ctx.getConf(key, defaultValue)
 
     @property
+    @since("1.3.1")
     def udf(self):
         """Returns a :class:`UDFRegistration` for UDF registration."""
         return UDFRegistration(self)
 
+    @since(1.4)
     def range(self, start, end, step=1, numPartitions=None):
         """
         Create a :class:`DataFrame` with single LongType column named `id`,
@@ -144,6 +149,7 @@ def range(self, start, end, step=1, numPartitions=None):
         return DataFrame(jdf, self)
 
     @ignore_unicode_prefix
+    @since(1.2)
     def registerFunction(self, name, f, returnType=StringType()):
         """Registers a lambda function as a UDF so it can be used in SQL statements.
 
@@ -210,7 +216,8 @@ def _inferSchema(self, rdd, samplingRatio=None):
 
     @ignore_unicode_prefix
     def inferSchema(self, rdd, samplingRatio=None):
-        """::note: Deprecated in 1.3, use :func:`createDataFrame` instead.
+        """
+        .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.
         """
         warnings.warn("inferSchema is deprecated, please use createDataFrame instead")
 
@@ -221,7 +228,8 @@ def inferSchema(self, rdd, samplingRatio=None):
 
     @ignore_unicode_prefix
     def applySchema(self, rdd, schema):
-        """::note: Deprecated in 1.3, use :func:`createDataFrame` instead.
+        """
+        .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.
         """
         warnings.warn("applySchema is deprecated, please use createDataFrame instead")
 
@@ -233,6 +241,7 @@ def applySchema(self, rdd, schema):
 
         return self.createDataFrame(rdd, schema)
 
+    @since(1.3)
     @ignore_unicode_prefix
     def createDataFrame(self, data, schema=None, samplingRatio=None):
         """
@@ -337,6 +346,7 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
         df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
         return DataFrame(df, self)
 
+    @since(1.3)
     def registerDataFrameAsTable(self, df, tableName):
         """Registers the given :class:`DataFrame` as a temporary table in the catalog.
 
@@ -349,6 +359,7 @@ def registerDataFrameAsTable(self, df, tableName):
         else:
             raise ValueError("Can only register DataFrame as table")
 
+    @since(1.0)
     def parquetFile(self, *paths):
         """Loads a Parquet file, returning the result as a :class:`DataFrame`.
 
@@ -367,6 +378,7 @@ def parquetFile(self, *paths):
         jdf = self._ssql_ctx.parquetFile(jpaths)
         return DataFrame(jdf, self)
 
+    @since(1.0)
     def jsonFile(self, path, schema=None, samplingRatio=1.0):
         """Loads a text file storing one JSON object per line as a :class:`DataFrame`.
 
@@ -407,6 +419,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0):
         return DataFrame(df, self)
 
     @ignore_unicode_prefix
+    @since(1.0)
     def jsonRDD(self, rdd, schema=None, samplingRatio=1.0):
         """Loads an RDD storing one JSON object per string as a :class:`DataFrame`.
 
@@ -449,6 +462,7 @@ def func(iterator):
             df = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype)
         return DataFrame(df, self)
 
+    @since(1.3)
     def load(self, path=None, source=None, schema=None, **options):
         """Returns the dataset in a data source as a :class:`DataFrame`.
 
@@ -460,6 +474,7 @@ def load(self, path=None, source=None, schema=None, **options):
         """
         return self.read.load(path, source, schema, **options)
 
+    @since(1.3)
     def createExternalTable(self, tableName, path=None, source=None,
                             schema=None, **options):
         """Creates an external table based on the dataset in a data source.
@@ -489,6 +504,7 @@ def createExternalTable(self, tableName, path=None, source=None,
         return DataFrame(df, self)
 
     @ignore_unicode_prefix
+    @since(1.0)
     def sql(self, sqlQuery):
         """Returns a :class:`DataFrame` representing the result of the given query.
 
@@ -499,6 +515,7 @@ def sql(self, sqlQuery):
         """
         return DataFrame(self._ssql_ctx.sql(sqlQuery), self)
 
+    @since(1.0)
     def table(self, tableName):
         """Returns the specified table as a :class:`DataFrame`.
 
@@ -510,6 +527,7 @@ def table(self, tableName):
         return DataFrame(self._ssql_ctx.table(tableName), self)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def tables(self, dbName=None):
         """Returns a :class:`DataFrame` containing names of tables in the given database.
 
@@ -528,6 +546,7 @@ def tables(self, dbName=None):
         else:
             return DataFrame(self._ssql_ctx.tables(dbName), self)
 
+    @since(1.3)
     def tableNames(self, dbName=None):
         """Returns a list of names of tables in the database ``dbName``.
 
@@ -544,25 +563,29 @@ def tableNames(self, dbName=None):
         else:
             return [name for name in self._ssql_ctx.tableNames(dbName)]
 
+    @since(1.0)
     def cacheTable(self, tableName):
         """Caches the specified table in-memory."""
         self._ssql_ctx.cacheTable(tableName)
 
+    @since(1.0)
     def uncacheTable(self, tableName):
         """Removes the specified table from the in-memory cache."""
         self._ssql_ctx.uncacheTable(tableName)
 
+    @since(1.3)
     def clearCache(self):
         """Removes all cached tables from the in-memory cache. """
         self._ssql_ctx.clearCache()
 
     @property
+    @since(1.4)
     def read(self):
         """
         Returns a :class:`DataFrameReader` that can be used to read data
         in as a :class:`DataFrame`.
 
-        ::note: Experimental
+        .. note:: Experimental
 
         >>> sqlContext.read
         <pyspark.sql.readwriter.DataFrameReader object at ...>
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index f2280b5100e53..3fc7d0048edf6 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -29,6 +29,7 @@
 from pyspark.serializers import BatchedSerializer, PickleSerializer, UTF8Deserializer
 from pyspark.storagelevel import StorageLevel
 from pyspark.traceback_utils import SCCallSiteSync
+from pyspark.sql import since
 from pyspark.sql.types import _create_cls, _parse_datatype_json_string
 from pyspark.sql.column import Column, _to_seq, _to_java_column
 from pyspark.sql.readwriter import DataFrameWriter
@@ -60,6 +61,8 @@ class DataFrame(object):
 
         people.filter(people.age > 30).join(department, people.deptId == department.id)) \
           .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})
+
+    .. versionadded:: 1.3
     """
 
     def __init__(self, jdf, sql_ctx):
@@ -71,6 +74,7 @@ def __init__(self, jdf, sql_ctx):
         self._lazy_rdd = None
 
     @property
+    @since(1.3)
     def rdd(self):
         """Returns the content as an :class:`pyspark.RDD` of :class:`Row`.
         """
@@ -88,18 +92,21 @@ def applySchema(it):
         return self._lazy_rdd
 
     @property
+    @since("1.3.1")
     def na(self):
         """Returns a :class:`DataFrameNaFunctions` for handling missing values.
         """
         return DataFrameNaFunctions(self)
 
     @property
+    @since(1.4)
     def stat(self):
         """Returns a :class:`DataFrameStatFunctions` for statistic functions.
         """
         return DataFrameStatFunctions(self)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def toJSON(self, use_unicode=True):
         """Converts a :class:`DataFrame` into a :class:`RDD` of string.
 
@@ -111,6 +118,7 @@ def toJSON(self, use_unicode=True):
         rdd = self._jdf.toJSON()
         return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode))
 
+    @since(1.3)
     def saveAsParquetFile(self, path):
         """Saves the contents as a Parquet file, preserving the schema.
 
@@ -127,6 +135,7 @@ def saveAsParquetFile(self, path):
         """
         self._jdf.saveAsParquetFile(path)
 
+    @since(1.3)
     def registerTempTable(self, name):
         """Registers this RDD as a temporary table using the given name.
 
@@ -140,11 +149,13 @@ def registerTempTable(self, name):
         """
         self._jdf.registerTempTable(name)
 
+    @since(1.3)
     def registerAsTable(self, name):
         """DEPRECATED: use :func:`registerTempTable` instead"""
         warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning)
         self.registerTempTable(name)
 
+    @since(1.3)
     def insertInto(self, tableName, overwrite=False):
         """Inserts the contents of this :class:`DataFrame` into the specified table.
 
@@ -152,6 +163,7 @@ def insertInto(self, tableName, overwrite=False):
         """
         self._jdf.insertInto(tableName, overwrite)
 
+    @since(1.3)
     def saveAsTable(self, tableName, source=None, mode="error", **options):
         """Saves the contents of this :class:`DataFrame` to a data source as a table.
 
@@ -169,6 +181,7 @@ def saveAsTable(self, tableName, source=None, mode="error", **options):
         """
         self.write.saveAsTable(tableName, source, mode, **options)
 
+    @since(1.3)
     def save(self, path=None, source=None, mode="error", **options):
         """Saves the contents of the :class:`DataFrame` to a data source.
 
@@ -187,6 +200,7 @@ def save(self, path=None, source=None, mode="error", **options):
         return self.write.save(path, source, mode, **options)
 
     @property
+    @since(1.4)
     def write(self):
         """
         Interface for saving the content of the :class:`DataFrame` out
@@ -194,7 +208,7 @@ def write(self):
 
         :return :class:`DataFrameWriter`
 
-        ::note: Experimental
+        .. note:: Experimental
 
         >>> df.write
         <pyspark.sql.readwriter.DataFrameWriter object at ...>
@@ -202,6 +216,7 @@ def write(self):
         return DataFrameWriter(self)
 
     @property
+    @since(1.3)
     def schema(self):
         """Returns the schema of this :class:`DataFrame` as a :class:`types.StructType`.
 
@@ -212,6 +227,7 @@ def schema(self):
             self._schema = _parse_datatype_json_string(self._jdf.schema().json())
         return self._schema
 
+    @since(1.3)
     def printSchema(self):
         """Prints out the schema in the tree format.
 
@@ -223,6 +239,7 @@ def printSchema(self):
         """
         print(self._jdf.schema().treeString())
 
+    @since(1.3)
     def explain(self, extended=False):
         """Prints the (logical and physical) plans to the console for debugging purpose.
 
@@ -248,12 +265,14 @@ def explain(self, extended=False):
         else:
             print(self._jdf.queryExecution().executedPlan().toString())
 
+    @since(1.3)
     def isLocal(self):
         """Returns ``True`` if the :func:`collect` and :func:`take` methods can be run locally
         (without any Spark executors).
         """
         return self._jdf.isLocal()
 
+    @since(1.3)
     def show(self, n=20):
         """Prints the first ``n`` rows to the console.
 
@@ -272,6 +291,7 @@ def show(self, n=20):
     def __repr__(self):
         return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes))
 
+    @since(1.3)
     def count(self):
         """Returns the number of rows in this :class:`DataFrame`.
 
@@ -281,6 +301,7 @@ def count(self):
         return int(self._jdf.count())
 
     @ignore_unicode_prefix
+    @since(1.3)
     def collect(self):
         """Returns all the records as a list of :class:`Row`.
 
@@ -294,6 +315,7 @@ def collect(self):
         return [cls(r) for r in rs]
 
     @ignore_unicode_prefix
+    @since(1.3)
     def limit(self, num):
         """Limits the result count to the number specified.
 
@@ -306,6 +328,7 @@ def limit(self, num):
         return DataFrame(jdf, self.sql_ctx)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def take(self, num):
         """Returns the first ``num`` rows as a :class:`list` of :class:`Row`.
 
@@ -315,6 +338,7 @@ def take(self, num):
         return self.limit(num).collect()
 
     @ignore_unicode_prefix
+    @since(1.3)
     def map(self, f):
         """ Returns a new :class:`RDD` by applying a the ``f`` function to each :class:`Row`.
 
@@ -326,6 +350,7 @@ def map(self, f):
         return self.rdd.map(f)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def flatMap(self, f):
         """ Returns a new :class:`RDD` by first applying the ``f`` function to each :class:`Row`,
         and then flattening the results.
@@ -337,6 +362,7 @@ def flatMap(self, f):
         """
         return self.rdd.flatMap(f)
 
+    @since(1.3)
     def mapPartitions(self, f, preservesPartitioning=False):
         """Returns a new :class:`RDD` by applying the ``f`` function to each partition.
 
@@ -349,6 +375,7 @@ def mapPartitions(self, f, preservesPartitioning=False):
         """
         return self.rdd.mapPartitions(f, preservesPartitioning)
 
+    @since(1.3)
     def foreach(self, f):
         """Applies the ``f`` function to all :class:`Row` of this :class:`DataFrame`.
 
@@ -360,6 +387,7 @@ def foreach(self, f):
         """
         return self.rdd.foreach(f)
 
+    @since(1.3)
     def foreachPartition(self, f):
         """Applies the ``f`` function to each partition of this :class:`DataFrame`.
 
@@ -372,6 +400,7 @@ def foreachPartition(self, f):
         """
         return self.rdd.foreachPartition(f)
 
+    @since(1.3)
     def cache(self):
         """ Persists with the default storage level (C{MEMORY_ONLY_SER}).
         """
@@ -379,6 +408,7 @@ def cache(self):
         self._jdf.cache()
         return self
 
+    @since(1.3)
     def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER):
         """Sets the storage level to persist its values across operations
         after the first time it is computed. This can only be used to assign
@@ -390,6 +420,7 @@ def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER):
         self._jdf.persist(javaStorageLevel)
         return self
 
+    @since(1.3)
     def unpersist(self, blocking=True):
         """Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from
         memory and disk.
@@ -398,6 +429,7 @@ def unpersist(self, blocking=True):
         self._jdf.unpersist(blocking)
         return self
 
+    @since(1.4)
     def coalesce(self, numPartitions):
         """
         Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.
@@ -412,6 +444,7 @@ def coalesce(self, numPartitions):
         """
         return DataFrame(self._jdf.coalesce(numPartitions), self.sql_ctx)
 
+    @since(1.3)
     def repartition(self, numPartitions):
         """Returns a new :class:`DataFrame` that has exactly ``numPartitions`` partitions.
 
@@ -420,6 +453,7 @@ def repartition(self, numPartitions):
         """
         return DataFrame(self._jdf.repartition(numPartitions), self.sql_ctx)
 
+    @since(1.3)
     def distinct(self):
         """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
 
@@ -428,6 +462,7 @@ def distinct(self):
         """
         return DataFrame(self._jdf.distinct(), self.sql_ctx)
 
+    @since(1.3)
     def sample(self, withReplacement, fraction, seed=None):
         """Returns a sampled subset of this :class:`DataFrame`.
 
@@ -439,6 +474,7 @@ def sample(self, withReplacement, fraction, seed=None):
         rdd = self._jdf.sample(withReplacement, fraction, long(seed))
         return DataFrame(rdd, self.sql_ctx)
 
+    @since(1.4)
     def randomSplit(self, weights, seed=None):
         """Randomly splits this :class:`DataFrame` with the provided weights.
 
@@ -461,6 +497,7 @@ def randomSplit(self, weights, seed=None):
         return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array]
 
     @property
+    @since(1.3)
     def dtypes(self):
         """Returns all column names and their data types as a list.
 
@@ -471,6 +508,7 @@ def dtypes(self):
 
     @property
     @ignore_unicode_prefix
+    @since(1.3)
     def columns(self):
         """Returns all column names as a list.
 
@@ -480,6 +518,7 @@ def columns(self):
         return [f.name for f in self.schema.fields]
 
     @ignore_unicode_prefix
+    @since(1.3)
     def alias(self, alias):
         """Returns a new :class:`DataFrame` with an alias set.
 
@@ -494,6 +533,7 @@ def alias(self, alias):
         return DataFrame(getattr(self._jdf, "as")(alias), self.sql_ctx)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def join(self, other, joinExprs=None, joinType=None):
         """Joins with another :class:`DataFrame`, using the given join expression.
 
@@ -527,6 +567,7 @@ def join(self, other, joinExprs=None, joinType=None):
         return DataFrame(jdf, self.sql_ctx)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def sort(self, *cols, **kwargs):
         """Returns a new :class:`DataFrame` sorted by the specified column(s).
 
@@ -586,6 +627,7 @@ def _jcols(self, *cols):
             cols = cols[0]
         return self._jseq(cols, _to_java_column)
 
+    @since("1.3.1")
     def describe(self, *cols):
         """Computes statistics for numeric columns.
 
@@ -607,6 +649,7 @@ def describe(self, *cols):
         return DataFrame(jdf, self.sql_ctx)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def head(self, n=None):
         """
         Returns the first ``n`` rows as a list of :class:`Row`,
@@ -623,6 +666,7 @@ def head(self, n=None):
         return self.take(n)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def first(self):
         """Returns the first row as a :class:`Row`.
 
@@ -632,6 +676,7 @@ def first(self):
         return self.head()
 
     @ignore_unicode_prefix
+    @since(1.3)
     def __getitem__(self, item):
         """Returns the column as a :class:`Column`.
 
@@ -659,6 +704,7 @@ def __getitem__(self, item):
         else:
             raise TypeError("unexpected item type: %s" % type(item))
 
+    @since(1.3)
     def __getattr__(self, name):
         """Returns the :class:`Column` denoted by ``name``.
 
@@ -672,6 +718,7 @@ def __getattr__(self, name):
         return Column(jc)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def select(self, *cols):
         """Projects a set of expressions and returns a new :class:`DataFrame`.
 
@@ -689,6 +736,7 @@ def select(self, *cols):
         jdf = self._jdf.select(self._jcols(*cols))
         return DataFrame(jdf, self.sql_ctx)
 
+    @since(1.3)
     def selectExpr(self, *expr):
         """Projects a set of SQL expressions and returns a new :class:`DataFrame`.
 
@@ -703,6 +751,7 @@ def selectExpr(self, *expr):
         return DataFrame(jdf, self.sql_ctx)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def filter(self, condition):
         """Filters rows using the given condition.
 
@@ -732,6 +781,7 @@ def filter(self, condition):
     where = filter
 
     @ignore_unicode_prefix
+    @since(1.3)
     def groupBy(self, *cols):
         """Groups the :class:`DataFrame` using the specified columns,
         so we can run aggregation on them. See :class:`GroupedData`
@@ -755,6 +805,7 @@ def groupBy(self, *cols):
         from pyspark.sql.group import GroupedData
         return GroupedData(jdf, self.sql_ctx)
 
+    @since(1.3)
     def agg(self, *exprs):
         """ Aggregate on the entire :class:`DataFrame` without groups
         (shorthand for ``df.groupBy.agg()``).
@@ -767,6 +818,7 @@ def agg(self, *exprs):
         """
         return self.groupBy().agg(*exprs)
 
+    @since(1.3)
     def unionAll(self, other):
         """ Return a new :class:`DataFrame` containing union of rows in this
         frame and another frame.
@@ -775,6 +827,7 @@ def unionAll(self, other):
         """
         return DataFrame(self._jdf.unionAll(other._jdf), self.sql_ctx)
 
+    @since(1.3)
     def intersect(self, other):
         """ Return a new :class:`DataFrame` containing rows only in
         both this frame and another frame.
@@ -783,6 +836,7 @@ def intersect(self, other):
         """
         return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx)
 
+    @since(1.3)
     def subtract(self, other):
         """ Return a new :class:`DataFrame` containing rows in this frame
         but not in another frame.
@@ -791,6 +845,7 @@ def subtract(self, other):
         """
         return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx)
 
+    @since(1.4)
     def dropDuplicates(self, subset=None):
         """Return a new :class:`DataFrame` with duplicate rows removed,
         optionally only considering certain columns.
@@ -821,6 +876,7 @@ def dropDuplicates(self, subset=None):
             jdf = self._jdf.dropDuplicates(self._jseq(subset))
         return DataFrame(jdf, self.sql_ctx)
 
+    @since("1.3.1")
     def dropna(self, how='any', thresh=None, subset=None):
         """Returns a new :class:`DataFrame` omitting rows with null values.
 
@@ -863,6 +919,7 @@ def dropna(self, how='any', thresh=None, subset=None):
 
         return DataFrame(self._jdf.na().drop(thresh, self._jseq(subset)), self.sql_ctx)
 
+    @since("1.3.1")
     def fillna(self, value, subset=None):
         """Replace null values, alias for ``na.fill()``.
 
@@ -924,6 +981,7 @@ def fillna(self, value, subset=None):
 
             return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx)
 
+    @since(1.4)
     def replace(self, to_replace, value, subset=None):
         """Returns a new :class:`DataFrame` replacing a value with another value.
 
@@ -999,6 +1057,7 @@ def replace(self, to_replace, value, subset=None):
         return DataFrame(
             self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx)
 
+    @since(1.4)
     def corr(self, col1, col2, method=None):
         """
         Calculates the correlation of two columns of a DataFrame as a double value. Currently only
@@ -1020,6 +1079,7 @@ def corr(self, col1, col2, method=None):
                              "coefficient is supported.")
         return self._jdf.stat().corr(col1, col2, method)
 
+    @since(1.4)
     def cov(self, col1, col2):
         """
         Calculate the sample covariance for the given columns, specified by their names, as a
@@ -1034,6 +1094,7 @@ def cov(self, col1, col2):
             raise ValueError("col2 should be a string.")
         return self._jdf.stat().cov(col1, col2)
 
+    @since(1.4)
     def crosstab(self, col1, col2):
         """
         Computes a pair-wise frequency table of the given columns. Also known as a contingency
@@ -1055,6 +1116,7 @@ def crosstab(self, col1, col2):
             raise ValueError("col2 should be a string.")
         return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx)
 
+    @since(1.4)
     def freqItems(self, cols, support=None):
         """
         Finding frequent items for columns, possibly with false positives. Using the
@@ -1076,6 +1138,7 @@ def freqItems(self, cols, support=None):
         return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)
 
     @ignore_unicode_prefix
+    @since(1.3)
     def withColumn(self, colName, col):
         """Returns a new :class:`DataFrame` by adding a column.
 
@@ -1088,6 +1151,7 @@ def withColumn(self, colName, col):
         return self.select('*', col.alias(colName))
 
     @ignore_unicode_prefix
+    @since(1.3)
     def withColumnRenamed(self, existing, new):
         """Returns a new :class:`DataFrame` by renaming an existing column.
 
@@ -1102,6 +1166,7 @@ def withColumnRenamed(self, existing, new):
                 for c in self.columns]
         return self.select(*cols)
 
+    @since(1.4)
     @ignore_unicode_prefix
     def drop(self, colName):
         """Returns a new :class:`DataFrame` that drops the specified column.
@@ -1114,6 +1179,7 @@ def drop(self, colName):
         jdf = self._jdf.drop(colName)
         return DataFrame(jdf, self.sql_ctx)
 
+    @since(1.3)
     def toPandas(self):
         """Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.
 
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index fbe9bf5b526af..9b0d7f3e6656e 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -26,6 +26,7 @@
 from pyspark import SparkContext
 from pyspark.rdd import _prepare_for_python_RDD, ignore_unicode_prefix
 from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
+from pyspark.sql import since
 from pyspark.sql.types import StringType
 from pyspark.sql.column import Column, _to_java_column, _to_seq
 
@@ -78,6 +79,18 @@ def _(col1, col2):
     'sqrt': 'Computes the square root of the specified float value.',
     'abs': 'Computes the absolute value.',
 
+    'max': 'Aggregate function: returns the maximum value of the expression in a group.',
+    'min': 'Aggregate function: returns the minimum value of the expression in a group.',
+    'first': 'Aggregate function: returns the first value in a group.',
+    'last': 'Aggregate function: returns the last value in a group.',
+    'count': 'Aggregate function: returns the number of items in a group.',
+    'sum': 'Aggregate function: returns the sum of all values in the expression.',
+    'avg': 'Aggregate function: returns the average of the values in a group.',
+    'mean': 'Aggregate function: returns the average of the values in a group.',
+    'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.',
+}
+
+_functions_1_4 = {
     # unary math functions
     'acos': 'Computes the cosine inverse of the given value; the returned angle is in the range' +
             '0.0 through pi.',
@@ -102,21 +115,11 @@ def _(col1, col2):
     'tan': 'Computes the tangent of the given value.',
     'tanh': 'Computes the hyperbolic tangent of the given value.',
     'toDegrees': 'Converts an angle measured in radians to an approximately equivalent angle ' +
-             'measured in degrees.',
+                 'measured in degrees.',
     'toRadians': 'Converts an angle measured in degrees to an approximately equivalent angle ' +
-             'measured in radians.',
+                 'measured in radians.',
 
     'bitwiseNOT': 'Computes bitwise not.',
-
-    'max': 'Aggregate function: returns the maximum value of the expression in a group.',
-    'min': 'Aggregate function: returns the minimum value of the expression in a group.',
-    'first': 'Aggregate function: returns the first value in a group.',
-    'last': 'Aggregate function: returns the last value in a group.',
-    'count': 'Aggregate function: returns the number of items in a group.',
-    'sum': 'Aggregate function: returns the sum of all values in the expression.',
-    'avg': 'Aggregate function: returns the average of the values in a group.',
-    'mean': 'Aggregate function: returns the average of the values in a group.',
-    'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.',
 }
 
 # math functions that take two arguments as input
@@ -128,15 +131,18 @@ def _(col1, col2):
 }
 
 for _name, _doc in _functions.items():
-    globals()[_name] = _create_function(_name, _doc)
+    globals()[_name] = since(1.3)(_create_function(_name, _doc))
+for _name, _doc in _functions_1_4.items():
+    globals()[_name] = since(1.4)(_create_function(_name, _doc))
 for _name, _doc in _binary_mathfunctions.items():
-    globals()[_name] = _create_binary_mathfunction(_name, _doc)
+    globals()[_name] = since(1.4)(_create_binary_mathfunction(_name, _doc))
 del _name, _doc
 __all__ += _functions.keys()
 __all__ += _binary_mathfunctions.keys()
 __all__.sort()
 
 
+@since(1.4)
 def array(*cols):
     """Creates a new array column.
 
@@ -155,6 +161,7 @@ def array(*cols):
     return Column(jc)
 
 
+@since(1.3)
 def approxCountDistinct(col, rsd=None):
     """Returns a new :class:`Column` for approximate distinct count of ``col``.
 
@@ -169,6 +176,7 @@ def approxCountDistinct(col, rsd=None):
     return Column(jc)
 
 
+@since(1.4)
 def explode(col):
     """Returns a new row for each element in the given array or map.
 
@@ -189,6 +197,7 @@ def explode(col):
     return Column(jc)
 
 
+@since(1.4)
 def coalesce(*cols):
     """Returns the first column that is not null.
 
@@ -225,6 +234,7 @@ def coalesce(*cols):
     return Column(jc)
 
 
+@since(1.3)
 def countDistinct(col, *cols):
     """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.
 
@@ -239,6 +249,7 @@ def countDistinct(col, *cols):
     return Column(jc)
 
 
+@since(1.4)
 def monotonicallyIncreasingId():
     """A column that generates monotonically increasing 64-bit integers.
 
@@ -259,6 +270,7 @@ def monotonicallyIncreasingId():
     return Column(sc._jvm.functions.monotonicallyIncreasingId())
 
 
+@since(1.4)
 def rand(seed=None):
     """Generates a random column with i.i.d. samples from U[0.0, 1.0].
     """
@@ -270,6 +282,7 @@ def rand(seed=None):
     return Column(jc)
 
 
+@since(1.4)
 def randn(seed=None):
     """Generates a column with i.i.d. samples from the standard normal distribution.
     """
@@ -281,6 +294,7 @@ def randn(seed=None):
     return Column(jc)
 
 
+@since(1.4)
 def sparkPartitionId():
     """A column for partition ID of the Spark task.
 
@@ -294,6 +308,7 @@ def sparkPartitionId():
 
 
 @ignore_unicode_prefix
+@since(1.4)
 def struct(*cols):
     """Creates a new struct column.
 
@@ -312,6 +327,7 @@ def struct(*cols):
     return Column(jc)
 
 
+@since(1.4)
 def when(condition, value):
     """Evaluates a list of conditions and returns one of multiple possible result expressions.
     If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
@@ -336,6 +352,8 @@ def when(condition, value):
 class UserDefinedFunction(object):
     """
     User defined function in Python
+
+    .. versionadded:: 1.3
     """
     def __init__(self, func, returnType):
         self.func = func
@@ -369,6 +387,7 @@ def __call__(self, *cols):
         return Column(jc)
 
 
+@since(1.3)
 def udf(f, returnType=StringType()):
     """Creates a :class:`Column` expression representing a user defined function (UDF).
 
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 9f7c743c051d3..4da472a577eae 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -16,6 +16,7 @@
 #
 
 from pyspark.rdd import ignore_unicode_prefix
+from pyspark.sql import since
 from pyspark.sql.column import Column, _to_seq
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.types import *
@@ -47,6 +48,8 @@ class GroupedData(object):
     """
     A set of methods for aggregations on a :class:`DataFrame`,
     created by :func:`DataFrame.groupBy`.
+
+    .. versionadded:: 1.3
     """
 
     def __init__(self, jdf, sql_ctx):
@@ -54,6 +57,7 @@ def __init__(self, jdf, sql_ctx):
         self.sql_ctx = sql_ctx
 
     @ignore_unicode_prefix
+    @since(1.3)
     def agg(self, *exprs):
         """Compute aggregates and returns the result as a :class:`DataFrame`.
 
@@ -86,6 +90,7 @@ def agg(self, *exprs):
         return DataFrame(jdf, self.sql_ctx)
 
     @dfapi
+    @since(1.3)
     def count(self):
         """Counts the number of records for each group.
 
@@ -94,6 +99,7 @@ def count(self):
         """
 
     @df_varargs_api
+    @since(1.3)
     def mean(self, *cols):
         """Computes average values for each numeric columns for each group.
 
@@ -108,6 +114,7 @@ def mean(self, *cols):
         """
 
     @df_varargs_api
+    @since(1.3)
     def avg(self, *cols):
         """Computes average values for each numeric columns for each group.
 
@@ -122,6 +129,7 @@ def avg(self, *cols):
         """
 
     @df_varargs_api
+    @since(1.3)
     def max(self, *cols):
         """Computes the max value for each numeric columns for each group.
 
@@ -132,6 +140,7 @@ def max(self, *cols):
         """
 
     @df_varargs_api
+    @since(1.3)
     def min(self, *cols):
         """Computes the min value for each numeric column for each group.
 
@@ -144,6 +153,7 @@ def min(self, *cols):
         """
 
     @df_varargs_api
+    @since(1.3)
     def sum(self, *cols):
         """Compute the sum for each numeric columns for each group.
 
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index e2b27fb587e73..02b3aab2b12e4 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -17,6 +17,7 @@
 
 from py4j.java_gateway import JavaClass
 
+from pyspark.sql import since
 from pyspark.sql.column import _to_seq
 from pyspark.sql.types import *
 
@@ -30,6 +31,8 @@ class DataFrameReader(object):
     to access this.
 
     ::Note: Experimental
+
+    .. versionadded:: 1.4
     """
 
     def __init__(self, sqlContext):
@@ -40,6 +43,7 @@ def _df(self, jdf):
         from pyspark.sql.dataframe import DataFrame
         return DataFrame(jdf, self._sqlContext)
 
+    @since(1.4)
     def load(self, path=None, format=None, schema=None, **options):
         """Loads data from a data source and returns it as a :class`DataFrame`.
 
@@ -63,6 +67,7 @@ def load(self, path=None, format=None, schema=None, **options):
         else:
             return self._df(jreader.load())
 
+    @since(1.4)
     def json(self, path, schema=None):
         """
         Loads a JSON file (one object per line) and returns the result as
@@ -107,6 +112,7 @@ def json(self, path, schema=None):
             jdf = self._jreader.schema(jschema).json(path)
         return self._df(jdf)
 
+    @since(1.4)
     def table(self, tableName):
         """Returns the specified table as a :class:`DataFrame`.
 
@@ -117,6 +123,7 @@ def table(self, tableName):
         """
         return self._df(self._jreader.table(tableName))
 
+    @since(1.4)
     def parquet(self, *path):
         """Loads a Parquet file, returning the result as a :class:`DataFrame`.
 
@@ -130,6 +137,7 @@ def parquet(self, *path):
         """
         return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))
 
+    @since(1.4)
     def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None,
              predicates=None, properties={}):
         """
@@ -178,12 +186,15 @@ class DataFrameWriter(object):
     to access this.
 
     ::Note: Experimental
+
+    .. versionadded:: 1.4
     """
     def __init__(self, df):
         self._df = df
         self._sqlContext = df.sql_ctx
         self._jwrite = df._jdf.write()
 
+    @since(1.4)
     def save(self, path=None, format=None, mode="error", **options):
         """
         Saves the contents of the :class:`DataFrame` to a data source.
@@ -215,6 +226,7 @@ def save(self, path=None, format=None, mode="error", **options):
         else:
             jwrite.save(path)
 
+    @since(1.4)
     def saveAsTable(self, name, format=None, mode="error", **options):
         """
         Saves the contents of this :class:`DataFrame` to a data source as a table.
@@ -243,6 +255,7 @@ def saveAsTable(self, name, format=None, mode="error", **options):
             jwrite = jwrite.option(k, options[k])
         return jwrite.saveAsTable(name)
 
+    @since(1.4)
     def json(self, path, mode="error"):
         """
         Saves the content of the :class:`DataFrame` in JSON format at the
@@ -261,6 +274,7 @@ def json(self, path, mode="error"):
         """
         return self._jwrite.mode(mode).json(path)
 
+    @since(1.4)
     def parquet(self, path, mode="error"):
         """
         Saves the content of the :class:`DataFrame` in Parquet format at the
@@ -279,6 +293,7 @@ def parquet(self, path, mode="error"):
         """
         return self._jwrite.mode(mode).parquet(path)
 
+    @since(1.4)
     def jdbc(self, url, table, mode="error", properties={}):
         """
         Saves the content of the :class:`DataFrame` to a external database table

From 947ea1cf5f6986aa687631d6cf9f0fb974ee7caf Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 20 May 2015 23:38:58 -0700
Subject: [PATCH 106/525] [SPARK-7753] [MLLIB] Update KernelDensity API

Update `KernelDensity` API to make it extensible to different kernels in the future. `bandwidth` is used instead of `standardDeviation`. The static `kernelDensity` method is removed from `Statistics`. The implementation is updated using BLAS, while the algorithm remains the same. sryza srowen

Author: Xiangrui Meng <meng@databricks.com>

Closes #6279 from mengxr/SPARK-7753 and squashes the following commits:

4cdfadc [Xiangrui Meng] add example code in the doc
767fd5a [Xiangrui Meng] update KernelDensity API
---
 .../spark/mllib/stat/KernelDensity.scala      | 109 +++++++++++++-----
 .../apache/spark/mllib/stat/Statistics.scala  |  14 ---
 .../spark/mllib/stat/KernelDensitySuite.scala |   7 +-
 3 files changed, 82 insertions(+), 48 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
index 79747cc5d7d74..a6bfe26e1e4f5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
@@ -17,52 +17,101 @@
 
 package org.apache.spark.mllib.stat
 
+import com.github.fommil.netlib.BLAS.{getInstance => blas}
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.rdd.RDD
 
-private[stat] object KernelDensity {
+/**
+ * :: Experimental ::
+ * Kernel density estimation. Given a sample from a population, estimate its probability density
+ * function at each of the given evaluation points using kernels. Only Gaussian kernel is supported.
+ *
+ * Scala example:
+ *
+ * {{{
+ * val sample = sc.parallelize(Seq(0.0, 1.0, 4.0, 4.0))
+ * val kd = new KernelDensity()
+ *   .setSample(sample)
+ *   .setBandwidth(3.0)
+ * val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
+ * }}}
+ */
+@Experimental
+class KernelDensity extends Serializable {
+
+  import KernelDensity._
+
+  /** Bandwidth of the kernel function. */
+  private var bandwidth: Double = 1.0
+
+  /** A sample from a population. */
+  private var sample: RDD[Double] = _
+
   /**
-   * Given a set of samples from a distribution, estimates its density at the set of given points.
-   * Uses a Gaussian kernel with the given standard deviation.
+   * Sets the bandwidth (standard deviation) of the Gaussian kernel (default: `1.0`).
    */
-  def estimate(samples: RDD[Double], standardDeviation: Double,
-      evaluationPoints: Array[Double]): Array[Double] = {
-    if (standardDeviation <= 0.0) {
-      throw new IllegalArgumentException("Standard deviation must be positive")
-    }
+  def setBandwidth(bandwidth: Double): this.type = {
+    require(bandwidth > 0, s"Bandwidth must be positive, but got $bandwidth.")
+    this.bandwidth = bandwidth
+    this
+  }
 
-    // This gets used in each Gaussian PDF computation, so compute it up front
-    val logStandardDeviationPlusHalfLog2Pi =
-      math.log(standardDeviation) + 0.5 * math.log(2 * math.Pi)
+  /**
+   * Sets the sample to use for density estimation.
+   */
+  def setSample(sample: RDD[Double]): this.type = {
+    this.sample = sample
+    this
+  }
+
+  /**
+   * Sets the sample to use for density estimation (for Java users).
+   */
+  def setSample(sample: JavaRDD[java.lang.Double]): this.type = {
+    this.sample = sample.rdd.asInstanceOf[RDD[Double]]
+    this
+  }
+
+  /**
+   * Estimates probability density function at the given array of points.
+   */
+  def estimate(points: Array[Double]): Array[Double] = {
+    val sample = this.sample
+    val bandwidth = this.bandwidth
+
+    require(sample != null, "Must set sample before calling estimate.")
 
-    val (points, count) = samples.aggregate((new Array[Double](evaluationPoints.length), 0))(
+    val n = points.length
+    // This gets used in each Gaussian PDF computation, so compute it up front
+    val logStandardDeviationPlusHalfLog2Pi = math.log(bandwidth) + 0.5 * math.log(2 * math.Pi)
+    val (densities, count) = sample.aggregate((new Array[Double](n), 0L))(
       (x, y) => {
         var i = 0
-        while (i < evaluationPoints.length) {
-          x._1(i) += normPdf(y, standardDeviation, logStandardDeviationPlusHalfLog2Pi,
-            evaluationPoints(i))
+        while (i < n) {
+          x._1(i) += normPdf(y, bandwidth, logStandardDeviationPlusHalfLog2Pi, points(i))
           i += 1
         }
-        (x._1, i)
+        (x._1, n)
       },
       (x, y) => {
-        var i = 0
-        while (i < evaluationPoints.length) {
-          x._1(i) += y._1(i)
-          i += 1
-        }
+        blas.daxpy(n, 1.0, y._1, 1, x._1, 1)
         (x._1, x._2 + y._2)
       })
-
-    var i = 0
-    while (i < points.length) {
-      points(i) /= count
-      i += 1
-    }
-    points
+    blas.dscal(n, 1.0 / count, densities, 1)
+    densities
   }
+}
+
+private object KernelDensity {
 
-  private def normPdf(mean: Double, standardDeviation: Double,
-      logStandardDeviationPlusHalfLog2Pi: Double, x: Double): Double = {
+  /** Evaluates the PDF of a normal distribution. */
+  def normPdf(
+      mean: Double,
+      standardDeviation: Double,
+      logStandardDeviationPlusHalfLog2Pi: Double,
+      x: Double): Double = {
     val x0 = x - mean
     val x1 = x0 / standardDeviation
     val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 32561620ac914..b3fad0c52d655 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -149,18 +149,4 @@ object Statistics {
   def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
     ChiSqTest.chiSquaredFeatures(data)
   }
-
-  /**
-   * Given an empirical distribution defined by the input RDD of samples, estimate its density at
-   * each of the given evaluation points using a Gaussian kernel.
-   *
-   * @param samples The samples RDD used to define the empirical distribution.
-   * @param standardDeviation The standard deviation of the kernel Gaussians.
-   * @param evaluationPoints The points at which to estimate densities.
-   * @return An array the same size as evaluationPoints with the density at each point.
-   */
-  def kernelDensity(samples: RDD[Double], standardDeviation: Double,
-      evaluationPoints: Iterable[Double]): Array[Double] = {
-    KernelDensity.estimate(samples, standardDeviation, evaluationPoints.toArray)
-  }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
index 16ecae23dd9d4..14bb1cebf0b8f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
@@ -17,9 +17,8 @@
 
 package org.apache.spark.mllib.stat
 
-import org.scalatest.FunSuite
-
 import org.apache.commons.math3.distribution.NormalDistribution
+import org.scalatest.FunSuite
 
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
@@ -27,7 +26,7 @@ class KernelDensitySuite extends FunSuite with MLlibTestSparkContext {
   test("kernel density single sample") {
     val rdd = sc.parallelize(Array(5.0))
     val evaluationPoints = Array(5.0, 6.0)
-    val densities = KernelDensity.estimate(rdd, 3.0, evaluationPoints)
+    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
     val normal = new NormalDistribution(5.0, 3.0)
     val acceptableErr = 1e-6
     assert(densities(0) - normal.density(5.0) < acceptableErr)
@@ -37,7 +36,7 @@ class KernelDensitySuite extends FunSuite with MLlibTestSparkContext {
   test("kernel density multiple samples") {
     val rdd = sc.parallelize(Array(5.0, 10.0))
     val evaluationPoints = Array(5.0, 6.0)
-    val densities = KernelDensity.estimate(rdd, 3.0, evaluationPoints)
+    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
     val normal1 = new NormalDistribution(5.0, 3.0)
     val normal2 = new NormalDistribution(10.0, 3.0)
     val acceptableErr = 1e-6

From 1ee8eb431e04db16f95f0bcb3a546ad6e14b616f Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Thu, 21 May 2015 00:30:55 -0700
Subject: [PATCH 107/525] [SPARK-7745] Change asserts to requires for user
 input checks in Spark Streaming

Assertions can be turned off. `require` throws an `IllegalArgumentException` which makes more sense when it's a user set variable.

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #6271 from brkyvz/streaming-require and squashes the following commits:

d249484 [Burak Yavuz] fix merge conflict
264adb8 [Burak Yavuz] addressed comments v1.0
6161350 [Burak Yavuz] fix tests
16aa766 [Burak Yavuz] changed more assertions to more meaningful errors
afd923d [Burak Yavuz] changed some assertions to require
---
 .../apache/spark/streaming/DStreamGraph.scala |  4 +-
 .../spark/streaming/StreamingContext.scala    | 11 ++---
 .../streaming/api/python/PythonDStream.scala  |  4 +-
 .../spark/streaming/dstream/DStream.scala     | 45 +++++++++----------
 .../dstream/ReducedWindowedDStream.scala      |  4 +-
 .../scheduler/ReceivedBlockTracker.scala      |  2 +-
 .../streaming/StreamingContextSuite.scala     |  6 +--
 7 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala
index 85b354ff4aa0d..40789c66f3991 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala
@@ -157,10 +157,10 @@ final private[streaming] class DStreamGraph extends Serializable with Logging {
 
   def validate() {
     this.synchronized {
-      assert(batchDuration != null, "Batch duration has not been set")
+      require(batchDuration != null, "Batch duration has not been set")
       // assert(batchDuration >= Milliseconds(100), "Batch duration of " + batchDuration +
       // " is very low")
-      assert(getOutputStreams().size > 0, "No output streams registered, so nothing to execute")
+      require(getOutputStreams().size > 0, "No output operations registered, so nothing to execute")
     }
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 95063692e1146..160fc42c57d18 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -156,7 +156,7 @@ class StreamingContext private[streaming] (
       cp_.graph.restoreCheckpointData()
       cp_.graph
     } else {
-      assert(batchDur_ != null, "Batch duration for streaming context cannot be null")
+      require(batchDur_ != null, "Batch duration for StreamingContext cannot be null")
       val newGraph = new DStreamGraph()
       newGraph.setBatchDuration(batchDur_)
       newGraph
@@ -462,7 +462,8 @@ class StreamingContext private[streaming] (
       directory, FileInputDStream.defaultFilter : Path => Boolean, newFilesOnly=true, conf)
     val data = br.map { case (k, v) =>
       val bytes = v.getBytes
-      assert(bytes.length == recordLength, "Byte array does not have correct length")
+      require(bytes.length == recordLength, "Byte array does not have correct length. " +
+        s"${bytes.length} did not equal recordLength: $recordLength")
       bytes
     }
     data
@@ -568,7 +569,7 @@ class StreamingContext private[streaming] (
   /**
    * Start the execution of the streams.
    *
-   * @throws SparkException if the StreamingContext is already stopped.
+   * @throws IllegalStateException if the StreamingContext is already stopped.
    */
   def start(): Unit = synchronized {
     state match {
@@ -587,7 +588,7 @@ class StreamingContext private[streaming] (
       case ACTIVE =>
         logWarning("StreamingContext has already been started")
       case STOPPED =>
-        throw new SparkException("StreamingContext has already been stopped")
+        throw new IllegalStateException("StreamingContext has already been stopped")
     }
   }
 
@@ -689,7 +690,7 @@ object StreamingContext extends Logging {
   private def assertNoOtherContextIsActive(): Unit = {
     ACTIVATION_LOCK.synchronized {
       if (activeContext.get() != null) {
-        throw new SparkException(
+        throw new IllegalStateException(
           "Only one StreamingContext may be started in this JVM. " +
             "Currently running StreamingContext was started at" +
             activeContext.get.startSite.get.longForm)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala
index 4c28654ef6413..d06401245ff17 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala
@@ -109,7 +109,7 @@ private[python] object PythonTransformFunctionSerializer {
   }
 
   def serialize(func: PythonTransformFunction): Array[Byte] = {
-    assert(serializer != null, "Serializer has not been registered!")
+    require(serializer != null, "Serializer has not been registered!")
     // get the id of PythonTransformFunction in py4j
     val h = Proxy.getInvocationHandler(func.asInstanceOf[Proxy])
     val f = h.getClass().getDeclaredField("id")
@@ -119,7 +119,7 @@ private[python] object PythonTransformFunctionSerializer {
   }
 
   def deserialize(bytes: Array[Byte]): PythonTransformFunction = {
-    assert(serializer != null, "Serializer has not been registered!")
+    require(serializer != null, "Serializer has not been registered!")
     serializer.loads(bytes)
   }
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
index 7c50a766a9bad..c858647c6406d 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
@@ -217,53 +217,52 @@ abstract class DStream[T: ClassTag] (
       case StreamingContextState.INITIALIZED =>
         // good to go
       case StreamingContextState.ACTIVE =>
-        throw new SparkException(
+        throw new IllegalStateException(
           "Adding new inputs, transformations, and output operations after " +
             "starting a context is not supported")
       case StreamingContextState.STOPPED =>
-        throw new SparkException(
+        throw new IllegalStateException(
           "Adding new inputs, transformations, and output operations after " +
             "stopping a context is not supported")
     }
   }
 
   private[streaming] def validateAtStart() {
-    assert(rememberDuration != null, "Remember duration is set to null")
+    require(rememberDuration != null, "Remember duration is set to null")
 
-    assert(
+    require(
       !mustCheckpoint || checkpointDuration != null,
       "The checkpoint interval for " + this.getClass.getSimpleName + " has not been set." +
         " Please use DStream.checkpoint() to set the interval."
     )
 
-    assert(
+    require(
      checkpointDuration == null || context.sparkContext.checkpointDir.isDefined,
-      "The checkpoint directory has not been set. Please use StreamingContext.checkpoint()" +
-      " or SparkContext.checkpoint() to set the checkpoint directory."
+      "The checkpoint directory has not been set. Please set it by StreamingContext.checkpoint()."
     )
 
-    assert(
+    require(
       checkpointDuration == null || checkpointDuration >= slideDuration,
       "The checkpoint interval for " + this.getClass.getSimpleName + " has been set to " +
         checkpointDuration + " which is lower than its slide time (" + slideDuration + "). " +
         "Please set it to at least " + slideDuration + "."
     )
 
-    assert(
+    require(
       checkpointDuration == null || checkpointDuration.isMultipleOf(slideDuration),
       "The checkpoint interval for " + this.getClass.getSimpleName + " has been set to " +
         checkpointDuration + " which not a multiple of its slide time (" + slideDuration + "). " +
-        "Please set it to a multiple " + slideDuration + "."
+        "Please set it to a multiple of " + slideDuration + "."
     )
 
-    assert(
+    require(
       checkpointDuration == null || storageLevel != StorageLevel.NONE,
       "" + this.getClass.getSimpleName + " has been marked for checkpointing but the storage " +
         "level has not been set to enable persisting. Please use DStream.persist() to set the " +
         "storage level to use memory for better checkpointing performance."
     )
 
-    assert(
+    require(
       checkpointDuration == null || rememberDuration > checkpointDuration,
       "The remember duration for " + this.getClass.getSimpleName + " has been set to " +
         rememberDuration + " which is not more than the checkpoint interval (" +
@@ -272,7 +271,7 @@ abstract class DStream[T: ClassTag] (
 
     val metadataCleanerDelay = MetadataCleaner.getDelaySeconds(ssc.conf)
     logInfo("metadataCleanupDelay = " + metadataCleanerDelay)
-    assert(
+    require(
       metadataCleanerDelay < 0 || rememberDuration.milliseconds < metadataCleanerDelay * 1000,
       "It seems you are doing some DStream window operation or setting a checkpoint interval " +
         "which requires " + this.getClass.getSimpleName + " to remember generated RDDs for more " +
@@ -633,8 +632,8 @@ abstract class DStream[T: ClassTag] (
    * 'this' DStream will be registered as an output stream and therefore materialized.
    */
   def foreachRDD(foreachFunc: (RDD[T], Time) => Unit): Unit = ssc.withScope {
-    // because the DStream is reachable from the outer object here, and because 
-    // DStreams can't be serialized with closures, we can't proactively check 
+    // because the DStream is reachable from the outer object here, and because
+    // DStreams can't be serialized with closures, we can't proactively check
     // it for serializability and so we pass the optional false to SparkContext.clean
     new ForEachDStream(this, context.sparkContext.clean(foreachFunc, false)).register()
   }
@@ -644,8 +643,8 @@ abstract class DStream[T: ClassTag] (
    * on each RDD of 'this' DStream.
    */
   def transform[U: ClassTag](transformFunc: RDD[T] => RDD[U]): DStream[U] = ssc.withScope {
-    // because the DStream is reachable from the outer object here, and because 
-    // DStreams can't be serialized with closures, we can't proactively check 
+    // because the DStream is reachable from the outer object here, and because
+    // DStreams can't be serialized with closures, we can't proactively check
     // it for serializability and so we pass the optional false to SparkContext.clean
     val cleanedF = context.sparkContext.clean(transformFunc, false)
     transform((r: RDD[T], t: Time) => cleanedF(r))
@@ -656,8 +655,8 @@ abstract class DStream[T: ClassTag] (
    * on each RDD of 'this' DStream.
    */
   def transform[U: ClassTag](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U] = ssc.withScope {
-    // because the DStream is reachable from the outer object here, and because 
-    // DStreams can't be serialized with closures, we can't proactively check 
+    // because the DStream is reachable from the outer object here, and because
+    // DStreams can't be serialized with closures, we can't proactively check
     // it for serializability and so we pass the optional false to SparkContext.clean
     val cleanedF = context.sparkContext.clean(transformFunc, false)
     val realTransformFunc =  (rdds: Seq[RDD[_]], time: Time) => {
@@ -674,8 +673,8 @@ abstract class DStream[T: ClassTag] (
   def transformWith[U: ClassTag, V: ClassTag](
       other: DStream[U], transformFunc: (RDD[T], RDD[U]) => RDD[V]
     ): DStream[V] = ssc.withScope {
-    // because the DStream is reachable from the outer object here, and because 
-    // DStreams can't be serialized with closures, we can't proactively check 
+    // because the DStream is reachable from the outer object here, and because
+    // DStreams can't be serialized with closures, we can't proactively check
     // it for serializability and so we pass the optional false to SparkContext.clean
     val cleanedF = ssc.sparkContext.clean(transformFunc, false)
     transformWith(other, (rdd1: RDD[T], rdd2: RDD[U], time: Time) => cleanedF(rdd1, rdd2))
@@ -688,8 +687,8 @@ abstract class DStream[T: ClassTag] (
   def transformWith[U: ClassTag, V: ClassTag](
       other: DStream[U], transformFunc: (RDD[T], RDD[U], Time) => RDD[V]
     ): DStream[V] = ssc.withScope {
-    // because the DStream is reachable from the outer object here, and because 
-    // DStreams can't be serialized with closures, we can't proactively check 
+    // because the DStream is reachable from the outer object here, and because
+    // DStreams can't be serialized with closures, we can't proactively check
     // it for serializability and so we pass the optional false to SparkContext.clean
     val cleanedF = ssc.sparkContext.clean(transformFunc, false)
     val realTransformFunc = (rdds: Seq[RDD[_]], time: Time) => {
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReducedWindowedDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReducedWindowedDStream.scala
index 1385ccbf56ee5..df9f7f140eddc 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReducedWindowedDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReducedWindowedDStream.scala
@@ -40,12 +40,12 @@ class ReducedWindowedDStream[K: ClassTag, V: ClassTag](
     partitioner: Partitioner
   ) extends DStream[(K,V)](parent.ssc) {
 
-  assert(_windowDuration.isMultipleOf(parent.slideDuration),
+  require(_windowDuration.isMultipleOf(parent.slideDuration),
     "The window duration of ReducedWindowedDStream (" + _windowDuration + ") " +
       "must be multiple of the slide duration of parent DStream (" + parent.slideDuration + ")"
   )
 
-  assert(_slideDuration.isMultipleOf(parent.slideDuration),
+  require(_slideDuration.isMultipleOf(parent.slideDuration),
     "The slide duration of ReducedWindowedDStream (" + _slideDuration + ") " +
       "must be multiple of the slide duration of parent DStream (" + parent.slideDuration + ")"
   )
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala
index a9f4147a5f020..7720259a5d794 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala
@@ -153,7 +153,7 @@ private[streaming] class ReceivedBlockTracker(
    * returns only after the files are cleaned up.
    */
   def cleanupOldBatches(cleanupThreshTime: Time, waitForCompletion: Boolean): Unit = synchronized {
-    assert(cleanupThreshTime.milliseconds < clock.getTimeMillis())
+    require(cleanupThreshTime.milliseconds < clock.getTimeMillis())
     val timesToCleanup = timeToAllocatedBlocks.keys.filter { _ < cleanupThreshTime }.toSeq
     logInfo("Deleting batches " + timesToCleanup)
     writeToLog(BatchCleanupEvent(timesToCleanup))
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index 3a958bf3a3c19..f8e8030791df1 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -182,7 +182,7 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w
     ssc = new StreamingContext(master, appName, batchDuration)
     addInputStream(ssc).register()
     ssc.stop()
-    intercept[SparkException] {
+    intercept[IllegalStateException] {
       ssc.start() // start after stop should throw exception
     }
     assert(ssc.getState() === StreamingContextState.STOPPED)
@@ -600,7 +600,7 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w
     val anotherInput = addInputStream(anotherSsc)
     anotherInput.foreachRDD { rdd => rdd.count }
 
-    val exception = intercept[SparkException] {
+    val exception = intercept[IllegalStateException] {
       anotherSsc.start()
     }
     assert(exception.getMessage.contains("StreamingContext"), "Did not get the right exception")
@@ -623,7 +623,7 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w
 
     def testForException(clue: String, expectedErrorMsg: String)(body: => Unit): Unit = {
       withClue(clue) {
-        val ex = intercept[SparkException] {
+        val ex = intercept[IllegalStateException] {
           body
         }
         assert(ex.getMessage.toLowerCase().contains(expectedErrorMsg))

From feb3a9d3f81f19850fddbd9639823f59a60efa52 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Thu, 21 May 2015 09:28:00 -0700
Subject: [PATCH 108/525] [SPARK-7320] [SQL] [Minor] Move the testData into
 beforeAll()

Follow up of #6340, to avoid the test report missing once it fails.

Author: Cheng Hao <hao.cheng@intel.com>

Closes #6312 from chenghao-intel/rollup_minor and squashes the following commits:

b03a25f [Cheng Hao] simplify the testData instantiation
09b7e8b [Cheng Hao] move the testData into beforeAll()
---
 .../spark/sql/hive/HiveDataFrameAnalyticsSuite.scala   | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
index 99de14660f676..fb10f8583da99 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
@@ -17,25 +17,21 @@
 
 package org.apache.spark.sql.hive
 
-import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.{DataFrame, QueryTest}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.scalatest.BeforeAndAfterAll
 
-case class TestData2Int(a: Int, b: Int)
-
 // TODO ideally we should put the test suite into the package `sql`, as
 // `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
 // support the `cube` or `rollup` yet.
 class HiveDataFrameAnalyticsSuite extends QueryTest with BeforeAndAfterAll {
-  val testData =
-    TestHive.sparkContext.parallelize(
-      TestData2Int(1, 2) ::
-        TestData2Int(2, 4) :: Nil).toDF()
+  private var testData: DataFrame = _
 
   override def beforeAll() {
+    testData = Seq((1, 2), (2, 4)).toDF("a", "b")
     TestHive.registerDataFrameAsTable(testData, "mytable")
   }
 

From a25c1ab8f04a4e19d82ff4c18a0b1689d8b3ddac Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 21 May 2015 09:58:47 -0700
Subject: [PATCH 109/525] [SPARK-7565] [SQL] fix MapType in JsonRDD

The key of Map in JsonRDD should be converted into UTF8String (also failed records), Thanks to yhuai viirya

Closes #6084

Author: Davies Liu <davies@databricks.com>

Closes #6299 from davies/string_in_json and squashes the following commits:

0dbf559 [Davies Liu] improve test, fix corrupt record
6836a80 [Davies Liu] move unit tests into Scala
b97af11 [Davies Liu] fix MapType in JsonRDD
---
 .../apache/spark/sql/json/JacksonParser.scala |  8 +++---
 .../org/apache/spark/sql/json/JsonRDD.scala   | 16 +++++++----
 .../org/apache/spark/sql/json/JsonSuite.scala | 28 ++++++++++++++++++-
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
index 81611513582a8..0e223758051a6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
@@ -150,10 +150,10 @@ private[sql] object JacksonParser {
   private def convertMap(
       factory: JsonFactory,
       parser: JsonParser,
-      valueType: DataType): Map[String, Any] = {
-    val builder = Map.newBuilder[String, Any]
+      valueType: DataType): Map[UTF8String, Any] = {
+    val builder = Map.newBuilder[UTF8String, Any]
     while (nextUntil(parser, JsonToken.END_OBJECT)) {
-      builder += parser.getCurrentName -> convertField(factory, parser, valueType)
+      builder += UTF8String(parser.getCurrentName) -> convertField(factory, parser, valueType)
     }
 
     builder.result()
@@ -181,7 +181,7 @@ private[sql] object JacksonParser {
       val row = new GenericMutableRow(schema.length)
       for (corruptIndex <- schema.getFieldIndex(columnNameOfCorruptRecords)) {
         require(schema(corruptIndex).dataType == StringType)
-        row.update(corruptIndex, record)
+        row.update(corruptIndex, UTF8String(record))
       }
 
       Seq(row)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index 4c32710a17bc7..037a6d60a2ed6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -20,18 +20,18 @@ package org.apache.spark.sql.json
 import java.sql.Timestamp
 
 import scala.collection.Map
-import scala.collection.convert.Wrappers.{JMapWrapper, JListWrapper}
+import scala.collection.convert.Wrappers.{JListWrapper, JMapWrapper}
 
-import com.fasterxml.jackson.core.{JsonGenerator, JsonProcessingException}
+import com.fasterxml.jackson.core.JsonProcessingException
 import com.fasterxml.jackson.databind.ObjectMapper
 
+import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.Logging
 
 private[sql] object JsonRDD extends Logging {
 
@@ -318,7 +318,8 @@ private[sql] object JsonRDD extends Logging {
 
           parsed
         } catch {
-          case e: JsonProcessingException => Map(columnNameOfCorruptRecords -> record) :: Nil
+          case e: JsonProcessingException =>
+            Map(columnNameOfCorruptRecords -> UTF8String(record)) :: Nil
         }
       }
     })
@@ -422,7 +423,10 @@ private[sql] object JsonRDD extends Logging {
           value.asInstanceOf[Seq[Any]].map(enforceCorrectType(_, elementType))
         case MapType(StringType, valueType, _) =>
           val map = value.asInstanceOf[Map[String, Any]]
-          map.mapValues(enforceCorrectType(_, valueType)).map(identity)
+          map.map {
+            case (k, v) =>
+              (UTF8String(k), enforceCorrectType(v, valueType))
+          }.map(identity)
         case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct)
         case DateType => toDate(value)
         case TimestampType => toTimestamp(value)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 6f747e5846f74..7e6eeba17752a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -25,7 +25,6 @@ import org.scalactic.Tolerance._
 
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.catalyst.util.DateUtils
-import org.apache.spark.sql.functions._
 import org.apache.spark.sql.json.InferSchema.compatibleType
 import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.test.TestSQLContext
@@ -1074,4 +1073,31 @@ class JsonSuite extends QueryTest {
     assert(StructType(Seq()) === emptySchema)
   }
 
+  test("SPARK-7565 MapType in JsonRDD") {
+    val useStreaming = getConf(SQLConf.USE_JACKSON_STREAMING_API, "true")
+    val oldColumnNameOfCorruptRecord = TestSQLContext.conf.columnNameOfCorruptRecord
+    TestSQLContext.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed")
+
+    val schemaWithSimpleMap = StructType(
+      StructField("map", MapType(StringType, IntegerType, true), false) :: Nil)
+    try{
+      for (useStreaming <- List("true", "false")) {
+        setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
+        val temp = Utils.createTempDir().getPath
+
+        val df = read.schema(schemaWithSimpleMap).json(mapType1)
+        df.write.mode("overwrite").parquet(temp)
+        // order of MapType is not defined
+        assert(read.parquet(temp).count() == 5)
+
+        val df2 = read.json(corruptRecords)
+        df2.write.mode("overwrite").parquet(temp)
+        checkAnswer(read.parquet(temp), df2.collect())
+      }
+    } finally {
+      setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
+      setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, oldColumnNameOfCorruptRecord)
+    }
+  }
+
 }

From 13348e21b6b1c0df42c18b82b86c613291228863 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 21 May 2015 10:30:08 -0700
Subject: [PATCH 110/525] [SPARK-7752] [MLLIB] Use lowercase letters for
 NaiveBayes.modelType

to be consistent with other string names in MLlib. This PR also updates the implementation to use vals instead of hardcoded strings. jkbradley leahmcguire

Author: Xiangrui Meng <meng@databricks.com>

Closes #6277 from mengxr/SPARK-7752 and squashes the following commits:

f38b662 [Xiangrui Meng] add another case _ back in test
ae5c66a [Xiangrui Meng] model type -> modelType
711d1c6 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7752
40ae53e [Xiangrui Meng] fix Java test suite
264a814 [Xiangrui Meng] add case _ back
3c456a8 [Xiangrui Meng] update NB user guide
17bba53 [Xiangrui Meng] update naive Bayes to use lowercase model type strings
---
 docs/mllib-naive-bayes.md                     |  9 ++-
 .../mllib/classification/NaiveBayes.scala     | 75 +++++++++++--------
 .../classification/JavaNaiveBayesSuite.java   |  4 +-
 .../classification/NaiveBayesSuite.scala      | 46 ++++++------
 4 files changed, 75 insertions(+), 59 deletions(-)

diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index 9780ea52c4994..56a2e9ca86bb1 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -21,7 +21,7 @@ Within that context, each observation is a document and each
 feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or
 a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes).
 Feature values must be nonnegative. The model type is selected with an optional parameter
-"Multinomial" or "Bernoulli" with "Multinomial" as the default.
+"multinomial" or "bernoulli" with "multinomial" as the default.
 [Additive smoothing](http://en.wikipedia.org/wiki/Lidstone_smoothing) can be used by
 setting the parameter $\lambda$ (default to $1.0$). For document classification, the input feature
 vectors are usually sparse, and sparse vectors should be supplied as input to take advantage of
@@ -35,7 +35,7 @@ sparsity. Since the training data is only used once, it is not necessary to cach
 [NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
 multinomial naive Bayes. It takes an RDD of
 [LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an optional
-smoothing parameter `lambda` as input, an optional model type parameter (default is Multinomial), and outputs a
+smoothing parameter `lambda` as input, an optional model type parameter (default is "multinomial"), and outputs a
 [NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
 can be used for evaluation and prediction.
 
@@ -54,7 +54,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
 val training = splits(0)
 val test = splits(1)
 
-val model = NaiveBayes.train(training, lambda = 1.0, model = "Multinomial")
+val model = NaiveBayes.train(training, lambda = 1.0, model = "multinomial")
 
 val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
 val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
@@ -75,6 +75,8 @@ optionally smoothing parameter `lambda` as input, and output a
 can be used for evaluation and prediction.
 
 {% highlight java %}
+import scala.Tuple2;
+
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.Function;
@@ -82,7 +84,6 @@ import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.mllib.classification.NaiveBayes;
 import org.apache.spark.mllib.classification.NaiveBayesModel;
 import org.apache.spark.mllib.regression.LabeledPoint;
-import scala.Tuple2;
 
 JavaRDD<LabeledPoint> training = ... // training set
 JavaRDD<LabeledPoint> test = ... // test set
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index cffe9ef1e0b2a..f51ee36d0dfcb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -25,13 +25,12 @@ import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.{Logging, SparkContext, SparkException}
-import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector, Vectors}
+import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, SQLContext}
 
-
 /**
  * Model for Naive Bayes Classifiers.
  *
@@ -39,7 +38,7 @@ import org.apache.spark.sql.{DataFrame, SQLContext}
  * @param pi log of class priors, whose dimension is C, number of labels
  * @param theta log of class conditional probabilities, whose dimension is C-by-D,
  *              where D is number of features
- * @param modelType The type of NB model to fit  can be "Multinomial" or "Bernoulli"
+ * @param modelType The type of NB model to fit  can be "multinomial" or "bernoulli"
  */
 class NaiveBayesModel private[mllib] (
     val labels: Array[Double],
@@ -48,11 +47,13 @@ class NaiveBayesModel private[mllib] (
     val modelType: String)
   extends ClassificationModel with Serializable with Saveable {
 
+  import NaiveBayes.{Bernoulli, Multinomial, supportedModelTypes}
+
   private val piVector = new DenseVector(pi)
-  private val thetaMatrix = new DenseMatrix(labels.size, theta(0).size, theta.flatten, true)
+  private val thetaMatrix = new DenseMatrix(labels.length, theta(0).length, theta.flatten, true)
 
   private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) =
-    this(labels, pi, theta, "Multinomial")
+    this(labels, pi, theta, NaiveBayes.Multinomial)
 
   /** A Java-friendly constructor that takes three Iterable parameters. */
   private[mllib] def this(
@@ -61,12 +62,15 @@ class NaiveBayesModel private[mllib] (
       theta: JIterable[JIterable[Double]]) =
     this(labels.asScala.toArray, pi.asScala.toArray, theta.asScala.toArray.map(_.asScala.toArray))
 
+  require(supportedModelTypes.contains(modelType),
+    s"Invalid modelType $modelType. Supported modelTypes are $supportedModelTypes.")
+
   // Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0.
   // This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra
   // application of this condition (in predict function).
   private val (thetaMinusNegTheta, negThetaSum) = modelType match {
-    case "Multinomial" => (None, None)
-    case "Bernoulli" =>
+    case Multinomial => (None, None)
+    case Bernoulli =>
       val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
       val ones = new DenseVector(Array.fill(thetaMatrix.numCols){1.0})
       val thetaMinusNegTheta = thetaMatrix.map { value =>
@@ -75,7 +79,7 @@ class NaiveBayesModel private[mllib] (
       (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
     case _ =>
       // This should never happen.
-      throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
+      throw new UnknownError(s"Invalid modelType: $modelType.")
   }
 
   override def predict(testData: RDD[Vector]): RDD[Double] = {
@@ -88,15 +92,15 @@ class NaiveBayesModel private[mllib] (
 
   override def predict(testData: Vector): Double = {
     modelType match {
-      case "Multinomial" =>
+      case Multinomial =>
         val prob = thetaMatrix.multiply(testData)
         BLAS.axpy(1.0, piVector, prob)
         labels(prob.argmax)
-      case "Bernoulli" =>
+      case Bernoulli =>
         testData.foreachActive { (index, value) =>
           if (value != 0.0 && value != 1.0) {
             throw new SparkException(
-              s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $testData.")
+              s"Bernoulli naive Bayes requires 0 or 1 feature values but found $testData.")
           }
         }
         val prob = thetaMinusNegTheta.get.multiply(testData)
@@ -105,7 +109,7 @@ class NaiveBayesModel private[mllib] (
         labels(prob.argmax)
       case _ =>
         // This should never happen.
-        throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
+        throw new UnknownError(s"Invalid modelType: $modelType.")
     }
   }
 
@@ -230,16 +234,16 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
         s"($loadedClassName, $version).  Supported:\n" +
         s"  ($classNameV1_0, 1.0)")
     }
-    assert(model.pi.size == numClasses,
+    assert(model.pi.length == numClasses,
       s"NaiveBayesModel.load expected $numClasses classes," +
-        s" but class priors vector pi had ${model.pi.size} elements")
-    assert(model.theta.size == numClasses,
+        s" but class priors vector pi had ${model.pi.length} elements")
+    assert(model.theta.length == numClasses,
       s"NaiveBayesModel.load expected $numClasses classes," +
-        s" but class conditionals array theta had ${model.theta.size} elements")
-    assert(model.theta.forall(_.size == numFeatures),
+        s" but class conditionals array theta had ${model.theta.length} elements")
+    assert(model.theta.forall(_.length == numFeatures),
       s"NaiveBayesModel.load expected $numFeatures features," +
         s" but class conditionals array theta had elements of size:" +
-        s" ${model.theta.map(_.size).mkString(",")}")
+        s" ${model.theta.map(_.length).mkString(",")}")
     model
   }
 }
@@ -257,9 +261,11 @@ class NaiveBayes private (
     private var lambda: Double,
     private var modelType: String) extends Serializable with Logging {
 
-  def this(lambda: Double) = this(lambda, "Multinomial")
+  import NaiveBayes.{Bernoulli, Multinomial}
 
-  def this() = this(1.0, "Multinomial")
+  def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial)
+
+  def this() = this(1.0, NaiveBayes.Multinomial)
 
   /** Set the smoothing parameter. Default: 1.0. */
   def setLambda(lambda: Double): NaiveBayes = {
@@ -272,12 +278,11 @@ class NaiveBayes private (
 
   /**
    * Set the model type using a string (case-sensitive).
-   * Supported options: "Multinomial" and "Bernoulli".
-   * (default: Multinomial)
+   * Supported options: "multinomial" (default) and "bernoulli".
    */
-  def setModelType(modelType:String): NaiveBayes = {
+  def setModelType(modelType: String): NaiveBayes = {
     require(NaiveBayes.supportedModelTypes.contains(modelType),
-      s"NaiveBayes was created with an unknown ModelType: $modelType")
+      s"NaiveBayes was created with an unknown modelType: $modelType.")
     this.modelType = modelType
     this
   }
@@ -308,7 +313,7 @@ class NaiveBayes private (
       }
       if (!values.forall(v => v == 0.0 || v == 1.0)) {
         throw new SparkException(
-          s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $v.")
+          s"Bernoulli naive Bayes requires 0 or 1 feature values but found $v.")
       }
     }
 
@@ -317,7 +322,7 @@ class NaiveBayes private (
     // TODO: similar to reduceByKeyLocally to save one stage.
     val aggregated = data.map(p => (p.label, p.features)).combineByKey[(Long, DenseVector)](
       createCombiner = (v: Vector) => {
-        if (modelType == "Bernoulli") {
+        if (modelType == Bernoulli) {
           requireZeroOneBernoulliValues(v)
         } else {
           requireNonnegativeValues(v)
@@ -352,11 +357,11 @@ class NaiveBayes private (
       labels(i) = label
       pi(i) = math.log(n + lambda) - piLogDenom
       val thetaLogDenom = modelType match {
-        case "Multinomial" => math.log(sumTermFreqs.values.sum + numFeatures * lambda)
-        case "Bernoulli" => math.log(n + 2.0 * lambda)
+        case Multinomial => math.log(sumTermFreqs.values.sum + numFeatures * lambda)
+        case Bernoulli => math.log(n + 2.0 * lambda)
         case _ =>
           // This should never happen.
-          throw new UnknownError(s"NaiveBayes was created with an unknown ModelType: $modelType")
+          throw new UnknownError(s"Invalid modelType: $modelType.")
       }
       var j = 0
       while (j < numFeatures) {
@@ -375,8 +380,14 @@ class NaiveBayes private (
  */
 object NaiveBayes {
 
+  /** String name for multinomial model type. */
+  private[classification] val Multinomial: String = "multinomial"
+
+  /** String name for Bernoulli model type. */
+  private[classification] val Bernoulli: String = "bernoulli"
+
   /* Set of modelTypes that NaiveBayes supports */
-  private[mllib] val supportedModelTypes = Set("Multinomial", "Bernoulli")
+  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)
 
   /**
    * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
@@ -406,7 +417,7 @@ object NaiveBayes {
    * @param lambda The smoothing parameter
    */
   def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = {
-    new NaiveBayes(lambda, "Multinomial").run(input)
+    new NaiveBayes(lambda, Multinomial).run(input)
   }
 
   /**
@@ -429,7 +440,7 @@ object NaiveBayes {
    */
   def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = {
     require(supportedModelTypes.contains(modelType),
-      s"NaiveBayes was created with an unknown ModelType: $modelType")
+      s"NaiveBayes was created with an unknown modelType: $modelType.")
     new NaiveBayes(lambda, modelType).run(input)
   }
 
diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
index 71fb7f13c39c2..3771c0ea7ad83 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
@@ -108,7 +108,7 @@ public Vector call(LabeledPoint v) throws Exception {
   @Test
   public void testModelTypeSetters() {
     NaiveBayes nb = new NaiveBayes()
-        .setModelType("Bernoulli")
-        .setModelType("Multinomial");
+      .setModelType("bernoulli")
+      .setModelType("multinomial");
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index 40a79a1f19bd9..c111a78a55806 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -19,9 +19,8 @@ package org.apache.spark.mllib.classification
 
 import scala.util.Random
 
-import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis}
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum}
 import breeze.stats.distributions.{Multinomial => BrzMultinomial}
-
 import org.scalatest.FunSuite
 
 import org.apache.spark.SparkException
@@ -30,9 +29,10 @@ import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
 import org.apache.spark.util.Utils
 
-
 object NaiveBayesSuite {
 
+  import NaiveBayes.{Multinomial, Bernoulli}
+
   private def calcLabel(p: Double, pi: Array[Double]): Int = {
     var sum = 0.0
     for (j <- 0 until pi.length) {
@@ -48,7 +48,7 @@ object NaiveBayesSuite {
     theta: Array[Array[Double]],  // CXD
     nPoints: Int,
     seed: Int,
-    modelType: String = "Multinomial",
+    modelType: String = Multinomial,
     sample: Int = 10): Seq[LabeledPoint] = {
     val D = theta(0).length
     val rnd = new Random(seed)
@@ -58,10 +58,10 @@ object NaiveBayesSuite {
     for (i <- 0 until nPoints) yield {
       val y = calcLabel(rnd.nextDouble(), _pi)
       val xi = modelType match {
-        case "Bernoulli" => Array.tabulate[Double] (D) { j =>
+        case Bernoulli => Array.tabulate[Double] (D) { j =>
             if (rnd.nextDouble () < _theta(y)(j) ) 1 else 0
         }
-        case "Multinomial" =>
+        case Multinomial =>
           val mult = BrzMultinomial(BDV(_theta(y)))
           val emptyMap = (0 until D).map(x => (x, 0.0)).toMap
           val counts = emptyMap ++ mult.sample(sample).groupBy(x => x).map {
@@ -70,7 +70,7 @@ object NaiveBayesSuite {
           counts.toArray.sortBy(_._1).map(_._2)
         case _ =>
           // This should never happen.
-          throw new UnknownError(s"NaiveBayesSuite found unknown ModelType: $modelType")
+          throw new UnknownError(s"Invalid modelType: $modelType.")
       }
 
       LabeledPoint(y, Vectors.dense(xi))
@@ -79,17 +79,17 @@ object NaiveBayesSuite {
 
   /** Bernoulli NaiveBayes with binary labels, 3 features */
   private val binaryBernoulliModel = new NaiveBayesModel(labels = Array(0.0, 1.0),
-    pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)),
-    "Bernoulli")
+    pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), Bernoulli)
 
   /** Multinomial NaiveBayes with binary labels, 3 features */
   private val binaryMultinomialModel = new NaiveBayesModel(labels = Array(0.0, 1.0),
-    pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)),
-    "Multinomial")
+    pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), Multinomial)
 }
 
 class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
 
+  import NaiveBayes.{Multinomial, Bernoulli}
+
   def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) {
     val numOfPredictions = predictions.zip(input).count {
       case (prediction, expected) =>
@@ -117,6 +117,11 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("model types") {
+    assert(Multinomial === "multinomial")
+    assert(Bernoulli === "bernoulli")
+  }
+
   test("get, set params") {
     val nb = new NaiveBayes()
     nb.setLambda(2.0)
@@ -134,16 +139,15 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
       Array(0.10, 0.10, 0.70, 0.10)  // label 2
     ).map(_.map(math.log))
 
-    val testData = NaiveBayesSuite.generateNaiveBayesInput(
-      pi, theta, nPoints, 42, "Multinomial")
+    val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 42, Multinomial)
     val testRDD = sc.parallelize(testData, 2)
     testRDD.cache()
 
-    val model = NaiveBayes.train(testRDD, 1.0, "Multinomial")
+    val model = NaiveBayes.train(testRDD, 1.0, Multinomial)
     validateModelFit(pi, theta, model)
 
     val validationData = NaiveBayesSuite.generateNaiveBayesInput(
-      pi, theta, nPoints, 17, "Multinomial")
+      pi, theta, nPoints, 17, Multinomial)
     val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
@@ -163,15 +167,15 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
     ).map(_.map(math.log))
 
     val testData = NaiveBayesSuite.generateNaiveBayesInput(
-      pi, theta, nPoints, 45, "Bernoulli")
+      pi, theta, nPoints, 45, Bernoulli)
     val testRDD = sc.parallelize(testData, 2)
     testRDD.cache()
 
-    val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli")
+    val model = NaiveBayes.train(testRDD, 1.0, Bernoulli)
     validateModelFit(pi, theta, model)
 
     val validationData = NaiveBayesSuite.generateNaiveBayesInput(
-      pi, theta, nPoints, 20, "Bernoulli")
+      pi, theta, nPoints, 20, Bernoulli)
     val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
@@ -216,7 +220,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
       LabeledPoint(1.0, Vectors.dense(0.0)))
 
     intercept[SparkException] {
-      NaiveBayes.train(sc.makeRDD(badTrain, 2), 1.0, "Bernoulli")
+      NaiveBayes.train(sc.makeRDD(badTrain, 2), 1.0, Bernoulli)
     }
 
     val okTrain = Seq(
@@ -235,7 +239,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
       Vectors.dense(1.0),
       Vectors.dense(0.0))
 
-    val model = NaiveBayes.train(sc.makeRDD(okTrain, 2), 1.0, "Bernoulli")
+    val model = NaiveBayes.train(sc.makeRDD(okTrain, 2), 1.0, Bernoulli)
     intercept[SparkException] {
       model.predict(sc.makeRDD(badPredict, 2)).collect()
     }
@@ -275,7 +279,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
       assert(model.labels === sameModel.labels)
       assert(model.pi === sameModel.pi)
       assert(model.theta === sameModel.theta)
-      assert(model.modelType === "Multinomial")
+      assert(model.modelType === Multinomial)
     } finally {
       Utils.deleteRecursively(tempDir)
     }

From 8730fbb47b09fcf955fe16dd03b75596db6d53b6 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Thu, 21 May 2015 10:56:17 -0700
Subject: [PATCH 111/525] [SPARK-7749] [SQL] Fixes partition discovery for
 non-partitioned tables

When no partition columns can be found, we should have an empty `PartitionSpec`, rather than a `PartitionSpec` with empty partition columns.

This PR together with #6285 should fix SPARK-7749.

Author: Cheng Lian <lian@databricks.com>
Author: Yin Huai <yhuai@databricks.com>

Closes #6287 from liancheng/spark-7749 and squashes the following commits:

a799ff3 [Cheng Lian] Adds test cases for SPARK-7749
c4949be [Cheng Lian] Minor refactoring, and tolerant _TEMPORARY directory name
5aa87ea [Yin Huai] Make parsePartitions more robust.
fc56656 [Cheng Lian] Returns empty PartitionSpec if no partition columns can be inferred
19ae41e [Cheng Lian] Don't list base directory as leaf directory
---
 .../spark/sql/sources/PartitioningUtils.scala | 84 +++++++++++++------
 .../apache/spark/sql/sources/interfaces.scala |  7 +-
 .../ParquetPartitionDiscoverySuite.scala      | 49 +++++++++--
 .../apache/spark/sql/hive/parquetSuites.scala | 51 ++++++++++-
 4 files changed, 150 insertions(+), 41 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
index 8f8138d6ebebc..e0ead23d786f9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
@@ -33,6 +33,10 @@ private[sql] case class Partition(values: Row, path: String)
 
 private[sql] case class PartitionSpec(partitionColumns: StructType, partitions: Seq[Partition])
 
+private[sql] object PartitionSpec {
+  val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[Partition])
+}
+
 private[sql] object PartitioningUtils {
   // This duplicates default value of Hive `ConfVars.DEFAULTPARTITIONNAME`, since sql/core doesn't
   // depend on Hive.
@@ -68,20 +72,37 @@ private[sql] object PartitioningUtils {
   private[sql] def parsePartitions(
       paths: Seq[Path],
       defaultPartitionName: String): PartitionSpec = {
-    val partitionValues = resolvePartitions(paths.flatMap(parsePartition(_, defaultPartitionName)))
-    val fields = {
-      val (PartitionValues(columnNames, literals)) = partitionValues.head
-      columnNames.zip(literals).map { case (name, Literal(_, dataType)) =>
-        StructField(name, dataType, nullable = true)
-      }
+    // First, we need to parse every partition's path and see if we can find partition values.
+    val pathsWithPartitionValues = paths.flatMap { path =>
+      parsePartition(path, defaultPartitionName).map(path -> _)
     }
 
-    val partitions = partitionValues.zip(paths).map {
-      case (PartitionValues(_, literals), path) =>
-        Partition(Row(literals.map(_.value): _*), path.toString)
-    }
+    if (pathsWithPartitionValues.isEmpty) {
+      // This dataset is not partitioned.
+      PartitionSpec.emptySpec
+    } else {
+      // This dataset is partitioned. We need to check whether all partitions have the same
+      // partition columns and resolve potential type conflicts.
+      val resolvedPartitionValues = resolvePartitions(pathsWithPartitionValues.map(_._2))
+
+      // Creates the StructType which represents the partition columns.
+      val fields = {
+        val PartitionValues(columnNames, literals) = resolvedPartitionValues.head
+        columnNames.zip(literals).map { case (name, Literal(_, dataType)) =>
+          // We always assume partition columns are nullable since we've no idea whether null values
+          // will be appended in the future.
+          StructField(name, dataType, nullable = true)
+        }
+      }
+
+      // Finally, we create `Partition`s based on paths and resolved partition values.
+      val partitions = resolvedPartitionValues.zip(pathsWithPartitionValues).map {
+        case (PartitionValues(_, literals), (path, _)) =>
+          Partition(Row.fromSeq(literals.map(_.value)), path.toString)
+      }
 
-    PartitionSpec(StructType(fields), partitions)
+      PartitionSpec(StructType(fields), partitions)
+    }
   }
 
   /**
@@ -111,7 +132,7 @@ private[sql] object PartitioningUtils {
     while (!finished) {
       // Sometimes (e.g., when speculative task is enabled), temporary directories may be left
       // uncleaned.  Here we simply ignore them.
-      if (chopped.getName == "_temporary") {
+      if (chopped.getName.toLowerCase == "_temporary") {
         return None
       }
 
@@ -121,8 +142,12 @@ private[sql] object PartitioningUtils {
       finished = maybeColumn.isEmpty || chopped.getParent == null
     }
 
-    val (columnNames, values) = columns.reverse.unzip
-    Some(PartitionValues(columnNames, values))
+    if (columns.isEmpty) {
+      None
+    } else {
+      val (columnNames, values) = columns.reverse.unzip
+      Some(PartitionValues(columnNames, values))
+    }
   }
 
   private def parsePartitionColumn(
@@ -156,20 +181,25 @@ private[sql] object PartitioningUtils {
   private[sql] def resolvePartitions(values: Seq[PartitionValues]): Seq[PartitionValues] = {
     // Column names of all partitions must match
     val distinctPartitionsColNames = values.map(_.columnNames).distinct
-    assert(distinctPartitionsColNames.size == 1, {
-      val list = distinctPartitionsColNames.mkString("\t", "\n", "")
-      s"Conflicting partition column names detected:\n$list"
-    })
-
-    // Resolves possible type conflicts for each column
-    val columnCount = values.head.columnNames.size
-    val resolvedValues = (0 until columnCount).map { i =>
-      resolveTypeConflicts(values.map(_.literals(i)))
-    }
 
-    // Fills resolved literals back to each partition
-    values.zipWithIndex.map { case (d, index) =>
-      d.copy(literals = resolvedValues.map(_(index)))
+    if (distinctPartitionsColNames.isEmpty) {
+      Seq.empty
+    } else {
+      assert(distinctPartitionsColNames.size == 1, {
+        val list = distinctPartitionsColNames.mkString("\t", "\n", "")
+        s"Conflicting partition column names detected:\n$list"
+      })
+
+      // Resolves possible type conflicts for each column
+      val columnCount = values.head.columnNames.size
+      val resolvedValues = (0 until columnCount).map { i =>
+        resolveTypeConflicts(values.map(_.literals(i)))
+      }
+
+      // Fills resolved literals back to each partition
+      values.zipWithIndex.map { case (d, index) =>
+        d.copy(literals = resolvedValues.map(_(index)))
+      }
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 6a917bf38b139..fcbac0d457950 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -462,12 +462,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
 
   private def discoverPartitions(): PartitionSpec = {
     val leafDirs = fileStatusCache.leafDirs.keys.toSeq
-
-    if (leafDirs.nonEmpty) {
-      PartitioningUtils.parsePartitions(leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME)
-    } else {
-      PartitionSpec(StructType(Array.empty[StructField]), Array.empty[Partition])
-    }
+    PartitioningUtils.parsePartitions(leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 1927114b8d58f..907dbb0119b40 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -22,7 +22,7 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.catalyst.expressions.Literal
 import org.apache.spark.sql.sources.PartitioningUtils._
-import org.apache.spark.sql.sources.{Partition, PartitionSpec}
+import org.apache.spark.sql.sources.{LogicalRelation, Partition, PartitionSpec}
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{QueryTest, Row, SQLContext}
@@ -66,12 +66,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       assert(message.contains(expected))
     }
 
-    check("file:///", Some {
-      PartitionValues(
-        ArrayBuffer.empty[String],
-        ArrayBuffer.empty[Literal])
-    })
-
     check("file://path/a=10", Some {
       PartitionValues(
         ArrayBuffer("a"),
@@ -93,6 +87,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         ArrayBuffer(Literal.create(1.5, FloatType)))
     })
 
+    check("file:///", None)
+    check("file:///path/_temporary", None)
+    check("file:///path/_temporary/c=1.5", None)
+    check("file:///path/_temporary/path", None)
     check("file://path/a=10/_temporary/c=1.5", None)
     check("file://path/a=10/c=1.5/_temporary", None)
 
@@ -124,6 +122,25 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           Partition(Row(10, "20"), "hdfs://host:9000/path/a=10/b=20"),
           Partition(Row(10.5, "hello"), "hdfs://host:9000/path/a=10.5/b=hello"))))
 
+    check(Seq(
+      "hdfs://host:9000/path/_temporary",
+      "hdfs://host:9000/path/a=10/b=20",
+      "hdfs://host:9000/path/a=10.5/b=hello",
+      "hdfs://host:9000/path/a=10.5/_temporary",
+      "hdfs://host:9000/path/a=10.5/_TeMpOrArY",
+      "hdfs://host:9000/path/a=10.5/b=hello/_temporary",
+      "hdfs://host:9000/path/a=10.5/b=hello/_TEMPORARY",
+      "hdfs://host:9000/path/_temporary/path",
+      "hdfs://host:9000/path/a=11/_temporary/path",
+      "hdfs://host:9000/path/a=10.5/b=world/_temporary/path"),
+      PartitionSpec(
+        StructType(Seq(
+          StructField("a", FloatType),
+          StructField("b", StringType))),
+        Seq(
+          Partition(Row(10, "20"), "hdfs://host:9000/path/a=10/b=20"),
+          Partition(Row(10.5, "hello"), "hdfs://host:9000/path/a=10.5/b=hello"))))
+
     check(Seq(
       s"hdfs://host:9000/path/a=10/b=20",
       s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello"),
@@ -145,6 +162,11 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         Seq(
           Partition(Row(10, null), s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"),
           Partition(Row(10.5, null), s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"))))
+
+    check(Seq(
+      s"hdfs://host:9000/path1",
+      s"hdfs://host:9000/path2"),
+      PartitionSpec.emptySpec)
   }
 
   test("read partitioned table - normal case") {
@@ -334,4 +356,17 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       }
     }
   }
+
+  test("SPARK-7749 Non-partitioned table should have empty partition spec") {
+    withTempPath { dir =>
+      (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath)
+      val queryExecution = read.parquet(dir.getCanonicalPath).queryExecution
+      queryExecution.analyzed.collectFirst {
+        case LogicalRelation(relation: ParquetRelation2) =>
+          assert(relation.partitionSpec === PartitionSpec.emptySpec)
+      }.getOrElse {
+        fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution")
+      }
+    }
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 05d99983b6a63..1da990bc959ba 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -22,6 +22,7 @@ import java.io.File
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
 import org.apache.spark.sql.hive.execution.HiveTableScan
 import org.apache.spark.sql.hive.test.TestHive._
@@ -29,7 +30,7 @@ import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.parquet.{ParquetRelation2, ParquetTableScan}
 import org.apache.spark.sql.sources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{QueryTest, SQLConf, SaveMode}
+import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf, SaveMode}
 import org.apache.spark.util.Utils
 
 // The data where the partitioning key exists only in the directory structure.
@@ -385,6 +386,54 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
     sql("DROP TABLE ms_convert")
   }
 
+  def collectParquetRelation(df: DataFrame): ParquetRelation2 = {
+    val plan = df.queryExecution.analyzed
+    plan.collectFirst {
+      case LogicalRelation(r: ParquetRelation2) => r
+    }.getOrElse {
+      fail(s"Expecting a ParquetRelation2, but got:\n$plan")
+    }
+  }
+
+  test("SPARK-7749: non-partitioned metastore Parquet table lookup should use cached relation") {
+    sql(
+      s"""CREATE TABLE nonPartitioned (
+         |  key INT,
+         |  value STRING
+         |)
+         |STORED AS PARQUET
+       """.stripMargin)
+
+    // First lookup fills the cache
+    val r1 = collectParquetRelation(table("nonPartitioned"))
+    // Second lookup should reuse the cache
+    val r2 = collectParquetRelation(table("nonPartitioned"))
+    // They should be the same instance
+    assert(r1 eq r2)
+
+    sql("DROP TABLE nonPartitioned")
+  }
+
+  test("SPARK-7749: partitioned metastore Parquet table lookup should use cached relation") {
+    sql(
+      s"""CREATE TABLE partitioned (
+         |  key INT,
+         |  value STRING
+         |)
+         |PARTITIONED BY (part INT)
+         |STORED AS PARQUET
+       """.stripMargin)
+
+    // First lookup fills the cache
+    val r1 = collectParquetRelation(table("partitioned"))
+    // Second lookup should reuse the cache
+    val r2 = collectParquetRelation(table("partitioned"))
+    // They should be the same instance
+    assert(r1 eq r2)
+
+    sql("DROP TABLE partitioned")
+  }
+
   test("Caching converted data source Parquet Relations") {
     def checkCached(tableIdentifer: catalog.QualifiedTableName): Unit = {
       // Converted test_parquet should be cached.

From 4b7ff3092c53827817079e0810563cbb0b9d0747 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 21 May 2015 11:39:32 -0700
Subject: [PATCH 112/525] [SPARK-7787] [STREAMING] Fix serialization issue of
 SerializableAWSCredentials

Lack of default constructor causes deserialization to fail. This occurs only when the AWS credentials are explicitly specified through KinesisUtils.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6316 from tdas/SPARK-7787 and squashes the following commits:

248ca5c [Tathagata Das] Fixed serializability
---
 .../streaming/kinesis/KinesisReceiver.scala   |  5 +++-
 .../kinesis/KinesisReceiverSuite.scala        | 30 ++++++++-----------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
index 90164490efb2e..800202e9fb86a 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
@@ -31,7 +31,10 @@ import org.apache.spark.util.Utils
 
 private[kinesis]
 case class SerializableAWSCredentials(accessKeyId: String, secretKey: String)
-  extends BasicAWSCredentials(accessKeyId, secretKey) with Serializable
+  extends AWSCredentials {
+  override def getAWSAccessKeyId: String = accessKeyId
+  override def getAWSSecretKey: String = secretKey
+}
 
 /**
  * Custom AWS Kinesis-specific implementation of Spark Streaming's Receiver.
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
index 7c17ee9dceddd..cd19c33b90050 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
@@ -20,27 +20,18 @@ import java.nio.ByteBuffer
 
 import scala.collection.JavaConversions.seqAsJavaList
 
-import org.apache.spark.storage.StorageLevel
-import org.apache.spark.streaming.Milliseconds
-import org.apache.spark.streaming.Seconds
-import org.apache.spark.streaming.StreamingContext
-import org.apache.spark.streaming.TestSuiteBase
-import org.apache.spark.util.{ManualClock, Clock}
-
-import org.mockito.Mockito._
-import org.scalatest.BeforeAndAfter
-import org.scalatest.Matchers
-import org.scalatest.mock.MockitoSugar
-
-import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException
-import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException
-import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException
-import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException
+import com.amazonaws.services.kinesis.clientlibrary.exceptions.{InvalidStateException, KinesisClientLibDependencyException, ShutdownException, ThrottlingException}
 import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer
 import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
 import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason
 import com.amazonaws.services.kinesis.model.Record
-import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
+import org.mockito.Mockito._
+import org.scalatest.{BeforeAndAfter, Matchers}
+import org.scalatest.mock.MockitoSugar
+
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext, TestSuiteBase}
+import org.apache.spark.util.{Clock, ManualClock, Utils}
 
 /**
  * Suite of Kinesis streaming receiver tests focusing mostly on the KinesisRecordProcessor
@@ -99,6 +90,11 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft
     ssc.stop()
   }
 
+  test("check serializability of SerializableAWSCredentials") {
+    Utils.deserialize[SerializableAWSCredentials](
+      Utils.serialize(new SerializableAWSCredentials("x", "y")))
+  }
+
   test("process records including store and checkpoint") {
     when(receiverMock.isStopped()).thenReturn(false)
     when(checkpointStateMock.shouldCheckpoint()).thenReturn(true)

From 6e534026963e567f92743c5721de16325645223e Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Thu, 21 May 2015 19:42:51 +0100
Subject: [PATCH 113/525] [SPARK-6416] [DOCS] RDD.fold() requires the operator
 to be commutative

Document current limitation of rdd.fold.

This does not resolve SPARK-6416 but just documents the issue.
CC JoshRosen

Author: Sean Owen <sowen@cloudera.com>

Closes #6231 from srowen/SPARK-6416 and squashes the following commits:

9fef39f [Sean Owen] Add comment to other languages; reword to highlight the difference from non-distributed collections and to not suggest it is a bug that is to be fixed
da40d84 [Sean Owen] Document current limitation of rdd.fold.
---
 .../org/apache/spark/api/java/JavaRDDLike.scala     | 13 ++++++++++---
 core/src/main/scala/org/apache/spark/rdd/RDD.scala  | 13 ++++++++++---
 python/pyspark/rdd.py                               | 12 ++++++++++--
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index 8bf0627fc420d..74db7643224f5 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -386,9 +386,16 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
 
   /**
    * Aggregate the elements of each partition, and then the results for all the partitions, using a
-   * given associative function and a neutral "zero value". The function op(t1, t2) is allowed to
-   * modify t1 and return it as its result value to avoid object allocation; however, it should not
-   * modify t2.
+   * given associative and commutative function and a neutral "zero value". The function
+   * op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object
+   * allocation; however, it should not modify t2.
+   *
+   * This behaves somewhat differently from fold operations implemented for non-distributed
+   * collections in functional languages like Scala. This fold operation may be applied to
+   * partitions individually, and then fold those results into the final result, rather than
+   * apply the fold to each element sequentially in some defined ordering. For functions
+   * that are not commutative, the result may differ from that of a fold applied to a
+   * non-distributed collection.
    */
   def fold(zeroValue: T)(f: JFunction2[T, T, T]): T =
     rdd.fold(zeroValue)(f)
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index f7fa37e4cdcdc..d772f03f76651 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1015,9 +1015,16 @@ abstract class RDD[T: ClassTag](
 
   /**
    * Aggregate the elements of each partition, and then the results for all the partitions, using a
-   * given associative function and a neutral "zero value". The function op(t1, t2) is allowed to
-   * modify t1 and return it as its result value to avoid object allocation; however, it should not
-   * modify t2.
+   * given associative and commutative function and a neutral "zero value". The function
+   * op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object
+   * allocation; however, it should not modify t2.
+   *
+   * This behaves somewhat differently from fold operations implemented for non-distributed
+   * collections in functional languages like Scala. This fold operation may be applied to
+   * partitions individually, and then fold those results into the final result, rather than
+   * apply the fold to each element sequentially in some defined ordering. For functions
+   * that are not commutative, the result may differ from that of a fold applied to a
+   * non-distributed collection.
    */
   def fold(zeroValue: T)(op: (T, T) => T): T = withScope {
     // Clone the zero value since we will also be serializing it as part of tasks
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 70db4bbe4cbc5..98a8ff8606366 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -813,13 +813,21 @@ def op(x, y):
     def fold(self, zeroValue, op):
         """
         Aggregate the elements of each partition, and then the results for all
-        the partitions, using a given associative function and a neutral "zero
-        value."
+        the partitions, using a given associative and commutative function and
+        a neutral "zero value."
 
         The function C{op(t1, t2)} is allowed to modify C{t1} and return it
         as its result value to avoid object allocation; however, it should not
         modify C{t2}.
 
+        This behaves somewhat differently from fold operations implemented
+        for non-distributed collections in functional languages like Scala.
+        This fold operation may be applied to partitions individually, and then
+        fold those results into the final result, rather than apply the fold
+        to each element sequentially in some defined ordering. For functions
+        that are not commutative, the result may differ from that of a fold
+        applied to a non-distributed collection.
+
         >>> from operator import add
         >>> sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)
         15

From 699906e538a3d03636adab546ca86d06d5d89293 Mon Sep 17 00:00:00 2001
From: kaka1992 <kaka_1992@163.com>
Date: Thu, 21 May 2015 11:50:39 -0700
Subject: [PATCH 114/525] [SPARK-7394][SQL] Add Pandas style cast (astype)

Author: kaka1992 <kaka_1992@163.com>

Closes #6313 from kaka1992/astype and squashes the following commits:

73dfd0b [kaka1992] [SPARK-7394] Add Pandas style cast (astype)
ad8feb2 [kaka1992] [SPARK-7394] Add Pandas style cast (astype)
4f328b7 [kaka1992] [SPARK-7394] Add Pandas style cast (astype)
---
 python/pyspark/sql/column.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index d03bb6d33dd03..baf1ecbd0a2fc 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -302,6 +302,8 @@ def cast(self, dataType):
             raise TypeError("unexpected type: %s" % type(dataType))
         return Column(jc)
 
+    astype = cast
+
     @ignore_unicode_prefix
     @since(1.3)
     def between(self, lowerBound, upperBound):

From 4f572008f804068c1a81cc334ff2367dbeae6493 Mon Sep 17 00:00:00 2001
From: Shuo Xiang <shuoxiangpub@gmail.com>
Date: Thu, 21 May 2015 12:09:44 -0700
Subject: [PATCH 115/525] [SPARK-7793] [MLLIB] Use getOrElse for getting the
 threshold of SVM model

same issue and fix as in Spark-7694.

Author: Shuo Xiang <shuoxiangpub@gmail.com>

Closes #6321 from coderxiang/nb and squashes the following commits:

a5e6de4 [Shuo Xiang] use getOrElse for svmmodel.tostring
2cb0177 [Shuo Xiang] Merge remote-tracking branch 'upstream/master' into nb
5f109b4 [Shuo Xiang] Merge remote-tracking branch 'upstream/master'
c5c5bfe [Shuo Xiang] Merge remote-tracking branch 'upstream/master'
98804c9 [Shuo Xiang] fix bug in topBykey and update test
---
 .../main/scala/org/apache/spark/mllib/classification/SVM.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
index 33104cf06c6ea..348485560713e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
@@ -89,7 +89,7 @@ class SVMModel (
   override protected def formatVersion: String = "1.0"
 
   override def toString: String = {
-    s"${super.toString}, numClasses = 2, threshold = ${threshold.get}"
+    s"${super.toString}, numClasses = 2, threshold = ${threshold.getOrElse("None")}"
   }
 }
 

From f6c486aa4b0d3a50b53c110fd63d226fffeb87f7 Mon Sep 17 00:00:00 2001
From: scwf <wangfei1@huawei.com>
Date: Thu, 21 May 2015 12:31:58 -0700
Subject: [PATCH 116/525] [SQL] [TEST] udf_java_method failed due to jdk
 version

java.lang.Math.exp(1.0) has different result between jdk versions. so do not use createQueryTest, write a separate test for it.
```
jdk version   	result
1.7.0_11		2.7182818284590455
1.7.0_05        2.7182818284590455
1.7.0_71		2.718281828459045
```

Author: scwf <wangfei1@huawei.com>

Closes #6274 from scwf/java_method and squashes the following commits:

3dd2516 [scwf] address comments
5fa1459 [scwf] style
df46445 [scwf] fix test error
fcb6d22 [scwf] fix udf_java_method
---
 .../execution/HiveCompatibilitySuite.scala    |  6 +++--
 .../sql/hive/execution/HiveQuerySuite.scala   |  7 ++----
 .../sql/hive/execution/SQLQuerySuite.scala    | 23 +++++++++++++++++++
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index b6245a57074c8..0b1917a392901 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -250,7 +250,10 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
 
     // The isolated classloader seemed to make some of our test reset mechanisms less robust.
     "combine1", // This test changes compression settings in a way that breaks all subsequent tests.
-    "load_dyn_part14.*" // These work alone but fail when run with other tests...
+    "load_dyn_part14.*", // These work alone but fail when run with other tests...
+
+    // the answer is sensitive for jdk version
+    "udf_java_method"
   ) ++ HiveShim.compatibilityBlackList
 
   /**
@@ -877,7 +880,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_int",
     "udf_isnotnull",
     "udf_isnull",
-    "udf_java_method",
     "udf_lcase",
     "udf_length",
     "udf_lessthan",
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index e7aec0b188c66..65c6ef03bf041 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -20,13 +20,10 @@ package org.apache.spark.sql.hive.execution
 import java.io.File
 import java.util.{Locale, TimeZone}
 
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
-import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorFactory, StructObjectInspector, ObjectInspector}
-import org.scalatest.BeforeAndAfter
-
 import scala.util.Try
 
+import org.scalatest.BeforeAndAfter
+
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 
 import org.apache.spark.{SparkFiles, SparkException}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index fbbf6ba5947dc..ba53ed99beb03 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -814,4 +814,27 @@ class SQLQuerySuite extends QueryTest {
       sql("SELECT cast(key+2 as Int) from df_analysis A group by cast(key+1 as int)")
     }
   }
+
+  // `Math.exp(1.0)` has different result for different jdk version, so not use createQueryTest
+  test("udf_java_method") {
+    checkAnswer(sql(
+      """
+        |SELECT java_method("java.lang.String", "valueOf", 1),
+        |       java_method("java.lang.String", "isEmpty"),
+        |       java_method("java.lang.Math", "max", 2, 3),
+        |       java_method("java.lang.Math", "min", 2, 3),
+        |       java_method("java.lang.Math", "round", 2.5),
+        |       java_method("java.lang.Math", "exp", 1.0),
+        |       java_method("java.lang.Math", "floor", 1.9)
+        |FROM src tablesample (1 rows)
+      """.stripMargin),
+      Row(
+        "1",
+        "true",
+        java.lang.Math.max(2, 3).toString,
+        java.lang.Math.min(2, 3).toString,
+        java.lang.Math.round(2.5).toString,
+        java.lang.Math.exp(1.0).toString,
+        java.lang.Math.floor(1.9).toString))
+  }
 }

From 15680aeed425c900a5de34d12b61929d1e5df607 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Thu, 21 May 2015 20:34:20 +0100
Subject: [PATCH 117/525] [SPARK-7775] YARN AM negative sleep exception

```
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Exception in thread "Reporter" java.lang.IllegalArgumentException: timeout value is negative
  at java.lang.Thread.sleep(Native Method)
  at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$1.run(ApplicationMaster.scala:356)
```
This kills the reporter thread. This is caused by #6082 (merged into master branch only).

Author: Andrew Or <andrew@databricks.com>

Closes #6305 from andrewor14/yarn-negative-sleep and squashes the following commits:

b970770 [Andrew Or] Use existing cap
56d6e5e [Andrew Or] Avoid negative sleep
---
 .../scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 63a6f2e9472c1..af4927b0e4bf7 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -345,7 +345,7 @@ private[spark] class ApplicationMaster(
               if (numPendingAllocate > 0) {
                 val currentAllocationInterval =
                   math.min(heartbeatInterval, nextAllocationInterval)
-                nextAllocationInterval *= 2
+                nextAllocationInterval = currentAllocationInterval * 2 // avoid overflow
                 currentAllocationInterval
               } else {
                 nextAllocationInterval = initialAllocationInterval

From 6d75ed7e5ccf6c58143de4608115f9a2b3ff6cf4 Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Thu, 21 May 2015 13:05:48 -0700
Subject: [PATCH 118/525] [SPARK-7585] [ML] [DOC] VectorIndexer user guide
 section

Added VectorIndexer section to ML user guide.  Also added javaCategoryMaps() method and Java unit test for it.

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #6255 from jkbradley/vector-indexer-guide and squashes the following commits:

dbb8c4c [Joseph K. Bradley] simplified VectorIndexerModel.javaCategoryMaps
f692084 [Joseph K. Bradley] Added VectorIndexer section to ML user guide.  Also added javaCategoryMaps() method and Java unit test for it.
---
 docs/ml-features.md                           | 83 +++++++++++++++++++
 .../spark/ml/feature/VectorIndexer.scala      | 10 +++
 .../ml/feature/JavaVectorIndexerSuite.java    |  4 +-
 3 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 235029d71fadd..06f1ac196b39d 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -535,5 +535,88 @@ encoded = encoder.transform(indexed)
 </div>
 </div>
 
+## VectorIndexer
+
+`VectorIndexer` helps index categorical features in datasets of `Vector`s.
+It can both automatically decide which features are categorical and convert original values to category indices.  Specifically, it does the following:
+
+1. Take an input column of type [Vector](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) and a parameter `maxCategories`.
+2. Decide which features should be categorical based on the number of distinct values, where features with at most `maxCategories` are declared categorical.
+3. Compute 0-based category indices for each categorical feature.
+4. Index categorical features and transform original feature values to indices.
+
+Indexing categorical features allows algorithms such as Decision Trees and Tree Ensembles to treat categorical features appropriately, improving performance.
+
+Please refer to the [VectorIndexer API docs](api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer) for more details.
+
+In the example below, we read in a dataset of labeled points and then use `VectorIndexer` to decide which features should be treated as categorical.  We transform the categorical feature values to their indices.  This transformed data could then be passed to algorithms such as `DecisionTreeRegressor` that handle categorical features.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.VectorIndexer
+import org.apache.spark.mllib.util.MLUtils
+
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val indexer = new VectorIndexer()
+  .setInputCol("features")
+  .setOutputCol("indexed")
+  .setMaxCategories(10)
+val indexerModel = indexer.fit(data)
+val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
+println(s"Chose ${categoricalFeatures.size} categorical features: " +
+  categoricalFeatures.mkString(", "))
+
+// Create new column "indexed" with categorical values transformed to indices
+val indexedData = indexerModel.transform(data)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import java.util.Map;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.VectorIndexer;
+import org.apache.spark.ml.feature.VectorIndexerModel;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.sql.DataFrame;
+
+JavaRDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(),
+  "data/mllib/sample_libsvm_data.txt").toJavaRDD();
+DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+VectorIndexer indexer = new VectorIndexer()
+  .setInputCol("features")
+  .setOutputCol("indexed")
+  .setMaxCategories(10);
+VectorIndexerModel indexerModel = indexer.fit(data);
+Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
+System.out.print("Chose " + categoryMaps.size() + "categorical features:");
+for (Integer feature : categoryMaps.keySet()) {
+  System.out.print(" " + feature);
+}
+System.out.println();
+
+// Create new column "indexed" with categorical values transformed to indices
+DataFrame indexedData = indexerModel.transform(data);
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.ml.feature import VectorIndexer
+from pyspark.mllib.util import MLUtils
+
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
+indexerModel = indexer.fit(data)
+
+# Create new column "indexed" with categorical values transformed to indices
+indexedData = indexerModel.transform(data)
+{% endhighlight %}
+</div>
+</div>
+
 # Feature Selectors
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index 6d1d0524e59ee..e238fb310ed37 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -17,6 +17,11 @@
 
 package org.apache.spark.ml.feature
 
+import java.lang.{Double => JDouble, Integer => JInt}
+import java.util.{Map => JMap}
+
+import scala.collection.JavaConverters._
+
 import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.attribute._
@@ -248,6 +253,11 @@ class VectorIndexerModel private[ml] (
     val categoryMaps: Map[Int, Map[Double, Int]])
   extends Model[VectorIndexerModel] with VectorIndexerParams {
 
+  /** Java-friendly version of [[categoryMaps]] */
+  def javaCategoryMaps: JMap[JInt, JMap[JDouble, JInt]] = {
+    categoryMaps.mapValues(_.asJava).asJava.asInstanceOf[JMap[JInt, JMap[JDouble, JInt]]]
+  }
+
   /**
    * Pre-computed feature attributes, with some missing info.
    * In transform(), set attribute name and other info, if available.
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java
index 161100134c92d..c7ae5468b9429 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java
@@ -19,6 +19,7 @@
 
 import java.io.Serializable;
 import java.util.List;
+import java.util.Map;
 
 import org.junit.After;
 import org.junit.Assert;
@@ -64,7 +65,8 @@ public void vectorIndexerAPI() {
       .setMaxCategories(2);
     VectorIndexerModel model = indexer.fit(data);
     Assert.assertEquals(model.numFeatures(), 2);
-    Assert.assertEquals(model.categoryMaps().size(), 1);
+    Map<Integer, Map<Double, Integer>> categoryMaps = model.javaCategoryMaps();
+    Assert.assertEquals(categoryMaps.size(), 1);
     DataFrame indexedData = model.transform(data);
   }
 }

From cdc7c055c931c4c931a11b510de473455f3256da Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 21 May 2015 13:06:53 -0700
Subject: [PATCH 119/525] [SPARK-7498] [MLLIB] add varargs back to setDefault

We removed `varargs` due to Java compilation issues. That was a false alarm because I didn't run `build/sbt clean`. So this PR reverts the changes. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6320 from mengxr/SPARK-7498 and squashes the following commits:

74a7259 [Xiangrui Meng] add varargs back to setDefault
---
 .../src/main/scala/org/apache/spark/ml/param/params.scala  | 7 +++----
 .../java/org/apache/spark/ml/param/JavaTestParams.java     | 1 +
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index c33b66d31cd4f..94abfcda5cf2a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -438,19 +438,18 @@ trait Params extends Identifiable with Serializable {
    * @param value  the default value
    */
   protected final def setDefault[T](param: Param[T], value: T): this.type = {
-    defaultParamMap.put(param, value)
+    defaultParamMap.put(param -> value)
     this
   }
 
   /**
    * Sets default values for a list of params.
    *
-   * Note: Java developers should use the single-parameter [[setDefault()]].
-   *       Annotating this with varargs causes compilation failures. See SPARK-7498.
    * @param paramPairs  a list of param pairs that specify params and their default values to set
    *                    respectively. Make sure that the params are initialized before this method
    *                    gets called.
    */
+  @varargs
   protected final def setDefault(paramPairs: ParamPair[_]*): this.type = {
     paramPairs.foreach { p =>
       setDefault(p.param.asInstanceOf[Param[Any]], p.value)
@@ -559,7 +558,7 @@ final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any])
   /**
    * Puts a (param, value) pair (overwrites if the input param exists).
    */
-  def put[T](param: Param[T], value: T): this.type = put(ParamPair(param, value))
+  def put[T](param: Param[T], value: T): this.type = put(param -> value)
 
   /**
    * Puts a list of param pairs (overwrites if the input params exists).
diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
index 3a41890b92d63..947ae3a2ce06f 100644
--- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
+++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
@@ -81,5 +81,6 @@ private void init() {
       ParamValidators.inArray(validStrings));
     setDefault(myIntParam_, 1);
     setDefault(myDoubleParam_, 0.5);
+    setDefault(myIntParam().w(1), myDoubleParam().w(0.5));
   }
 }

From 311fab6f1b00db1a581d77be5196dd045f93d83d Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 21 May 2015 13:50:08 -0700
Subject: [PATCH 120/525] [SPARK-7722] [STREAMING] Added Kinesis to style
 checker

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6325 from tdas/SPARK-7722 and squashes the following commits:

9ab35b2 [Tathagata Das] Fixed styles in Kinesis
---
 dev/scalastyle                                            | 8 ++++----
 .../spark/streaming/kinesis/KinesisReceiverSuite.scala    | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dev/scalastyle b/dev/scalastyle
index 7f014c82f14c6..ad93f7e85b27c 100755
--- a/dev/scalastyle
+++ b/dev/scalastyle
@@ -17,11 +17,11 @@
 # limitations under the License.
 #
 
-echo -e "q\n" | build/sbt -Phive -Phive-thriftserver scalastyle > scalastyle.txt
-echo -e "q\n" | build/sbt -Phive -Phive-thriftserver test:scalastyle >> scalastyle.txt
+echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver scalastyle > scalastyle.txt
+echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver test:scalastyle >> scalastyle.txt
 # Check style with YARN built too
-echo -e "q\n" | build/sbt -Pyarn -Phadoop-2.2 scalastyle >> scalastyle.txt
-echo -e "q\n" | build/sbt -Pyarn -Phadoop-2.2 test:scalastyle >> scalastyle.txt
+echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 scalastyle >> scalastyle.txt
+echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 test:scalastyle >> scalastyle.txt
 
 ERRORS=$(cat scalastyle.txt | awk '{if($1~/error/)print}')
 rm scalastyle.txt
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
index cd19c33b90050..2103dca6b766f 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
@@ -57,7 +57,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft
   var checkpointStateMock: KinesisCheckpointState = _
   var currentClockMock: Clock = _
 
-  override def beforeFunction() = {
+  override def beforeFunction(): Unit = {
     receiverMock = mock[KinesisReceiver]
     checkpointerMock = mock[IRecordProcessorCheckpointer]
     checkpointClockMock = mock[ManualClock]

From 30f3f556f7161a49baf145c0cbba8c088b512a6a Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Thu, 21 May 2015 13:51:40 -0700
Subject: [PATCH 121/525] [SPARK-7763] [SPARK-7616] [SQL] Persists partition
 columns into metastore

Author: Yin Huai <yhuai@databricks.com>
Author: Cheng Lian <lian@databricks.com>

Closes #6285 from liancheng/spark-7763 and squashes the following commits:

bb2829d [Yin Huai] Fix hashCode.
d677f7d [Cheng Lian] Fixes Scala style issue
44b283f [Cheng Lian] Adds test case for SPARK-7616
6733276 [Yin Huai] Fix a bug that potentially causes https://issues.apache.org/jira/browse/SPARK-7616.
6cabf3c [Yin Huai] Update unit test.
7e02910 [Yin Huai] Use metastore partition columns and do not hijack maybePartitionSpec.
e9a03ec [Cheng Lian] Persists partition columns into metastore
---
 .../apache/spark/sql/parquet/newParquet.scala | 26 +++++++---
 .../apache/spark/sql/sources/commands.scala   |  2 +
 .../org/apache/spark/sql/sources/ddl.scala    | 19 +++++--
 .../apache/spark/sql/sources/interfaces.scala | 31 ++++++++++--
 .../apache/spark/sql/test/SQLTestUtils.scala  |  7 +++
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 49 +++++++++++++++----
 .../spark/sql/hive/execution/commands.scala   |  2 +
 .../spark/sql/hive/orc/OrcRelation.scala      | 35 ++++++++-----
 .../sql/hive/MetastoreDataSourcesSuite.scala  | 30 ++++++++++++
 .../apache/spark/sql/hive/parquetSuites.scala | 28 +++++------
 .../sql/sources/SimpleTextRelation.scala      |  2 +-
 .../sql/sources/hadoopFsRelationSuites.scala  | 36 ++++++++++++--
 12 files changed, 211 insertions(+), 56 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index c35b7eff82af5..32986aa3ecc20 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -49,8 +49,7 @@ private[sql] class DefaultSource extends HadoopFsRelationProvider {
       schema: Option[StructType],
       partitionColumns: Option[StructType],
       parameters: Map[String, String]): HadoopFsRelation = {
-    val partitionSpec = partitionColumns.map(PartitionSpec(_, Seq.empty))
-    new ParquetRelation2(paths, schema, partitionSpec, parameters)(sqlContext)
+    new ParquetRelation2(paths, schema, None, partitionColumns, parameters)(sqlContext)
   }
 }
 
@@ -118,12 +117,28 @@ private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext
 private[sql] class ParquetRelation2(
     override val paths: Array[String],
     private val maybeDataSchema: Option[StructType],
+    // This is for metastore conversion.
     private val maybePartitionSpec: Option[PartitionSpec],
+    override val userDefinedPartitionColumns: Option[StructType],
     parameters: Map[String, String])(
     val sqlContext: SQLContext)
   extends HadoopFsRelation(maybePartitionSpec)
   with Logging {
 
+  private[sql] def this(
+      paths: Array[String],
+      maybeDataSchema: Option[StructType],
+      maybePartitionSpec: Option[PartitionSpec],
+      parameters: Map[String, String])(
+      sqlContext: SQLContext) = {
+    this(
+      paths,
+      maybeDataSchema,
+      maybePartitionSpec,
+      maybePartitionSpec.map(_.partitionColumns),
+      parameters)(sqlContext)
+  }
+
   // Should we merge schemas from all Parquet part-files?
   private val shouldMergeSchemas =
     parameters.getOrElse(ParquetRelation2.MERGE_SCHEMA, "true").toBoolean
@@ -161,7 +176,7 @@ private[sql] class ParquetRelation2(
         Boolean.box(shouldMergeSchemas),
         paths.toSet,
         maybeDataSchema,
-        maybePartitionSpec)
+        partitionColumns)
     } else {
       Objects.hashCode(
         Boolean.box(shouldMergeSchemas),
@@ -169,7 +184,7 @@ private[sql] class ParquetRelation2(
         dataSchema,
         schema,
         maybeDataSchema,
-        maybePartitionSpec)
+        partitionColumns)
     }
   }
 
@@ -185,9 +200,6 @@ private[sql] class ParquetRelation2(
 
   override def sizeInBytes: Long = metadataCache.dataStatuses.map(_.getLen).sum
 
-  override def userDefinedPartitionColumns: Option[StructType] =
-    maybePartitionSpec.map(_.partitionColumns)
-
   override def prepareJobForWrite(job: Job): OutputWriterFactory = {
     val conf = ContextUtil.getConfiguration(job)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index d54dbb0831444..498f7538d4f55 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -93,6 +93,8 @@ private[sql] case class InsertIntoHadoopFsRelation(
       job.setOutputValueClass(classOf[Row])
       FileOutputFormat.setOutputPath(job, qualifiedOutputPath)
 
+      // We create a DataFrame by applying the schema of relation to the data to make sure.
+      // We are writing data based on the expected schema,
       val df = sqlContext.createDataFrame(
         DataFrame(sqlContext, query).queryExecution.toRdd,
         relation.schema,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index a13ab74852ff3..5e723122eeab1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -25,7 +25,7 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
-import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
+import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation}
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Row}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.RunnableCommand
@@ -245,12 +245,13 @@ private[sql] object ResolvedDataSource {
             SparkHadoopUtil.get.globPath(patternPath).map(_.toString).toArray
           }
 
-          val dataSchema = StructType(schema.filterNot(f => partitionColumns.contains(f.name)))
+          val dataSchema =
+            StructType(schema.filterNot(f => partitionColumns.contains(f.name))).asNullable
 
           dataSource.createRelation(
             sqlContext,
             paths,
-            Some(schema),
+            Some(dataSchema),
             maybePartitionsSchema,
             caseInsensitiveOptions)
         case dataSource: org.apache.spark.sql.sources.RelationProvider =>
@@ -320,10 +321,20 @@ private[sql] object ResolvedDataSource {
           Some(dataSchema.asNullable),
           Some(partitionColumnsSchema(data.schema, partitionColumns)),
           caseInsensitiveOptions)
+
+        // For partitioned relation r, r.schema's column ordering is different with the column
+        // ordering of data.logicalPlan. We need a Project to adjust the ordering.
+        // So, inside InsertIntoHadoopFsRelation, we can safely apply the schema of r.schema to
+        // the data.
+        val project =
+          Project(
+            r.schema.map(field => new UnresolvedAttribute(Seq(field.name))),
+            data.logicalPlan)
+
         sqlContext.executePlan(
           InsertIntoHadoopFsRelation(
             r,
-            data.logicalPlan,
+            project,
             partitionColumns.toArray,
             mode)).toRdd
         r
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index fcbac0d457950..61fc4e5c19998 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -28,7 +28,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SerializableWritable
-import org.apache.spark.sql._
+import org.apache.spark.sql.{Row, _}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
 import org.apache.spark.sql.types.{StructField, StructType}
@@ -120,11 +120,13 @@ trait HadoopFsRelationProvider {
    * Returns a new base relation with the given parameters, a user defined schema, and a list of
    * partition columns. Note: the parameters' keywords are case insensitive and this insensitivity
    * is enforced by the Map that is passed to the function.
+   *
+   * @param dataSchema Schema of data columns (i.e., columns that are not partition columns).
    */
   def createRelation(
       sqlContext: SQLContext,
       paths: Array[String],
-      schema: Option[StructType],
+      dataSchema: Option[StructType],
       partitionColumns: Option[StructType],
       parameters: Map[String, String]): HadoopFsRelation
 }
@@ -416,8 +418,29 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   final private[sql] def partitionSpec: PartitionSpec = {
     if (_partitionSpec == null) {
       _partitionSpec = maybePartitionSpec
-        .map(spec => spec.copy(partitionColumns = spec.partitionColumns.asNullable))
-        .orElse(userDefinedPartitionColumns.map(PartitionSpec(_, Array.empty[Partition])))
+        .flatMap {
+          case spec if spec.partitions.nonEmpty =>
+            Some(spec.copy(partitionColumns = spec.partitionColumns.asNullable))
+          case _ =>
+            None
+        }
+        .orElse {
+          // We only know the partition columns and their data types. We need to discover
+          // partition values.
+          userDefinedPartitionColumns.map { partitionSchema =>
+            val spec = discoverPartitions()
+            val castedPartitions = spec.partitions.map { case p @ Partition(values, path) =>
+              val literals = values.toSeq.zip(spec.partitionColumns.map(_.dataType)).map {
+                case (value, dataType) => Literal.create(value, dataType)
+              }
+              val castedValues = partitionSchema.zip(literals).map { case (field, literal) =>
+                Cast(literal, field.dataType).eval()
+              }
+              p.copy(values = Row.fromSeq(castedValues))
+            }
+            PartitionSpec(partitionSchema, castedPartitions)
+          }
+        }
         .getOrElse {
           if (sqlContext.conf.partitionDiscoveryEnabled()) {
             discoverPartitions()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala
index 75d290625ec38..ca66cdc48272d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala
@@ -78,4 +78,11 @@ trait SQLTestUtils {
   protected def withTempTable(tableName: String)(f: => Unit): Unit = {
     try f finally sqlContext.dropTempTable(tableName)
   }
+
+  /**
+   * Drops table `tableName` after calling `f`.
+   */
+  protected def withTable(tableName: String)(f: => Unit): Unit = {
+    try f finally sqlContext.sql(s"DROP TABLE IF EXISTS $tableName")
+  }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 2aa80b47a97e2..5b6840008f1ce 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -66,11 +66,11 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
         def schemaStringFromParts: Option[String] = {
           table.properties.get("spark.sql.sources.schema.numParts").map { numParts =>
             val parts = (0 until numParts.toInt).map { index =>
-              val part = table.properties.get(s"spark.sql.sources.schema.part.${index}").orNull
+              val part = table.properties.get(s"spark.sql.sources.schema.part.$index").orNull
               if (part == null) {
                 throw new AnalysisException(
-                  s"Could not read schema from the metastore because it is corrupted " +
-                  s"(missing part ${index} of the schema).")
+                  "Could not read schema from the metastore because it is corrupted " +
+                    s"(missing part $index of the schema, $numParts parts are expected).")
               }
 
               part
@@ -89,6 +89,11 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
         val userSpecifiedSchema =
           schemaString.map(s => DataType.fromJson(s).asInstanceOf[StructType])
 
+        // We only need names at here since userSpecifiedSchema we loaded from the metastore
+        // contains partition columns. We can always get datatypes of partitioning columns
+        // from userSpecifiedSchema.
+        val partitionColumns = table.partitionColumns.map(_.name)
+
         // It does not appear that the ql client for the metastore has a way to enumerate all the
         // SerDe properties directly...
         val options = table.serdeProperties
@@ -97,7 +102,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
           ResolvedDataSource(
             hive,
             userSpecifiedSchema,
-            Array.empty[String],
+            partitionColumns.toArray,
             table.properties("spark.sql.sources.provider"),
             options)
 
@@ -111,8 +116,8 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
   override def refreshTable(databaseName: String, tableName: String): Unit = {
     // refreshTable does not eagerly reload the cache. It just invalidate the cache.
     // Next time when we use the table, it will be populated in the cache.
-    // Since we also cache ParquetRealtions converted from Hive Parquet tables and
-    // adding converted ParquetRealtions into the cache is not defined in the load function
+    // Since we also cache ParquetRelations converted from Hive Parquet tables and
+    // adding converted ParquetRelations into the cache is not defined in the load function
     // of the cache (instead, we add the cache entry in convertToParquetRelation),
     // it is better at here to invalidate the cache to avoid confusing waring logs from the
     // cache loader (e.g. cannot find data source provider, which is only defined for
@@ -133,12 +138,17 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
   def createDataSourceTable(
       tableName: String,
       userSpecifiedSchema: Option[StructType],
+      partitionColumns: Array[String],
       provider: String,
       options: Map[String, String],
       isExternal: Boolean): Unit = {
     val (dbName, tblName) = processDatabaseAndTableName("default", tableName)
     val tableProperties = new scala.collection.mutable.HashMap[String, String]
     tableProperties.put("spark.sql.sources.provider", provider)
+
+    // Saves optional user specified schema.  Serialized JSON schema string may be too long to be
+    // stored into a single metastore SerDe property.  In this case, we split the JSON string and
+    // store each part as a separate SerDe property.
     if (userSpecifiedSchema.isDefined) {
       val threshold = conf.schemaStringLengthThreshold
       val schemaJsonString = userSpecifiedSchema.get.json
@@ -146,8 +156,29 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
       val parts = schemaJsonString.grouped(threshold).toSeq
       tableProperties.put("spark.sql.sources.schema.numParts", parts.size.toString)
       parts.zipWithIndex.foreach { case (part, index) =>
-        tableProperties.put(s"spark.sql.sources.schema.part.${index}", part)
+        tableProperties.put(s"spark.sql.sources.schema.part.$index", part)
+      }
+    }
+
+    val metastorePartitionColumns = userSpecifiedSchema.map { schema =>
+      val fields = partitionColumns.map(col => schema(col))
+      fields.map { field =>
+        HiveColumn(
+          name = field.name,
+          hiveType = HiveMetastoreTypes.toMetastoreType(field.dataType),
+          comment = "")
+      }.toSeq
+    }.getOrElse {
+      if (partitionColumns.length > 0) {
+        // The table does not have a specified schema, which means that the schema will be inferred
+        // when we load the table. So, we are not expecting partition columns and we will discover
+        // partitions when we load the table. However, if there are specified partition columns,
+        // we simplily ignore them and provide a warning message..
+        logWarning(
+          s"The schema and partitions of table $tableName will be inferred when it is loaded. " +
+            s"Specified partition columns (${partitionColumns.mkString(",")}) will be ignored.")
       }
+      Seq.empty[HiveColumn]
     }
 
     val tableType = if (isExternal) {
@@ -163,7 +194,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
         specifiedDatabase = Option(dbName),
         name = tblName,
         schema = Seq.empty,
-        partitionColumns = Seq.empty,
+        partitionColumns = metastorePartitionColumns,
         tableType = tableType,
         properties = tableProperties.toMap,
         serdeProperties = options))
@@ -199,7 +230,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
       val dataSourceTable =
         cachedDataSourceTables(QualifiedTableName(databaseName, tblName).toLowerCase)
       // Then, if alias is specified, wrap the table with a Subquery using the alias.
-      // Othersie, wrap the table with a Subquery using the table name.
+      // Otherwise, wrap the table with a Subquery using the table name.
       val withAlias =
         alias.map(a => Subquery(a, dataSourceTable)).getOrElse(
           Subquery(tableIdent.last, dataSourceTable))
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
index 6609763343752..0ba94d7b7c649 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
@@ -146,6 +146,7 @@ case class CreateMetastoreDataSource(
     hiveContext.catalog.createDataSourceTable(
       tableName,
       userSpecifiedSchema,
+      Array.empty[String],
       provider,
       optionsWithPath,
       isExternal)
@@ -244,6 +245,7 @@ case class CreateMetastoreDataSourceAsSelect(
       hiveContext.catalog.createDataSourceTable(
         tableName,
         Some(resolved.relation.schema),
+        partitionColumns,
         provider,
         optionsWithPath,
         isExternal)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index b69e14a179d0a..f03c4cd54e7e6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -48,15 +48,14 @@ private[sql] class DefaultSource extends HadoopFsRelationProvider {
   def createRelation(
       sqlContext: SQLContext,
       paths: Array[String],
-      schema: Option[StructType],
+      dataSchema: Option[StructType],
       partitionColumns: Option[StructType],
       parameters: Map[String, String]): HadoopFsRelation = {
     assert(
       sqlContext.isInstanceOf[HiveContext],
       "The ORC data source can only be used with HiveContext.")
 
-    val partitionSpec = partitionColumns.map(PartitionSpec(_, Seq.empty[Partition]))
-    OrcRelation(paths, parameters, schema, partitionSpec)(sqlContext)
+    new OrcRelation(paths, dataSchema, None, partitionColumns, parameters)(sqlContext)
   }
 }
 
@@ -136,23 +135,35 @@ private[orc] class OrcOutputWriter(
 }
 
 @DeveloperApi
-private[sql] case class OrcRelation(
+private[sql] class OrcRelation(
     override val paths: Array[String],
-    parameters: Map[String, String],
-    maybeSchema: Option[StructType] = None,
-    maybePartitionSpec: Option[PartitionSpec] = None)(
+    maybeDataSchema: Option[StructType],
+    maybePartitionSpec: Option[PartitionSpec],
+    override val userDefinedPartitionColumns: Option[StructType],
+    parameters: Map[String, String])(
     @transient val sqlContext: SQLContext)
   extends HadoopFsRelation(maybePartitionSpec)
   with Logging {
 
-  override val dataSchema: StructType = maybeSchema.getOrElse {
+  private[sql] def this(
+      paths: Array[String],
+      maybeDataSchema: Option[StructType],
+      maybePartitionSpec: Option[PartitionSpec],
+      parameters: Map[String, String])(
+      sqlContext: SQLContext) = {
+    this(
+      paths,
+      maybeDataSchema,
+      maybePartitionSpec,
+      maybePartitionSpec.map(_.partitionColumns),
+      parameters)(sqlContext)
+  }
+
+  override val dataSchema: StructType = maybeDataSchema.getOrElse {
     OrcFileOperator.readSchema(
       paths.head, Some(sqlContext.sparkContext.hadoopConfiguration))
   }
 
-  override def userDefinedPartitionColumns: Option[StructType] =
-    maybePartitionSpec.map(_.partitionColumns)
-
   override def needConversion: Boolean = false
 
   override def equals(other: Any): Boolean = other match {
@@ -169,7 +180,7 @@ private[sql] case class OrcRelation(
       paths.toSet,
       dataSchema,
       schema,
-      maybePartitionSpec)
+      partitionColumns)
   }
 
   override def buildScan(
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 30db976a3ae74..c4c7b634964ed 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -670,6 +670,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     catalog.createDataSourceTable(
       tableName = "wide_schema",
       userSpecifiedSchema = Some(schema),
+      partitionColumns = Array.empty[String],
       provider = "json",
       options = Map("path" -> "just a dummy path"),
       isExternal = false)
@@ -705,6 +706,35 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
     sql(s"drop table $tableName")
   }
 
+  test("Saving partition columns information") {
+    val df =
+      sparkContext.parallelize(1 to 10, 4).map { i =>
+        Tuple4(i, i + 1, s"str$i", s"str${i + 1}")
+      }.toDF("a", "b", "c", "d")
+
+    val tableName = s"partitionInfo_${System.currentTimeMillis()}"
+    df.write.format("parquet").partitionBy("d", "b").saveAsTable(tableName)
+    invalidateTable(tableName)
+    val metastoreTable = catalog.client.getTable("default", tableName)
+    val expectedPartitionColumns =
+      StructType(df.schema("d") :: df.schema("b") :: Nil)
+    val actualPartitionColumns =
+      StructType(
+        metastoreTable.partitionColumns.map(c =>
+          StructField(c.name, HiveMetastoreTypes.toDataType(c.hiveType))))
+    // Make sure partition columns are correctly stored in metastore.
+    assert(
+      expectedPartitionColumns.sameType(actualPartitionColumns),
+      s"Partitions columns stored in metastore $actualPartitionColumns is not the " +
+        s"partition columns defined by the saveAsTable operation $expectedPartitionColumns.")
+
+    // Check the content of the saved table.
+    checkAnswer(
+      table(tableName).selectExpr("c", "b", "d", "a"),
+      df.selectExpr("c", "b", "d", "a").collect())
+
+    sql(s"drop table $tableName")
+  }
 
   test("insert into a table") {
     def createDF(from: Int, to: Int): DataFrame =
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 1da990bc959ba..223ba65f47b90 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -435,9 +435,9 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
   }
 
   test("Caching converted data source Parquet Relations") {
-    def checkCached(tableIdentifer: catalog.QualifiedTableName): Unit = {
+    def checkCached(tableIdentifier: catalog.QualifiedTableName): Unit = {
       // Converted test_parquet should be cached.
-      catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) match {
+      catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) match {
         case null => fail("Converted test_parquet should be cached in the cache.")
         case logical @ LogicalRelation(parquetRelation: ParquetRelation2) => // OK
         case other =>
@@ -463,30 +463,30 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
         |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
       """.stripMargin)
 
-    var tableIdentifer = catalog.QualifiedTableName("default", "test_insert_parquet")
+    var tableIdentifier = catalog.QualifiedTableName("default", "test_insert_parquet")
 
     // First, make sure the converted test_parquet is not cached.
-    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
     // Table lookup will make the table cached.
     table("test_insert_parquet")
-    checkCached(tableIdentifer)
+    checkCached(tableIdentifier)
     // For insert into non-partitioned table, we will do the conversion,
     // so the converted test_insert_parquet should be cached.
     invalidateTable("test_insert_parquet")
-    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
     sql(
       """
         |INSERT INTO TABLE test_insert_parquet
         |select a, b from jt
       """.stripMargin)
-    checkCached(tableIdentifer)
+    checkCached(tableIdentifier)
     // Make sure we can read the data.
     checkAnswer(
       sql("select * from test_insert_parquet"),
       sql("select a, b from jt").collect())
     // Invalidate the cache.
     invalidateTable("test_insert_parquet")
-    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
 
     // Create a partitioned table.
     sql(
@@ -503,8 +503,8 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
         |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
       """.stripMargin)
 
-    tableIdentifer = catalog.QualifiedTableName("default", "test_parquet_partitioned_cache_test")
-    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
+    tableIdentifier = catalog.QualifiedTableName("default", "test_parquet_partitioned_cache_test")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
     sql(
       """
         |INSERT INTO TABLE test_parquet_partitioned_cache_test
@@ -513,18 +513,18 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
       """.stripMargin)
     // Right now, insert into a partitioned Parquet is not supported in data source Parquet.
     // So, we expect it is not cached.
-    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
     sql(
       """
         |INSERT INTO TABLE test_parquet_partitioned_cache_test
         |PARTITION (date='2015-04-02')
         |select a, b from jt
       """.stripMargin)
-    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
 
     // Make sure we can cache the partitioned table.
     table("test_parquet_partitioned_cache_test")
-    checkCached(tableIdentifer)
+    checkCached(tableIdentifier)
     // Make sure we can read the data.
     checkAnswer(
       sql("select STRINGField, date, intField from test_parquet_partitioned_cache_test"),
@@ -536,7 +536,7 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
         """.stripMargin).collect())
 
     invalidateTable("test_parquet_partitioned_cache_test")
-    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
 
     sql("DROP TABLE test_insert_parquet")
     sql("DROP TABLE test_parquet_partitioned_cache_test")
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
index 09eed6646c55a..2d69b89fd9a9c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
@@ -99,7 +99,7 @@ class SimpleTextRelation(
   }
 
   override def hashCode(): Int =
-    Objects.hashCode(paths, maybeDataSchema, dataSchema)
+    Objects.hashCode(paths, maybeDataSchema, dataSchema, partitionColumns)
 
   override def buildScan(inputStatuses: Array[FileStatus]): RDD[Row] = {
     val fields = dataSchema.map(_.dataType)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index ad4a4826c6b45..c7c8bcd27fbde 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -22,7 +22,6 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
 import org.apache.spark.sql.hive.test.TestHive
-import org.apache.spark.sql.parquet.ParquetTest
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 
@@ -237,10 +236,6 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
     }
   }
 
-  def withTable(tableName: String)(f: => Unit): Unit = {
-    try f finally sql(s"DROP TABLE $tableName")
-  }
-
   test("saveAsTable()/load() - non-partitioned table - Overwrite") {
     testDF.write.format(dataSourceName).mode(SaveMode.Overwrite)
       .option("dataSchema", dataSchema.json)
@@ -444,6 +439,23 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       checkAnswer(df, partitionedTestDF.collect())
     }
   }
+
+  test("Partition column type casting") {
+    withTempPath { file =>
+      val input = partitionedTestDF.select('a, 'b, 'p1.cast(StringType).as('ps), 'p2)
+
+      input
+        .write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .partitionBy("ps", "p2")
+        .saveAsTable("t")
+
+      withTempTable("t") {
+        checkAnswer(table("t"), input.collect())
+      }
+    }
+  }
 }
 
 class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
@@ -504,4 +516,18 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
           .load(file.getCanonicalPath))
     }
   }
+
+  test("SPARK-7616: adjust column name order accordingly when saving partitioned table") {
+    val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c")
+
+    df.write
+      .format("parquet")
+      .mode(SaveMode.Overwrite)
+      .partitionBy("c", "a")
+      .saveAsTable("t")
+
+    withTable("t") {
+      checkAnswer(table("t"), df.select('b, 'c, 'a).collect())
+    }
+  }
 }

From 3d0cccc85850ca9c79f3e5ff7395bd04d212b063 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 21 May 2015 14:08:20 -0700
Subject: [PATCH 122/525] [SPARK-7478] [SQL] Added SQLContext.getOrCreate

Having a SQLContext singleton would make it easier for applications to use a lazily instantiated single shared instance of SQLContext when needed. It would avoid problems like

1. In REPL/notebook environment, rerunning the line {{val sqlContext = new SQLContext}} multiple times created different contexts while overriding the reference to previous context, leading to issues like registered temp tables going missing.

2. In Streaming, creating SQLContext directly leads to serialization/deserialization issues when attempting to recover from DStream checkpoints. See [SPARK-6770]. Also to get around this problem I had to suggest creating a singleton instance - https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala

This can be solved by {{SQLContext.getOrCreate}} which get or creates a new singleton instance of SQLContext using either a given SparkContext or a given SparkConf.

rxin marmbrus

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6006 from tdas/SPARK-7478 and squashes the following commits:

25f4da9 [Tathagata Das] Addressed comments.
79fe069 [Tathagata Das] Added comments.
c66ca76 [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-7478
48adb14 [Tathagata Das] Removed HiveContext.getOrCreate
bf8cf50 [Tathagata Das] Fix more bug
dec5594 [Tathagata Das] Fixed bug
b4e9721 [Tathagata Das] Remove unnecessary import
4ef513b [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-7478
d3ea8e4 [Tathagata Das] Added HiveContext
83bc950 [Tathagata Das] Updated tests
f82ae81 [Tathagata Das] Fixed test
bc72868 [Tathagata Das] Added SQLContext.getOrCreate
---
 .../org/apache/spark/sql/SQLContext.scala     | 47 +++++++++++++++++-
 .../apache/spark/sql/SQLContextSuite.scala    | 49 +++++++++++++++++++
 2 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 304e958192bb9..1ea596dddff02 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql
 
 import java.beans.Introspector
 import java.util.Properties
+import java.util.concurrent.atomic.AtomicReference
 
 import scala.collection.JavaConversions._
 import scala.collection.immutable
@@ -1270,9 +1271,53 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   ////////////////////////////////////////////////////////////////////////////
   ////////////////////////////////////////////////////////////////////////////
-  // End of eeprecated methods
+  // End of deprecated methods
   ////////////////////////////////////////////////////////////////////////////
   ////////////////////////////////////////////////////////////////////////////
+
+
+  // Register a succesfully instantiatd context to the singleton. This should be at the end of
+  // the class definition so that the singleton is updated only if there is no exception in the
+  // construction of the instance.
+  SQLContext.setLastInstantiatedContext(self)
 }
 
+/**
+ * This SQLContext object contains utility functions to create a singleton SQLContext instance,
+ * or to get the last created SQLContext instance.
+ */
+object SQLContext {
+
+  private val INSTANTIATION_LOCK = new Object()
+
+  /**
+   * Reference to the last created SQLContext.
+   */
+  @transient private val lastInstantiatedContext = new AtomicReference[SQLContext]()
+
+  /**
+   * Get the singleton SQLContext if it exists or create a new one using the given SparkContext.
+   * This function can be used to create a singleton SQLContext object that can be shared across
+   * the JVM.
+   */
+  def getOrCreate(sparkContext: SparkContext): SQLContext = {
+    INSTANTIATION_LOCK.synchronized {
+      if (lastInstantiatedContext.get() == null) {
+        new SQLContext(sparkContext)
+      }
+    }
+    lastInstantiatedContext.get()
+  }
+
+  private[sql] def clearLastInstantiatedContext(): Unit = {
+    INSTANTIATION_LOCK.synchronized {
+      lastInstantiatedContext.set(null)
+    }
+  }
 
+  private[sql] def setLastInstantiatedContext(sqlContext: SQLContext): Unit = {
+    INSTANTIATION_LOCK.synchronized {
+      lastInstantiatedContext.set(sqlContext)
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala
new file mode 100644
index 0000000000000..f186bc1c18123
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala
@@ -0,0 +1,49 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql
+
+import org.scalatest.{BeforeAndAfterAll, FunSuite}
+
+import org.apache.spark.sql.test.TestSQLContext
+
+class SQLContextSuite extends FunSuite with BeforeAndAfterAll {
+
+  private val testSqlContext = TestSQLContext
+  private val testSparkContext = TestSQLContext.sparkContext
+
+  override def afterAll(): Unit = {
+    SQLContext.setLastInstantiatedContext(testSqlContext)
+  }
+
+  test("getOrCreate instantiates SQLContext") {
+    SQLContext.clearLastInstantiatedContext()
+    val sqlContext = SQLContext.getOrCreate(testSparkContext)
+    assert(sqlContext != null, "SQLContext.getOrCreate returned null")
+    assert(SQLContext.getOrCreate(testSparkContext).eq(sqlContext),
+      "SQLContext created by SQLContext.getOrCreate not returned by SQLContext.getOrCreate")
+  }
+
+  test("getOrCreate gets last explicitly instantiated SQLContext") {
+    SQLContext.clearLastInstantiatedContext()
+    val sqlContext = new SQLContext(testSparkContext)
+    assert(SQLContext.getOrCreate(testSparkContext) != null,
+      "SQLContext.getOrCreate after explicitly created SQLContext returned null")
+    assert(SQLContext.getOrCreate(testSparkContext).eq(sqlContext),
+      "SQLContext.getOrCreate after explicitly created SQLContext did not return the context")
+  }
+}

From 6b18cdc1b1284b1d48d637d06a1e64829aeb6202 Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Thu, 21 May 2015 14:08:57 -0700
Subject: [PATCH 123/525] [SPARK-7711] Add a startTime property to match the
 corresponding one in Scala

Author: Holden Karau <holden@pigscanfly.ca>

Closes #6275 from holdenk/SPARK-771-startTime-is-missing-from-pyspark and squashes the following commits:

06662dc [Holden Karau] add mising blank line for style checks
7a87410 [Holden Karau] add back missing newline
7a7876b [Holden Karau] Add a startTime property to match the corresponding one in the Scala SparkContext
---
 python/pyspark/context.py | 5 +++++
 python/pyspark/tests.py   | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 1f2b40b29fafa..aeb7ad4f2f83e 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -291,6 +291,11 @@ def version(self):
         """
         return self._jsc.version()
 
+    @property
+    def startTime(self):
+        """Return the epoch time when the Spark Context was started."""
+        return self._jsc.startTime()
+
     @property
     def defaultParallelism(self):
         """
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index d8e319994cc96..f9fb37f7fc139 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -1809,6 +1809,10 @@ def run():
 
             sc.stop()
 
+    def test_startTime(self):
+        with SparkContext() as sc:
+            self.assertGreater(sc.startTime, 0)
+
 
 @unittest.skipIf(not _have_scipy, "SciPy not installed")
 class SciPyTests(PySparkTestCase):

From 5287eec5a6948c0c6e0baaebf35f512324c0679a Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Thu, 21 May 2015 14:33:11 -0700
Subject: [PATCH 124/525] [SPARK-7718] [SQL] Speed up partitioning by avoiding
 closure cleaning

According to yhuai we spent 6-7 seconds cleaning closures in a partitioning job that takes 12 seconds. Since we provide these closures in Spark we know for sure they are serializable, so we can bypass the cleaning.

Author: Andrew Or <andrew@databricks.com>

Closes #6256 from andrewor14/sql-partition-speed-up and squashes the following commits:

a82b451 [Andrew Or] Fix style
10f7e3e [Andrew Or] Avoid getting call sites and cleaning closures
17e2943 [Andrew Or] Merge branch 'master' of github.com:apache/spark into sql-partition-speed-up
523f042 [Andrew Or] Skip unnecessary Utils.getCallSites too
f7fe143 [Andrew Or] Avoid unnecessary closure cleaning
---
 .../scala/org/apache/spark/util/Utils.scala   | 18 ++++
 .../apache/spark/sql/parquet/newParquet.scala | 98 ++++++++++---------
 .../sql/sources/DataSourceStrategy.scala      | 18 +++-
 .../spark/sql/sources/SqlNewHadoopRDD.scala   |  4 -
 4 files changed, 83 insertions(+), 55 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 6a7d1fae3320e..b7a2473dfe920 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2201,6 +2201,24 @@ private[spark] object Utils extends Logging {
     shutdownHooks.remove(ref)
   }
 
+  /**
+   * To avoid calling `Utils.getCallSite` for every single RDD we create in the body,
+   * set a dummy call site that RDDs use instead. This is for performance optimization.
+   */
+  def withDummyCallSite[T](sc: SparkContext)(body: => T): T = {
+    val oldShortCallSite = sc.getLocalProperty(CallSite.SHORT_FORM)
+    val oldLongCallSite = sc.getLocalProperty(CallSite.LONG_FORM)
+    try {
+      sc.setLocalProperty(CallSite.SHORT_FORM, "")
+      sc.setLocalProperty(CallSite.LONG_FORM, "")
+      body
+    } finally {
+      // Restore the old ones here
+      sc.setLocalProperty(CallSite.SHORT_FORM, oldShortCallSite)
+      sc.setLocalProperty(CallSite.LONG_FORM, oldLongCallSite)
+    }
+  }
+
 }
 
 private [util] class SparkShutdownHookManager {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 32986aa3ecc20..cb1e60883df1e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -33,6 +33,7 @@ import parquet.hadoop._
 import parquet.hadoop.metadata.CompressionCodecName
 import parquet.hadoop.util.ContextUtil
 
+import org.apache.spark.{Partition => SparkPartition, SerializableWritable, Logging, SparkException}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD._
@@ -40,7 +41,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.sql.{Row, SQLConf, SQLContext}
-import org.apache.spark.{Partition => SparkPartition, SparkEnv, SerializableWritable, Logging, SparkException}
+import org.apache.spark.util.Utils
 
 private[sql] class DefaultSource extends HadoopFsRelationProvider {
   override def createRelation(
@@ -264,57 +265,58 @@ private[sql] class ParquetRelation2(
 
     val footers = inputFiles.map(f => metadataCache.footers(f.getPath))
 
-    // TODO Stop using `FilteringParquetRowInputFormat` and overriding `getPartition`.
-    // After upgrading to Parquet 1.6.0, we should be able to stop caching `FileStatus` objects and
-    // footers.  Especially when a global arbitrative schema (either from metastore or data source
-    // DDL) is available.
-    new SqlNewHadoopRDD(
-      sc = sqlContext.sparkContext,
-      broadcastedConf = broadcastedConf,
-      initDriverSideJobFuncOpt = Some(setInputPaths),
-      initLocalJobFuncOpt = Some(initLocalJobFuncOpt),
-      inputFormatClass = classOf[FilteringParquetRowInputFormat],
-      keyClass = classOf[Void],
-      valueClass = classOf[Row]) {
-
-      val cacheMetadata = useMetadataCache
-
-      @transient val cachedStatuses = inputFiles.map { f =>
-        // In order to encode the authority of a Path containing special characters such as /,
-        // we need to use the string returned by the URI of the path to create a new Path.
-        val pathWithAuthority = new Path(f.getPath.toUri.toString)
-
-        new FileStatus(
-          f.getLen, f.isDir, f.getReplication, f.getBlockSize, f.getModificationTime,
-          f.getAccessTime, f.getPermission, f.getOwner, f.getGroup, pathWithAuthority)
-      }.toSeq
-
-      @transient val cachedFooters = footers.map { f =>
-        // In order to encode the authority of a Path containing special characters such as /,
-        // we need to use the string returned by the URI of the path to create a new Path.
-        new Footer(new Path(f.getFile.toUri.toString), f.getParquetMetadata)
-      }.toSeq
-
-      // Overridden so we can inject our own cached files statuses.
-      override def getPartitions: Array[SparkPartition] = {
-        val inputFormat = if (cacheMetadata) {
-          new FilteringParquetRowInputFormat {
-            override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatuses
-
-            override def getFooters(jobContext: JobContext): JList[Footer] = cachedFooters
+    Utils.withDummyCallSite(sqlContext.sparkContext) {
+      // TODO Stop using `FilteringParquetRowInputFormat` and overriding `getPartition`.
+      // After upgrading to Parquet 1.6.0, we should be able to stop caching `FileStatus` objects
+      // and footers. Especially when a global arbitrative schema (either from metastore or data
+      // source DDL) is available.
+      new SqlNewHadoopRDD(
+        sc = sqlContext.sparkContext,
+        broadcastedConf = broadcastedConf,
+        initDriverSideJobFuncOpt = Some(setInputPaths),
+        initLocalJobFuncOpt = Some(initLocalJobFuncOpt),
+        inputFormatClass = classOf[FilteringParquetRowInputFormat],
+        keyClass = classOf[Void],
+        valueClass = classOf[Row]) {
+
+        val cacheMetadata = useMetadataCache
+
+        @transient val cachedStatuses = inputFiles.map { f =>
+          // In order to encode the authority of a Path containing special characters such as /,
+          // we need to use the string returned by the URI of the path to create a new Path.
+          val pathWithAuthority = new Path(f.getPath.toUri.toString)
+
+          new FileStatus(
+            f.getLen, f.isDir, f.getReplication, f.getBlockSize, f.getModificationTime,
+            f.getAccessTime, f.getPermission, f.getOwner, f.getGroup, pathWithAuthority)
+        }.toSeq
+
+        @transient val cachedFooters = footers.map { f =>
+          // In order to encode the authority of a Path containing special characters such as /,
+          // we need to use the string returned by the URI of the path to create a new Path.
+          new Footer(new Path(f.getFile.toUri.toString), f.getParquetMetadata)
+        }.toSeq
+
+        // Overridden so we can inject our own cached files statuses.
+        override def getPartitions: Array[SparkPartition] = {
+          val inputFormat = if (cacheMetadata) {
+            new FilteringParquetRowInputFormat {
+              override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatuses
+              override def getFooters(jobContext: JobContext): JList[Footer] = cachedFooters
+            }
+          } else {
+            new FilteringParquetRowInputFormat
           }
-        } else {
-          new FilteringParquetRowInputFormat
-        }
 
-        val jobContext = newJobContext(getConf(isDriverSide = true), jobId)
-        val rawSplits = inputFormat.getSplits(jobContext)
+          val jobContext = newJobContext(getConf(isDriverSide = true), jobId)
+          val rawSplits = inputFormat.getSplits(jobContext)
 
-        Array.tabulate[SparkPartition](rawSplits.size) { i =>
-          new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
+          Array.tabulate[SparkPartition](rawSplits.size) { i =>
+            new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
+          }
         }
-      }
-    }.values
+      }.values
+    }
   }
 
   private class MetadataCache {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index 550090d22d551..c03649d00bbae 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.sql.sources
 
-import org.apache.spark.{SerializableWritable, Logging}
+import org.apache.spark.{Logging, SerializableWritable, TaskContext}
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.rdd.{RDD, UnionRDD}
+import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
 import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.types.{StringType, StructType, UTF8String}
 import org.apache.spark.sql.{SaveMode, Strategy, execution, sources}
+import org.apache.spark.util.Utils
 
 /**
  * A Strategy for planning scans over data sources defined using the sources API.
@@ -197,7 +198,10 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         }
       }
 
-      dataRows.mapPartitions { iterator =>
+      // Since we know for sure that this closure is serializable, we can avoid the overhead
+      // of cleaning a closure for each RDD by creating our own MapPartitionsRDD. Functionally
+      // this is equivalent to calling `dataRows.mapPartitions(mapPartitionsFunc)` (SPARK-7718).
+      val mapPartitionsFunc = (_: TaskContext, _: Int, iterator: Iterator[Row]) => {
         val dataTypes = requiredColumns.map(schema(_).dataType)
         val mutableRow = new SpecificMutableRow(dataTypes)
         iterator.map { dataRow =>
@@ -209,6 +213,14 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
           mutableRow.asInstanceOf[expressions.Row]
         }
       }
+
+      // This is an internal RDD whose call site the user should not be concerned with
+      // Since we create many of these (one per partition), the time spent on computing
+      // the call site may add up.
+      Utils.withDummyCallSite(dataRows.sparkContext) {
+        new MapPartitionsRDD(dataRows, mapPartitionsFunc, preservesPartitioning = false)
+      }
+
     } else {
       dataRows
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
index 0c7bb6e50cd98..a74a98631da35 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
@@ -75,10 +75,6 @@ private[sql] class SqlNewHadoopRDD[K, V](
   with SparkHadoopMapReduceUtil
   with Logging {
 
-  if (initLocalJobFuncOpt.isDefined) {
-    sc.clean(initLocalJobFuncOpt.get)
-  }
-
   protected def getJob(): Job = {
     val conf: Configuration = broadcastedConf.value.value
     // "new Job" will make a copy of the conf. Then, it is

From 5a3c04bb92e21bd221a75c4ae13a71f7d4716b44 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Thu, 21 May 2015 23:12:00 +0100
Subject: [PATCH 125/525] [SPARK-7800] isDefined should not marked too early in
 putNewKey

JIRA: https://issues.apache.org/jira/browse/SPARK-7800

`isDefined` is marked as true twice in `Location.putNewKey`. The first one is unnecessary and will cause problem because it is too early and before some assert checking. E.g., if an attempt with incorrect `keyLengthBytes` marks `isDefined` as true, the location can not be used later.

ping JoshRosen

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6324 from viirya/dup_isdefined and squashes the following commits:

cbfe03b [Liang-Chi Hsieh] isDefined should not marked too early in putNewKey.
---
 .../main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index bd4ca74cc7764..0b4d8d286f5f9 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -429,7 +429,6 @@ public void putNewKey(
         long valueBaseOffset,
         int valueLengthBytes) {
       assert (!isDefined) : "Can only set value once for a key";
-      isDefined = true;
       assert (keyLengthBytes % 8 == 0);
       assert (valueLengthBytes % 8 == 0);
       if (size == MAX_CAPACITY) {

From 147b6be3b6d464dfc14836c08e690ab021a600de Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Thu, 21 May 2015 15:40:58 -0700
Subject: [PATCH 126/525] [BUILD] Always run SQL tests in master build.

Seems our master build does not run HiveCompatibilitySuite (because _RUN_SQL_TESTS is not set). This PR introduces a property `AMP_JENKINS_PRB` to differentiate a PR build and a regular build. If a build is a regular one, we always set _RUN_SQL_TESTS to true.

cc JoshRosen nchammas

Author: Yin Huai <yhuai@databricks.com>

Closes #5955 from yhuai/runSQLTests and squashes the following commits:

3d399bc [Yin Huai] Always run SQL tests in master build.
---
 dev/run-tests         | 41 ++++++++++++++++++++++++-----------------
 dev/run-tests-jenkins |  2 ++
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/dev/run-tests b/dev/run-tests
index 44d802782c4a4..b444e74706b65 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -82,24 +82,31 @@ export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl"
 if [ -n "$AMPLAB_JENKINS" ]; then
   git fetch origin master:master
 
-  sql_diffs=$(
-    git diff --name-only master \
-    | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
-  )
-
-  non_sql_diffs=$(
-    git diff --name-only master \
-    | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
-  )
-
-  if [ -n "$sql_diffs" ]; then
-    echo "[info] Detected changes in SQL. Will run Hive test suite."
-    _RUN_SQL_TESTS=true
-
-    if [ -z "$non_sql_diffs" ]; then
-      echo "[info] Detected no changes except in SQL. Will only run SQL tests."
-      _SQL_TESTS_ONLY=true
+  # AMP_JENKINS_PRB indicates if the current build is a pull request build.
+  if [ -n "$AMP_JENKINS_PRB" ]; then
+    # It is a pull request build.
+    sql_diffs=$(
+      git diff --name-only master \
+      | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
+    )
+
+    non_sql_diffs=$(
+      git diff --name-only master \
+      | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
+    )
+
+    if [ -n "$sql_diffs" ]; then
+      echo "[info] Detected changes in SQL. Will run Hive test suite."
+      _RUN_SQL_TESTS=true
+
+      if [ -z "$non_sql_diffs" ]; then
+        echo "[info] Detected no changes except in SQL. Will only run SQL tests."
+        _SQL_TESTS_ONLY=true
+      fi
     fi
+  else
+    # It is a regular build. We should run SQL tests.
+    _RUN_SQL_TESTS=true
   fi
 fi
 
diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins
index f452ab66efcd8..8b2a44fd72ba5 100755
--- a/dev/run-tests-jenkins
+++ b/dev/run-tests-jenkins
@@ -185,6 +185,8 @@ done
 
 # run tests
 {
+  # Marks this build is a pull request build.
+  export AMP_JENKINS_PRB=true
   timeout "${TESTS_TIMEOUT}" ./dev/run-tests
   test_result="$?"
 

From 347b50106bd1bcd40049f1ca29cefbb0baf53413 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Fri, 22 May 2015 07:10:26 +0800
Subject: [PATCH 127/525] [SPARK-7737] [SQL] Use leaf dirs having data files to
 discover partitions.

https://issues.apache.org/jira/browse/SPARK-7737

cc liancheng

Author: Yin Huai <yhuai@databricks.com>

Closes #6329 from yhuai/spark-7737 and squashes the following commits:

7e0dfc7 [Yin Huai] Use leaf dirs having data files to discover partitions.
---
 .../org/apache/spark/sql/sources/interfaces.scala      |  7 ++-----
 .../sql/parquet/ParquetPartitionDiscoverySuite.scala   | 10 +++++++++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 61fc4e5c19998..aaabbadcd651b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -377,8 +377,6 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
 
     var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]
 
-    var leafDirs = mutable.Map.empty[Path, FileStatus]
-
     def refresh(): Unit = {
       def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
         val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
@@ -386,7 +384,6 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
         files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
       }
 
-      leafDirs.clear()
       leafFiles.clear()
 
       // We don't filter files/directories like _temporary/_SUCCESS here, as specific data sources
@@ -399,7 +396,6 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
       }
 
       val (dirs, files) = statuses.partition(_.isDir)
-      leafDirs ++= dirs.map(d => d.getPath -> d).toMap
       leafFiles ++= files.map(f => f.getPath -> f).toMap
       leafDirToChildrenFiles ++= files.groupBy(_.getPath.getParent)
     }
@@ -484,7 +480,8 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   }
 
   private def discoverPartitions(): PartitionSpec = {
-    val leafDirs = fileStatusCache.leafDirs.keys.toSeq
+    // We use leaf dirs containing data files to discover the schema.
+    val leafDirs = fileStatusCache.leafDirToChildrenFiles.keys.toSeq
     PartitioningUtils.parsePartitions(leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME)
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 907dbb0119b40..90d4528efca48 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -16,6 +16,8 @@
  */
 package org.apache.spark.sql.parquet
 
+import java.io.File
+
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.hadoop.fs.Path
@@ -175,11 +177,17 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         pi <- Seq(1, 2)
         ps <- Seq("foo", "bar")
       } {
+        val dir = makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)
         makeParquetFile(
           (1 to 10).map(i => ParquetData(i, i.toString)),
-          makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
+          dir)
+        // Introduce _temporary dir to test the robustness of the schema discovery process.
+        new File(dir.toString, "_temporary").mkdir()
       }
+      // Introduce _temporary dir to the base dir the robustness of the schema discovery process.
+      new File(base.getCanonicalPath, "_temporary").mkdir()
 
+      println("load the partitioned table")
       read.parquet(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {

From d68ea24d60ce1aa55b06a8c107f42544d696eb41 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 21 May 2015 17:41:31 -0700
Subject: [PATCH 128/525] [SPARK-7776] [STREAMING] Added shutdown hook to
 StreamingContext

Shutdown hook to stop SparkContext was added recently. This results in ugly errors when a streaming application is terminated by ctrl-C.

```
Exception in thread "Thread-27" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:736)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:735)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:735)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1468)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1403)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1642)
	at org.apache.spark.SparkContext$$anonfun$3.apply$mcV$sp(SparkContext.scala:559)
	at org.apache.spark.util.SparkShutdownHook.run(Utils.scala:2266)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(Utils.scala:2236)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(Utils.scala:2236)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(Utils.scala:2236)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1764)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(Utils.scala:2236)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(Utils.scala:2236)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(Utils.scala:2236)
	at scala.util.Try$.apply(Try.scala:161)
	at org.apache.spark.util.SparkShutdownHookManager.runAll(Utils.scala:2236)
	at org.apache.spark.util.SparkShutdownHookManager$$anon$6.run(Utils.scala:2218)
	at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
```

This is because the Spark's shutdown hook stops the context, and the streaming jobs fail in the middle. The correct solution is to stop the streaming context before the spark context. This PR adds the shutdown hook to do so with a priority higher than the SparkContext's shutdown hooks priority.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6307 from tdas/SPARK-7776 and squashes the following commits:

e3d5475 [Tathagata Das] Added conf to specify graceful shutdown
4c18652 [Tathagata Das] Added shutdown hook to StreamingContxt.
---
 .../spark/streaming/StreamingContext.scala     | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 160fc42c57d18..7b77d447ce6df 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -42,7 +42,7 @@ import org.apache.spark.streaming.dstream._
 import org.apache.spark.streaming.receiver.{ActorReceiver, ActorSupervisorStrategy, Receiver}
 import org.apache.spark.streaming.scheduler.{JobScheduler, StreamingListener}
 import org.apache.spark.streaming.ui.{StreamingJobProgressListener, StreamingTab}
-import org.apache.spark.util.CallSite
+import org.apache.spark.util.{CallSite, Utils}
 
 /**
  * Main entry point for Spark Streaming functionality. It provides methods used to create
@@ -201,6 +201,8 @@ class StreamingContext private[streaming] (
 
   private val startSite = new AtomicReference[CallSite](null)
 
+  private var shutdownHookRef: AnyRef = _
+
   /**
    * Return the associated Spark context
    */
@@ -584,6 +586,8 @@ class StreamingContext private[streaming] (
           state = StreamingContextState.ACTIVE
           StreamingContext.setActiveContext(this)
         }
+        shutdownHookRef = Utils.addShutdownHook(
+          StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
         logInfo("StreamingContext started")
       case ACTIVE =>
         logWarning("StreamingContext has already been started")
@@ -660,6 +664,9 @@ class StreamingContext private[streaming] (
           uiTab.foreach(_.detach())
           StreamingContext.setActiveContext(null)
           waiter.notifyStop()
+          if (shutdownHookRef != null) {
+            Utils.removeShutdownHook(shutdownHookRef)
+          }
           logInfo("StreamingContext stopped successfully")
       }
       // Even if we have already stopped, we still need to attempt to stop the SparkContext because
@@ -670,6 +677,13 @@ class StreamingContext private[streaming] (
       state = STOPPED
     }
   }
+
+  private def stopOnShutdown(): Unit = {
+    val stopGracefully = conf.getBoolean("spark.streaming.stopGracefullyOnShutdown", false)
+    logInfo(s"Invoking stop(stopGracefully=$stopGracefully) from shutdown hook")
+    // Do not stop SparkContext, let its own shutdown hook stop it
+    stop(stopSparkContext = false, stopGracefully = stopGracefully)
+  }
 }
 
 /**
@@ -685,6 +699,8 @@ object StreamingContext extends Logging {
    */
   private val ACTIVATION_LOCK = new Object()
 
+  private val SHUTDOWN_HOOK_PRIORITY = Utils.SPARK_CONTEXT_SHUTDOWN_PRIORITY + 1
+
   private val activeContext = new AtomicReference[StreamingContext](null)
 
   private def assertNoOtherContextIsActive(): Unit = {

From 17791a58159b3e4619d0367f54a4c5332342658b Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 21 May 2015 17:43:08 -0700
Subject: [PATCH 129/525] [SPARK-7783] [SQL] [PySpark] add
 DataFrame.rollup/cube in Python

Author: Davies Liu <davies@databricks.com>

Closes #6311 from davies/rollup and squashes the following commits:

0261db1 [Davies Liu] use @since
a51ca6b [Davies Liu] Merge branch 'master' of github.com:apache/spark into rollup
8ad5af4 [Davies Liu] Update dataframe.py
ade3841 [Davies Liu] add DataFrame.rollup/cube in Python
---
 python/pyspark/sql/dataframe.py | 48 +++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 3fc7d0048edf6..132db90e69f59 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -801,9 +801,53 @@ def groupBy(self, *cols):
         >>> df.groupBy(['name', df.age]).count().collect()
         [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
         """
-        jdf = self._jdf.groupBy(self._jcols(*cols))
+        jgd = self._jdf.groupBy(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
-        return GroupedData(jdf, self.sql_ctx)
+        return GroupedData(jgd, self.sql_ctx)
+
+    @since(1.4)
+    def rollup(self, *cols):
+        """
+        Create a multi-dimensional rollup for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.rollup('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+        """
+        jgd = self._jdf.rollup(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jgd, self.sql_ctx)
+
+    @since(1.4)
+    def cube(self, *cols):
+        """
+        Create a multi-dimensional cube for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.cube('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        | null|   2|    1|
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|   5|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+        """
+        jgd = self._jdf.cube(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jgd, self.sql_ctx)
 
     @since(1.3)
     def agg(self, *exprs):

From f5db4b416c922db7a8f1b0c098b4f08647106231 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 21 May 2015 17:59:03 -0700
Subject: [PATCH 130/525] [SPARK-7794] [MLLIB] update RegexTokenizer default
 settings

The previous default is `{gaps: false, pattern: "\\p{L}+|[^\\p{L}\\s]+"}`. The default pattern is hard to understand. This PR changes the default to `{gaps: true, pattern: "\\s+"}`. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6330 from mengxr/SPARK-7794 and squashes the following commits:

5ee7cde [Xiangrui Meng] update RegexTokenizer default settings
---
 .../apache/spark/ml/feature/Tokenizer.scala   | 18 +++++----
 .../spark/ml/feature/TokenizerSuite.scala     | 32 +++++++--------
 python/pyspark/ml/feature.py                  | 40 +++++++++----------
 3 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 3f7f4f96fc422..31f3a1aa4c76b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -26,6 +26,8 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
 /**
  * :: AlphaComponent ::
  * A tokenizer that converts the input string to lowercase and then splits it by white spaces.
+ *
+ * @see [[RegexTokenizer]]
  */
 @AlphaComponent
 class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[String], Tokenizer] {
@@ -45,9 +47,9 @@ class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[S
 
 /**
  * :: AlphaComponent ::
- * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default)
- * or using it to split the text (set matching to false). Optional parameters also allow filtering
- * tokens using a minimal length.
+ * A regex based tokenizer that extracts tokens either by using the provided regex pattern to split
+ * the text (default) or repeatedly matching the regex (if `gaps` is true).
+ * Optional parameters also allow filtering tokens using a minimal length.
  * It returns an array of strings that can be empty.
  */
 @AlphaComponent
@@ -71,8 +73,8 @@ class RegexTokenizer(override val uid: String)
   def getMinTokenLength: Int = $(minTokenLength)
 
   /**
-   * Indicates whether regex splits on gaps (true) or matching tokens (false).
-   * Default: false
+   * Indicates whether regex splits on gaps (true) or matches tokens (false).
+   * Default: true
    * @group param
    */
   val gaps: BooleanParam = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens")
@@ -84,8 +86,8 @@ class RegexTokenizer(override val uid: String)
   def getGaps: Boolean = $(gaps)
 
   /**
-   * Regex pattern used by tokenizer.
-   * Default: `"\\p{L}+|[^\\p{L}\\s]+"`
+   * Regex pattern used to match delimiters if [[gaps]] is true or tokens if [[gaps]] is false.
+   * Default: `"\\s+"`
    * @group param
    */
   val pattern: Param[String] = new Param(this, "pattern", "regex pattern used for tokenizing")
@@ -96,7 +98,7 @@ class RegexTokenizer(override val uid: String)
   /** @group getParam */
   def getPattern: String = $(pattern)
 
-  setDefault(minTokenLength -> 1, gaps -> false, pattern -> "\\p{L}+|[^\\p{L}\\s]+")
+  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")
 
   override protected def createTransformFunc: String => Seq[String] = { str =>
     val re = $(pattern).r
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index a46d08d65150f..eabda089d0988 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -29,35 +29,34 @@ case class TokenizerTestData(rawText: String, wantedTokens: Array[String])
 
 class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
   import org.apache.spark.ml.feature.RegexTokenizerSuite._
-  
+
   test("RegexTokenizer") {
-    val tokenizer = new RegexTokenizer()
+    val tokenizer0 = new RegexTokenizer()
+      .setGaps(false)
+      .setPattern("\\w+|\\p{Punct}")
       .setInputCol("rawText")
       .setOutputCol("tokens")
-
     val dataset0 = sqlContext.createDataFrame(Seq(
       TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
       TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
     ))
-    testRegexTokenizer(tokenizer, dataset0)
+    testRegexTokenizer(tokenizer0, dataset0)
 
     val dataset1 = sqlContext.createDataFrame(Seq(
       TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
       TokenizerTestData("Te,st. punct", Array("punct"))
     ))
+    tokenizer0.setMinTokenLength(3)
+    testRegexTokenizer(tokenizer0, dataset1)
 
-    tokenizer.setMinTokenLength(3)
-    testRegexTokenizer(tokenizer, dataset1)
-
-    tokenizer
-      .setPattern("\\s")
-      .setGaps(true)
-      .setMinTokenLength(0)
+    val tokenizer2 = new RegexTokenizer()
+      .setInputCol("rawText")
+      .setOutputCol("tokens")
     val dataset2 = sqlContext.createDataFrame(Seq(
       TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
-      TokenizerTestData("Te,st.  punct", Array("Te,st.", "", "punct"))
+      TokenizerTestData("Te,st.  punct", Array("Te,st.", "punct"))
     ))
-    testRegexTokenizer(tokenizer, dataset2)
+    testRegexTokenizer(tokenizer2, dataset2)
   }
 }
 
@@ -67,9 +66,8 @@ object RegexTokenizerSuite extends FunSuite {
     t.transform(dataset)
       .select("tokens", "wantedTokens")
       .collect()
-      .foreach {
-        case Row(tokens, wantedTokens) =>
-          assert(tokens === wantedTokens)
-    }
+      .foreach { case Row(tokens, wantedTokens) =>
+        assert(tokens === wantedTokens)
+      }
   }
 }
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 5511dceb70419..b0479d9b074db 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -446,23 +446,25 @@ def getDegree(self):
 @ignore_unicode_prefix
 class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     """
-    A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default)
-    or using it to split the text (set matching to false). Optional parameters also allow filtering
-    tokens using a minimal length.
+    A regex based tokenizer that extracts tokens either by using the
+    provided regex pattern (in Java dialect) to split the text
+    (default) or repeatedly matching the regex (if gaps is true).
+    Optional parameters also allow filtering tokens using a minimal
+    length.
     It returns an array of strings that can be empty.
 
-    >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
+    >>> df = sqlContext.createDataFrame([("a b  c",)], ["text"])
     >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
     >>> # Change a parameter.
     >>> reTokenizer.setParams(outputCol="tokens").transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', tokens=[u'a', u'b', u'c'])
     >>> # Temporarily modify a parameter.
     >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', tokens=[u'a', u'b', u'c'])
     >>> # Must use keyword arguments to specify params.
     >>> reTokenizer.setParams("text")
     Traceback (most recent call last):
@@ -472,31 +474,27 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
 
     # a placeholder to make it appear in the generated doc
     minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
-    gaps = Param(Params._dummy(), "gaps", "Set regex to match gaps or tokens")
-    pattern = Param(Params._dummy(), "pattern", "regex pattern used for tokenizing")
+    gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
+    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
 
     @keyword_only
-    def __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+",
-                 inputCol=None, outputCol=None):
+    def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
         """
-        __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", \
-                 inputCol=None, outputCol=None)
+        __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
         """
         super(RegexTokenizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
         self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
-        self.gaps = Param(self, "gaps", "Set regex to match gaps or tokens")
-        self.pattern = Param(self, "pattern", "regex pattern used for tokenizing")
-        self._setDefault(minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+")
+        self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
+        self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
+        self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+",
-                  inputCol=None, outputCol=None):
+    def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
         """
-        setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", \
-                  inputCol="input", outputCol="output")
+        setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
         Sets params for this RegexTokenizer.
         """
         kwargs = self.setParams._input_kwargs

From 85b96372cf0fd055f89fc639f45c1f2cb02a378f Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 21 May 2015 18:04:45 -0700
Subject: [PATCH 131/525] [SPARK-7219] [MLLIB] Output feature attributes in
 HashingTF

This PR updates `HashingTF` to output ML attributes that tell the number of features in the output column. We need to expand `UnaryTransformer` to support output metadata. A `df outputMetadata: Metadata` is not sufficient because the metadata may also depends on the input data. Though this is not true for `HashingTF`, I think it is reasonable to update `UnaryTransformer` in a separate PR. `checkParams` is added to verify common requirements for params. I will send a separate PR to use it in other test suites. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6308 from mengxr/SPARK-7219 and squashes the following commits:

9bd2922 [Xiangrui Meng] address comments
e82a68a [Xiangrui Meng] remove sqlContext from test suite
995535b [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7219
2194703 [Xiangrui Meng] add test for attributes
178ae23 [Xiangrui Meng] update HashingTF with tests
91a6106 [Xiangrui Meng] WIP
---
 .../apache/spark/ml/feature/HashingTF.scala   | 34 +++++++++---
 .../spark/ml/feature/HashingTFSuite.scala     | 55 +++++++++++++++++++
 .../apache/spark/ml/param/ParamsSuite.scala   | 20 +++++++
 3 files changed, 101 insertions(+), 8 deletions(-)
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 30033ced68a04..8942d45219177 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -18,22 +18,31 @@
 package org.apache.spark.ml.feature
 
 import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.UnaryTransformer
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.attribute.AttributeGroup
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.param.{IntParam, ParamValidators}
-import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.{udf, col}
+import org.apache.spark.sql.types.{ArrayType, StructType}
 
 /**
  * :: AlphaComponent ::
  * Maps a sequence of terms to their term frequencies using the hashing trick.
  */
 @AlphaComponent
-class HashingTF(override val uid: String) extends UnaryTransformer[Iterable[_], Vector, HashingTF] {
+class HashingTF(override val uid: String) extends Transformer with HasInputCol with HasOutputCol {
 
   def this() = this(Identifiable.randomUID("hashingTF"))
 
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
   /**
    * Number of features.  Should be > 0.
    * (default = 2^18^)
@@ -50,10 +59,19 @@ class HashingTF(override val uid: String) extends UnaryTransformer[Iterable[_],
   /** @group setParam */
   def setNumFeatures(value: Int): this.type = set(numFeatures, value)
 
-  override protected def createTransformFunc: Iterable[_] => Vector = {
+  override def transform(dataset: DataFrame): DataFrame = {
+    val outputSchema = transformSchema(dataset.schema)
     val hashingTF = new feature.HashingTF($(numFeatures))
-    hashingTF.transform
+    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
+    val metadata = outputSchema($(outputCol)).metadata
+    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
   }
 
-  override protected def outputDataType: DataType = new VectorUDT()
+  override def transformSchema(schema: StructType): StructType = {
+    val inputType = schema($(inputCol)).dataType
+    require(inputType.isInstanceOf[ArrayType],
+      s"The input column must be ArrayType, but got $inputType.")
+    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
+    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
+  }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
new file mode 100644
index 0000000000000..2e4beb0bfff63
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.ml.attribute.AttributeGroup
+import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.util.Utils
+
+class HashingTFSuite extends FunSuite with MLlibTestSparkContext {
+
+  test("params") {
+    val hashingTF = new HashingTF
+    ParamsSuite.checkParams(hashingTF, 3)
+  }
+
+  test("hashingTF") {
+    val df = sqlContext.createDataFrame(Seq(
+      (0, "a a b b c d".split(" ").toSeq)
+    )).toDF("id", "words")
+    val n = 100
+    val hashingTF = new HashingTF()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setNumFeatures(n)
+    val output = hashingTF.transform(df)
+    val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
+    require(attrGroup.numAttributes === Some(n))
+    val features = output.select("features").first().getAs[Vector](0)
+    // Assume perfect hash on "a", "b", "c", and "d".
+    def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n)
+    val expected = Vectors.sparse(n,
+      Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
+    assert(features ~== expected absTol 1e-14)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index b96874f3a8821..d270ad7613af1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -201,3 +201,23 @@ class ParamsSuite extends FunSuite {
     assert(inArray(1) && inArray(2) && !inArray(0))
   }
 }
+
+object ParamsSuite extends FunSuite {
+
+  /**
+   * Checks common requirements for [[Params.params]]: 1) number of params; 2) params are ordered
+   * by names; 3) param parent has the same UID as the object's UID; 4) param name is the same as
+   * the param method name.
+   */
+  def checkParams(obj: Params, expectedNumParams: Int): Unit = {
+    val params = obj.params
+    require(params.length === expectedNumParams,
+      s"Expect $expectedNumParams params but got ${params.length}: ${params.map(_.name).toSeq}.")
+    val paramNames = params.map(_.name)
+    require(paramNames === paramNames.sorted)
+    params.foreach { p =>
+      assert(p.parent === obj.uid)
+      assert(obj.getParam(p.name) === p)
+    }
+  }
+}

From 956c4c910cb536a02128349f2250d0a5f9924d0c Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Thu, 21 May 2015 20:24:28 -0500
Subject: [PATCH 132/525] [SPARK-7657] [YARN] Add driver logs links in
 application UI, in cluster mode.

This PR adds the URLs to the driver logs to `SparkListenerApplicationStarted` event, which is later used by the `ExecutorsListener` to populate the URLs to the driver logs in its own state. This info is then used when the UI is rendered to display links to the logs.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #6166 from harishreedharan/am-log-link and squashes the following commits:

943fc4f [Hari Shreedharan] Merge remote-tracking branch 'asf/master' into am-log-link
9e5c04b [Hari Shreedharan] Merge remote-tracking branch 'asf/master' into am-log-link
b3f9b9d [Hari Shreedharan] Updated comment based on feedback.
0840a95 [Hari Shreedharan] Move the result and sc.stop back to original location, minor import changes.
537a2f7 [Hari Shreedharan] Add test to ensure the log urls are populated and valid.
4033725 [Hari Shreedharan] Adding comments explaining how node reports are used to get the log urls.
6c5c285 [Hari Shreedharan] Import order.
346f4ea [Hari Shreedharan] Review feedback fixes.
629c1dc [Hari Shreedharan] Cleanup.
99fb1a3 [Hari Shreedharan] Send the log urls in App start event, to ensure that other listeners are not affected.
c0de336 [Hari Shreedharan] Ensure new unit test cleans up after itself.
50cdae3 [Hari Shreedharan] Added unit test, made the approach generic.
402e8e4 [Hari Shreedharan] Use `NodeReport` to get the URL for the logs. Also, make the environment variables generic so other cluster managers can use them as well.
1cf338f [Hari Shreedharan] [SPARK-7657][YARN] Add driver link in application UI, in cluster mode.
---
 .../scala/org/apache/spark/SparkContext.scala |  2 +-
 .../spark/scheduler/SchedulerBackend.scala    |  7 ++
 .../spark/scheduler/SparkListener.scala       |  9 ++-
 .../apache/spark/ui/exec/ExecutorsTab.scala   | 12 ++-
 .../org/apache/spark/util/JsonProtocol.scala  |  6 +-
 .../spark/deploy/yarn/YarnRMClient.scala      |  4 +-
 .../deploy/yarn/YarnSparkHadoopUtil.scala     |  7 +-
 .../cluster/YarnClusterSchedulerBackend.scala | 77 ++++++++++++++++++-
 .../spark/deploy/yarn/YarnClusterSuite.scala  | 24 +++++-
 9 files changed, 136 insertions(+), 12 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index cf3820fcb6a35..ad78bdfde2dfb 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1991,7 +1991,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     // Note: this code assumes that the task scheduler has been initialized and has contacted
     // the cluster manager to get an application ID (in case the cluster manager provides one).
     listenerBus.post(SparkListenerApplicationStart(appName, Some(applicationId),
-      startTime, sparkUser, applicationAttemptId))
+      startTime, sparkUser, applicationAttemptId, schedulerBackend.getDriverLogUrls))
   }
 
   /** Post the application end event */
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala
index 646820520ea1b..8801a761afae3 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala
@@ -49,4 +49,11 @@ private[spark] trait SchedulerBackend {
    */
   def applicationAttemptId(): Option[String] = None
 
+  /**
+   * Get the URLs for the driver logs. These URLs are used to display the links in the UI
+   * Executors tab for the driver.
+   * @return Map containing the log names and their respective URLs
+   */
+  def getDriverLogUrls: Option[Map[String, String]] = None
+
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
index 169d4fd3a94f0..863d0befbc19e 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
@@ -110,8 +110,13 @@ case class SparkListenerExecutorMetricsUpdate(
   extends SparkListenerEvent
 
 @DeveloperApi
-case class SparkListenerApplicationStart(appName: String, appId: Option[String],
-   time: Long, sparkUser: String, appAttemptId: Option[String]) extends SparkListenerEvent
+case class SparkListenerApplicationStart(
+    appName: String,
+    appId: Option[String],
+    time: Long,
+    sparkUser: String,
+    appAttemptId: Option[String],
+    driverLogs: Option[Map[String, String]] = None) extends SparkListenerEvent
 
 @DeveloperApi
 case class SparkListenerApplicationEnd(time: Long) extends SparkListenerEvent
diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala
index 0a08b000e2d03..39583af14390d 100644
--- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala
+++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ui.exec
 
 import scala.collection.mutable.HashMap
 
-import org.apache.spark.ExceptionFailure
+import org.apache.spark.{ExceptionFailure, SparkContext}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.scheduler._
 import org.apache.spark.storage.{StorageStatus, StorageStatusListener}
@@ -73,6 +73,16 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener) extends Sp
     uiData.finishReason = Some(executorRemoved.reason)
   }
 
+  override def onApplicationStart(applicationStart: SparkListenerApplicationStart): Unit = {
+    applicationStart.driverLogs.foreach { logs =>
+      val storageStatus = storageStatusList.find { s =>
+        s.blockManagerId.executorId == SparkContext.LEGACY_DRIVER_IDENTIFIER ||
+        s.blockManagerId.executorId == SparkContext.DRIVER_IDENTIFIER
+      }
+      storageStatus.foreach { s => executorToLogUrls(s.blockManagerId.executorId) = logs.toMap }
+    }
+  }
+
   override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = synchronized {
     val eid = taskStart.taskInfo.executorId
     executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 0) + 1
diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
index 3f162d1f6c3eb..adf69a4e78e71 100644
--- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -196,7 +196,8 @@ private[spark] object JsonProtocol {
     ("App ID" -> applicationStart.appId.map(JString(_)).getOrElse(JNothing)) ~
     ("Timestamp" -> applicationStart.time) ~
     ("User" -> applicationStart.sparkUser) ~
-    ("App Attempt ID" -> applicationStart.appAttemptId.map(JString(_)).getOrElse(JNothing))
+    ("App Attempt ID" -> applicationStart.appAttemptId.map(JString(_)).getOrElse(JNothing)) ~
+    ("Driver Logs" -> applicationStart.driverLogs.map(mapToJson).getOrElse(JNothing))
   }
 
   def applicationEndToJson(applicationEnd: SparkListenerApplicationEnd): JValue = {
@@ -570,7 +571,8 @@ private[spark] object JsonProtocol {
     val time = (json \ "Timestamp").extract[Long]
     val sparkUser = (json \ "User").extract[String]
     val appAttemptId = Utils.jsonOption(json \ "App Attempt ID").map(_.extract[String])
-    SparkListenerApplicationStart(appName, appId, time, sparkUser, appAttemptId)
+    val driverLogs = Utils.jsonOption(json \ "Driver Logs").map(mapFromJson)
+    SparkListenerApplicationStart(appName, appId, time, sparkUser, appAttemptId, driverLogs)
   }
 
   def applicationEndFromJson(json: JValue): SparkListenerApplicationEnd = {
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
index b134751366522..ffe71dfd7d257 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
@@ -89,9 +89,7 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg
 
   /** Returns the attempt ID. */
   def getAttemptId(): ApplicationAttemptId = {
-    val containerIdString = System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name())
-    val containerId = ConverterUtils.toContainerId(containerIdString)
-    containerId.getApplicationAttemptId()
+    YarnSparkHadoopUtil.get.getContainerId.getApplicationAttemptId()
   }
 
   /** Returns the configuration for the AmIpFilter to add to the Spark UI. */
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
index ba91872107d0c..5e6531895c7ba 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
@@ -33,7 +33,8 @@ import org.apache.hadoop.security.UserGroupInformation
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.hadoop.yarn.api.ApplicationConstants
 import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
-import org.apache.hadoop.yarn.api.records.{Priority, ApplicationAccessType}
+import org.apache.hadoop.yarn.api.records.{ApplicationAccessType, ContainerId, Priority}
+import org.apache.hadoop.yarn.util.ConverterUtils
 
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
@@ -136,6 +137,10 @@ class YarnSparkHadoopUtil extends SparkHadoopUtil {
     tokenRenewer.foreach(_.stop())
   }
 
+  private[spark] def getContainerId: ContainerId = {
+    val containerIdString = System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name())
+    ConverterUtils.toContainerId(containerIdString)
+  }
 }
 
 object YarnSparkHadoopUtil {
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
index aeb218a575455..1ace1a97d5156 100644
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
+++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
@@ -17,10 +17,19 @@
 
 package org.apache.spark.scheduler.cluster
 
+import java.net.NetworkInterface
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.yarn.api.records.NodeState
+import org.apache.hadoop.yarn.client.api.YarnClient
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+
 import org.apache.spark.SparkContext
+import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil
 import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
 import org.apache.spark.scheduler.TaskSchedulerImpl
-import org.apache.spark.util.IntParam
+import org.apache.spark.util.{IntParam, Utils}
 
 private[spark] class YarnClusterSchedulerBackend(
     scheduler: TaskSchedulerImpl,
@@ -53,4 +62,70 @@ private[spark] class YarnClusterSchedulerBackend(
       logError("Application attempt ID is not set.")
       super.applicationAttemptId
     }
+
+  override def getDriverLogUrls: Option[Map[String, String]] = {
+    var yarnClientOpt: Option[YarnClient] = None
+    var driverLogs: Option[Map[String, String]] = None
+    try {
+      val yarnConf = new YarnConfiguration(sc.hadoopConfiguration)
+      val containerId = YarnSparkHadoopUtil.get.getContainerId
+      yarnClientOpt = Some(YarnClient.createYarnClient())
+      yarnClientOpt.foreach { yarnClient =>
+        yarnClient.init(yarnConf)
+        yarnClient.start()
+
+        // For newer versions of YARN, we can find the HTTP address for a given node by getting a
+        // container report for a given container. But container reports came only in Hadoop 2.4,
+        // so we basically have to get the node reports for all nodes and find the one which runs
+        // this container. For that we have to compare the node's host against the current host.
+        // Since the host can have multiple addresses, we need to compare against all of them to
+        // find out if one matches.
+
+        // Get all the addresses of this node.
+        val addresses =
+          NetworkInterface.getNetworkInterfaces.asScala
+            .flatMap(_.getInetAddresses.asScala)
+            .toSeq
+
+        // Find a node report that matches one of the addresses
+        val nodeReport =
+          yarnClient.getNodeReports(NodeState.RUNNING).asScala.find { x =>
+            val host = x.getNodeId.getHost
+            addresses.exists { address =>
+              address.getHostAddress == host ||
+                address.getHostName == host ||
+                address.getCanonicalHostName == host
+            }
+          }
+
+        // Now that we have found the report for the Node Manager that the AM is running on, we
+        // can get the base HTTP address for the Node manager from the report.
+        // The format used for the logs for each container is well-known and can be constructed
+        // using the NM's HTTP address and the container ID.
+        // The NM may be running several containers, but we can build the URL for the AM using
+        // the AM's container ID, which we already know.
+        nodeReport.foreach { report =>
+          val httpAddress = report.getHttpAddress
+          // lookup appropriate http scheme for container log urls
+          val yarnHttpPolicy = yarnConf.get(
+            YarnConfiguration.YARN_HTTP_POLICY_KEY,
+            YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
+          )
+          val user = Utils.getCurrentUserName()
+          val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
+          val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
+          logDebug(s"Base URL for logs: $baseUrl")
+          driverLogs = Some(
+            Map("stderr" -> s"$baseUrl/stderr?start=0", "stdout" -> s"$baseUrl/stdout?start=0"))
+        }
+      }
+    } catch {
+      case e: Exception =>
+        logInfo("Node Report API is not available in the version of YARN being used, so AM" +
+          " logs link will not appear in application UI", e)
+    } finally {
+      yarnClientOpt.foreach(_.close())
+    }
+    driverLogs
+  }
 }
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
index d3c606e0ed998..dcaeb2e43ff41 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
@@ -23,6 +23,7 @@ import java.util.concurrent.TimeUnit
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable
+import scala.io.Source
 
 import com.google.common.base.Charsets.UTF_8
 import com.google.common.io.ByteStreams
@@ -33,7 +34,8 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}
 
 import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException, TestUtils}
 import org.apache.spark.scheduler.cluster.ExecutorInfo
-import org.apache.spark.scheduler.{SparkListenerJobStart, SparkListener, SparkListenerExecutorAdded}
+import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart,
+  SparkListenerExecutorAdded}
 import org.apache.spark.util.Utils
 
 /**
@@ -290,10 +292,15 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit
 
 private[spark] class SaveExecutorInfo extends SparkListener {
   val addedExecutorInfos = mutable.Map[String, ExecutorInfo]()
+  var driverLogs: Option[collection.Map[String, String]] = None
 
   override def onExecutorAdded(executor: SparkListenerExecutorAdded) {
     addedExecutorInfos(executor.executorId) = executor.executorInfo
   }
+
+  override def onApplicationStart(appStart: SparkListenerApplicationStart): Unit = {
+    driverLogs = appStart.driverLogs
+  }
 }
 
 private object YarnClusterDriver extends Logging with Matchers {
@@ -314,6 +321,7 @@ private object YarnClusterDriver extends Logging with Matchers {
     val sc = new SparkContext(new SparkConf()
       .set("spark.extraListeners", classOf[SaveExecutorInfo].getName)
       .setAppName("yarn \"test app\" 'with quotes' and \\back\\slashes and $dollarSigns"))
+    val conf = sc.getConf
     val status = new File(args(0))
     var result = "failure"
     try {
@@ -335,6 +343,20 @@ private object YarnClusterDriver extends Logging with Matchers {
     executorInfos.foreach { info =>
       assert(info.logUrlMap.nonEmpty)
     }
+
+    // If we are running in yarn-cluster mode, verify that driver logs are downloadable.
+    if (conf.get("spark.master") == "yarn-cluster") {
+      assert(listener.driverLogs.nonEmpty)
+      val driverLogs = listener.driverLogs.get
+      assert(driverLogs.size === 2)
+      assert(driverLogs.containsKey("stderr"))
+      assert(driverLogs.containsKey("stdout"))
+      val stderr = driverLogs("stderr") // YARN puts everything in stderr.
+      val lines = Source.fromURL(stderr).getLines()
+      // Look for a line that contains YarnClusterSchedulerBackend, since that is guaranteed in
+      // cluster mode.
+      assert(lines.exists(_.contains("YarnClusterSchedulerBackend")))
+    }
   }
 
 }

From e4136ea6c457bc74cee312aa14974498ab4633eb Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Thu, 21 May 2015 19:05:04 -0700
Subject: [PATCH 133/525] [DOCS] [MLLIB] Fixing broken link in MLlib Linear
 Methods documentation.

Just a small change: fixed a broken link in the MLlib Linear Methods documentation by removing a newline character between the link title and link address.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6340 from dusenberrymw/Fix_MLlib_Linear_Methods_link and squashes the following commits:

0a57818 [Mike Dusenberry] Fixing broken link in MLlib Linear Methods documentation.
---
 docs/mllib-linear-methods.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md
index 2b2be4d9d0273..8029edca16002 100644
--- a/docs/mllib-linear-methods.md
+++ b/docs/mllib-linear-methods.md
@@ -785,8 +785,7 @@ gradient descent (`stepSize`, `numIterations`, `miniBatchFraction`).  For each o
 all three possible regularizations (none, L1 or L2).
 
 For Logistic Regression, [L-BFGS](api/scala/index.html#org.apache.spark.mllib.optimization.LBFGS)
-version is implemented under [LogisticRegressionWithLBFGS]
-(api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS), and this
+version is implemented under [LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS), and this
 version supports both binary and multinomial Logistic Regression while SGD version only supports
 binary Logistic Regression. However, L-BFGS version doesn't support L1 regularization but SGD one
 supports L1 regularization. When L1 regularization is not required, L-BFGS version is strongly

From 8f11c6116bf8c7246682cbb2d6f27bf0f1531c6d Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 21 May 2015 22:57:33 -0700
Subject: [PATCH 134/525] [SPARK-7535] [.0] [MLLIB] Audit the pipeline APIs for
 1.4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some changes to the pipeilne APIs:

1. Estimator/Transformer/ doesn’t need to extend Params since PipelineStage already does.
1. Move Evaluator to ml.evaluation.
1. Mention larger metric values are better.
1. PipelineModel doc. “compiled” -> “fitted”
1. Hide object PolynomialExpansion.
1. Hide object VectorAssembler.
1. Word2Vec.minCount (and other) -> group param
1. ParamValidators -> DeveloperApi
1. Hide MetadataUtils/SchemaUtils.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6322 from mengxr/SPARK-7535.0 and squashes the following commits:

9e9c7da [Xiangrui Meng] move JavaEvaluator to ml.evaluation as well
e179480 [Xiangrui Meng] move Evaluation to ml.evaluation in PySpark
08ef61f [Xiangrui Meng] update pipieline APIs
---
 .../scala/org/apache/spark/ml/Estimator.scala |  2 +-
 .../scala/org/apache/spark/ml/Pipeline.scala  |  2 +-
 .../org/apache/spark/ml/Transformer.scala     |  2 +-
 .../BinaryClassificationEvaluator.scala       |  2 +-
 .../spark/ml/{ => evaluation}/Evaluator.scala |  4 +-
 .../ml/feature/PolynomialExpansion.scala      |  2 +-
 .../spark/ml/feature/VectorAssembler.scala    |  3 +-
 .../apache/spark/ml/feature/Word2Vec.scala    |  3 +
 .../org/apache/spark/ml/param/params.scala    |  6 +-
 .../spark/ml/tuning/CrossValidator.scala      |  1 +
 .../apache/spark/ml/util/MetadataUtils.scala  |  6 +-
 .../apache/spark/ml/util/SchemaUtils.scala    |  6 +-
 python/pyspark/ml/__init__.py                 |  4 +-
 python/pyspark/ml/evaluation.py               | 63 ++++++++++++++++++-
 python/pyspark/ml/pipeline.py                 | 37 -----------
 python/pyspark/ml/wrapper.py                  | 21 +------
 16 files changed, 84 insertions(+), 80 deletions(-)
 rename mllib/src/main/scala/org/apache/spark/ml/{ => evaluation}/Evaluator.scala (93%)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
index 7f3f3262a644f..9e16e60270141 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.DataFrame
  * Abstract class for estimators that fit models to data.
  */
 @AlphaComponent
-abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
+abstract class Estimator[M <: Model[M]] extends PipelineStage {
 
   /**
    * Fits a single model to the input data with optional parameters.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index fac54188f9f4e..43bee1b770e67 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -170,7 +170,7 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
 
 /**
  * :: AlphaComponent ::
- * Represents a compiled pipeline.
+ * Represents a fitted pipeline.
  */
 @AlphaComponent
 class PipelineModel private[ml] (
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
index d96b54e511e9c..38bb6a5a5391e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types._
  * Abstract class for transformers that transform one dataset into another.
  */
 @AlphaComponent
-abstract class Transformer extends PipelineStage with Params {
+abstract class Transformer extends PipelineStage {
 
   /**
    * Transforms the dataset with optional parameters
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index c1af09c9694ba..ddbdd00ceb159 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.evaluation
 
 import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.Evaluator
+import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
similarity index 93%
rename from mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
rename to mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
index 5f2f8c94e9ff7..cabd1c97c085c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.spark.ml
+package org.apache.spark.ml.evaluation
 
 import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.param.{ParamMap, Params}
@@ -29,7 +29,7 @@ import org.apache.spark.sql.DataFrame
 abstract class Evaluator extends Params {
 
   /**
-   * Evaluates the output.
+   * Evaluates model output and returns a scalar metric (larger is better).
    *
    * @param dataset a dataset that contains labels/observations and predictions.
    * @param paramMap parameter map that specifies the input columns and output metrics
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 41564410e4965..8ddf9d6a1e138 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -75,7 +75,7 @@ class PolynomialExpansion(override val uid: String)
  * To handle sparsity, if c is zero, we can skip all monomials that contain it. We remember the
  * current index and increment it properly for sparse input.
  */
-object PolynomialExpansion {
+private[feature] object PolynomialExpansion {
 
   private def choose(n: Int, k: Int): Int = {
     Range(n, n - k, -1).product / Range(k, 1, -1).product
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 1c0009476908c..181b62f46fce8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -78,8 +78,7 @@ class VectorAssembler(override val uid: String)
   }
 }
 
-@AlphaComponent
-object VectorAssembler {
+private object VectorAssembler {
 
   private[feature] def assemble(vv: Any*): Vector = {
     val indices = ArrayBuilder.make[Int]
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 90f0be76df44f..ed032669229ce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -37,6 +37,7 @@ private[feature] trait Word2VecBase extends Params
 
   /**
    * The dimension of the code that you want to transform from words.
+   * @group param
    */
   final val vectorSize = new IntParam(
     this, "vectorSize", "the dimension of codes after transforming from words")
@@ -47,6 +48,7 @@ private[feature] trait Word2VecBase extends Params
 
   /**
    * Number of partitions for sentences of words.
+   * @group param
    */
   final val numPartitions = new IntParam(
     this, "numPartitions", "number of partitions for sentences of words")
@@ -58,6 +60,7 @@ private[feature] trait Word2VecBase extends Params
   /**
    * The minimum number of times a token must appear to be included in the word2vec model's
    * vocabulary.
+   * @group param
    */
   final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " +
     "appear to be included in the word2vec model's vocabulary")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 94abfcda5cf2a..12fc5b561f76e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -24,7 +24,7 @@ import scala.annotation.varargs
 import scala.collection.mutable
 import scala.collection.JavaConverters._
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
 import org.apache.spark.ml.util.Identifiable
 
 /**
@@ -92,9 +92,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali
 }
 
 /**
+ * :: DeveloperApi ::
  * Factory methods for common validation functions for [[Param.isValid]].
  * The numerical methods only support Int, Long, Float, and Double.
  */
+@DeveloperApi
 object ParamValidators {
 
   /** (private[param]) Default validation always return true */
@@ -529,11 +531,13 @@ trait Params extends Identifiable with Serializable {
 }
 
 /**
+ * :: DeveloperApi ::
  * Java-friendly wrapper for [[Params]].
  * Java developers who need to extend [[Params]] should use this class instead.
  * If you need to extend a abstract class which already extends [[Params]], then that abstract
  * class should be Java-friendly as well.
  */
+@DeveloperApi
 abstract class JavaParams extends Params
 
 /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 5c6ff2dda3604..e21ff94a20f54 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -22,6 +22,7 @@ import com.github.fommil.netlib.F2jBLAS
 import org.apache.spark.Logging
 import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml._
+import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.util.MLUtils
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
index 56075c9a6b39f..2a1db90f2ca2b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
@@ -19,18 +19,14 @@ package org.apache.spark.ml.util
 
 import scala.collection.immutable.HashMap
 
-import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.attribute._
 import org.apache.spark.sql.types.StructField
 
 
 /**
- * :: Experimental ::
- *
  * Helper utilities for tree-based algorithms
  */
-@Experimental
-object MetadataUtils {
+private[spark] object MetadataUtils {
 
   /**
    * Examine a schema to identify the number of classes in a label column.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 11592b77eb356..7cd53c6d7ef79 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -17,15 +17,13 @@
 
 package org.apache.spark.ml.util
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
 
+
 /**
- * :: DeveloperApi ::
  * Utils for handling schemas.
  */
-@DeveloperApi
-object SchemaUtils {
+private[spark] object SchemaUtils {
 
   // TODO: Move the utility methods to SQL.
 
diff --git a/python/pyspark/ml/__init__.py b/python/pyspark/ml/__init__.py
index da793d9db7f91..327a11b14b5aa 100644
--- a/python/pyspark/ml/__init__.py
+++ b/python/pyspark/ml/__init__.py
@@ -15,6 +15,6 @@
 # limitations under the License.
 #
 
-from pyspark.ml.pipeline import Transformer, Estimator, Model, Pipeline, PipelineModel, Evaluator
+from pyspark.ml.pipeline import Transformer, Estimator, Model, Pipeline, PipelineModel
 
-__all__ = ["Transformer", "Estimator", "Model", "Pipeline", "PipelineModel", "Evaluator"]
+__all__ = ["Transformer", "Estimator", "Model", "Pipeline", "PipelineModel"]
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index f4655c513cae7..34e1353def467 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -15,13 +15,72 @@
 # limitations under the License.
 #
 
-from pyspark.ml.wrapper import JavaEvaluator
+from abc import abstractmethod, ABCMeta
+
+from pyspark.ml.wrapper import JavaWrapper
 from pyspark.ml.param import Param, Params
 from pyspark.ml.param.shared import HasLabelCol, HasRawPredictionCol
 from pyspark.ml.util import keyword_only
 from pyspark.mllib.common import inherit_doc
 
-__all__ = ['BinaryClassificationEvaluator']
+__all__ = ['Evaluator', 'BinaryClassificationEvaluator']
+
+
+@inherit_doc
+class Evaluator(Params):
+    """
+    Base class for evaluators that compute metrics from predictions.
+    """
+
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def _evaluate(self, dataset):
+        """
+        Evaluates the output.
+
+        :param dataset: a dataset that contains labels/observations and
+               predictions
+        :return: metric
+        """
+        raise NotImplementedError()
+
+    def evaluate(self, dataset, params={}):
+        """
+        Evaluates the output with optional parameters.
+
+        :param dataset: a dataset that contains labels/observations and
+                        predictions
+        :param params: an optional param map that overrides embedded
+                       params
+        :return: metric
+        """
+        if isinstance(params, dict):
+            if params:
+                return self.copy(params)._evaluate(dataset)
+            else:
+                return self._evaluate(dataset)
+        else:
+            raise ValueError("Params must be a param map but got %s." % type(params))
+
+
+@inherit_doc
+class JavaEvaluator(Evaluator, JavaWrapper):
+    """
+    Base class for :py:class:`Evaluator`s that wrap Java/Scala
+    implementations.
+    """
+
+    __metaclass__ = ABCMeta
+
+    def _evaluate(self, dataset):
+        """
+        Evaluates the output.
+        :param dataset: a dataset that contains labels/observations and predictions.
+        :return: evaluation metric
+        """
+        self._transfer_params_to_java()
+        return self._java_obj.evaluate(dataset._jdf)
 
 
 @inherit_doc
diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py
index 0f38e021273b0..a563024b2cdcb 100644
--- a/python/pyspark/ml/pipeline.py
+++ b/python/pyspark/ml/pipeline.py
@@ -219,40 +219,3 @@ def _transform(self, dataset):
     def copy(self, extra={}):
         stages = [stage.copy(extra) for stage in self.stages]
         return PipelineModel(stages)
-
-
-class Evaluator(Params):
-    """
-    Base class for evaluators that compute metrics from predictions.
-    """
-
-    __metaclass__ = ABCMeta
-
-    @abstractmethod
-    def _evaluate(self, dataset):
-        """
-        Evaluates the output.
-
-        :param dataset: a dataset that contains labels/observations and
-               predictions
-        :return: metric
-        """
-        raise NotImplementedError()
-
-    def evaluate(self, dataset, params={}):
-        """
-        Evaluates the output with optional parameters.
-
-        :param dataset: a dataset that contains labels/observations and
-                        predictions
-        :param params: an optional param map that overrides embedded
-                       params
-        :return: metric
-        """
-        if isinstance(params, dict):
-            if params:
-                return self.copy(params)._evaluate(dataset)
-            else:
-                return self._evaluate(dataset)
-        else:
-            raise ValueError("Params must be a param map but got %s." % type(params))
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index 4419e16184da8..7b0893e2cdadc 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -20,7 +20,7 @@
 from pyspark import SparkContext
 from pyspark.sql import DataFrame
 from pyspark.ml.param import Params
-from pyspark.ml.pipeline import Estimator, Transformer, Evaluator, Model
+from pyspark.ml.pipeline import Estimator, Transformer, Model
 from pyspark.mllib.common import inherit_doc, _java2py, _py2java
 
 
@@ -185,22 +185,3 @@ def _call_java(self, name, *args):
         sc = SparkContext._active_spark_context
         java_args = [_py2java(sc, arg) for arg in args]
         return _java2py(sc, m(*java_args))
-
-
-@inherit_doc
-class JavaEvaluator(Evaluator, JavaWrapper):
-    """
-    Base class for :py:class:`Evaluator`s that wrap Java/Scala
-    implementations.
-    """
-
-    __metaclass__ = ABCMeta
-
-    def _evaluate(self, dataset):
-        """
-        Evaluates the output.
-        :param dataset: a dataset that contains labels/observations and predictions.
-        :return: evaluation metric
-        """
-        self._transfer_params_to_java()
-        return self._java_obj.evaluate(dataset._jdf)

From 2728c3df6690c2fcd4af3bd1c604c98ef6d509a5 Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Thu, 21 May 2015 22:59:45 -0700
Subject: [PATCH 135/525] [SPARK-7578] [ML] [DOC] User guide for spark.ml
 Normalizer, IDF, StandardScaler

Added user guide sections with code examples.
Also added small Java unit tests to test Java example in guide.

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #6127 from jkbradley/feature-guide-2 and squashes the following commits:

cd47f4b [Joseph K. Bradley] Updated based on code review
f16bcec [Joseph K. Bradley] Fixed merge issues and update Python examples print calls for Python 3
0a862f9 [Joseph K. Bradley] Added Normalizer, StandardScaler to ml-features doc, plus small Java unit tests
a21c2d6 [Joseph K. Bradley] Updated ml-features.md with IDF
---
 docs/ml-features.md                           | 224 ++++++++++++++++--
 .../spark/ml/feature/JavaHashingTFSuite.java  |  17 +-
 .../spark/ml/feature/JavaNormalizerSuite.java |  71 ++++++
 .../ml/feature/JavaStandardScalerSuite.java   |  71 ++++++
 4 files changed, 351 insertions(+), 32 deletions(-)
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaNormalizerSuite.java
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaStandardScalerSuite.java

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 06f1ac196b39d..efe9b3b8edb6e 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -18,30 +18,38 @@ This section covers algorithms for working with features, roughly divided into t
 
 # Feature Extractors
 
-## Hashing Term-Frequency (HashingTF)
+## TF-IDF (HashingTF and IDF)
 
-`HashingTF` is a `Transformer` which takes sets of terms (e.g., `String` terms can be sets of words) and converts those sets into fixed-length feature vectors.
-The algorithm combines [Term Frequency (TF)](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) counts with the [hashing trick](http://en.wikipedia.org/wiki/Feature_hashing) for dimensionality reduction.  Please refer to the [MLlib user guide on TF-IDF](mllib-feature-extraction.html#tf-idf) for more details on Term-Frequency.
+[Term Frequency-Inverse Document Frequency (TF-IDF)](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) is a common text pre-processing step.  In Spark ML, TF-IDF is separate into two parts: TF (+hashing) and IDF.
 
-HashingTF is implemented in
-[HashingTF](api/scala/index.html#org.apache.spark.ml.feature.HashingTF).
-In the following code segment, we start with a set of sentences.  We split each sentence into words using `Tokenizer`.  For each sentence (bag of words), we hash it into a feature vector.  This feature vector could then be passed to a learning algorithm.
+**TF**: `HashingTF` is a `Transformer` which takes sets of terms and converts those sets into fixed-length feature vectors.  In text processing, a "set of terms" might be a bag of words.
+The algorithm combines Term Frequency (TF) counts with the [hashing trick](http://en.wikipedia.org/wiki/Feature_hashing) for dimensionality reduction.
+
+**IDF**: `IDF` is an `Estimator` which fits on a dataset and produces an `IDFModel`.  The `IDFModel` takes feature vectors (generally created from `HashingTF`) and scales each column.  Intuitively, it down-weights columns which appear frequently in a corpus.
+
+Please refer to the [MLlib user guide on TF-IDF](mllib-feature-extraction.html#tf-idf) for more details on Term Frequency and Inverse Document Frequency.
+For API details, refer to the [HashingTF API docs](api/scala/index.html#org.apache.spark.ml.feature.HashingTF) and the [IDF API docs](api/scala/index.html#org.apache.spark.ml.feature.IDF).
+
+In the following code segment, we start with a set of sentences.  We split each sentence into words using `Tokenizer`.  For each sentence (bag of words), we use `HashingTF` to hash the sentence into a feature vector.  We use `IDF` to rescale the feature vectors; this generally improves performance when using text as features.  Our feature vectors could then be passed to a learning algorithm.
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
 {% highlight scala %}
-import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
+import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
 
-val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+val sentenceData = sqlContext.createDataFrame(Seq(
   (0, "Hi I heard about Spark"),
   (0, "I wish Java could use case classes"),
   (1, "Logistic regression models are neat")
 )).toDF("label", "sentence")
 val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val wordsDataFrame = tokenizer.transform(sentenceDataFrame)
-val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features").setNumFeatures(20)
-val featurized = hashingTF.transform(wordsDataFrame)
-featurized.select("features", "label").take(3).foreach(println)
+val wordsData = tokenizer.transform(sentenceData)
+val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
+val featurizedData = hashingTF.transform(wordsData)
+val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
+val idfModel = idf.fit(featurizedData)
+val rescaledData = idfModel.transform(featurizedData)
+rescaledData.select("features", "label").take(3).foreach(println)
 {% endhighlight %}
 </div>
 
@@ -51,6 +59,7 @@ import com.google.common.collect.Lists;
 
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.HashingTF;
+import org.apache.spark.ml.feature.IDF;
 import org.apache.spark.ml.feature.Tokenizer;
 import org.apache.spark.mllib.linalg.Vector;
 import org.apache.spark.sql.DataFrame;
@@ -70,16 +79,19 @@ StructType schema = new StructType(new StructField[]{
   new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
   new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
 });
-DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
+DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema);
 Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
-DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
+DataFrame wordsData = tokenizer.transform(sentenceData);
 int numFeatures = 20;
 HashingTF hashingTF = new HashingTF()
   .setInputCol("words")
-  .setOutputCol("features")
+  .setOutputCol("rawFeatures")
   .setNumFeatures(numFeatures);
-DataFrame featurized = hashingTF.transform(wordsDataFrame);
-for (Row r : featurized.select("features", "label").take(3)) {
+DataFrame featurizedData = hashingTF.transform(wordsData);
+IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
+IDFModel idfModel = idf.fit(featurizedData);
+DataFrame rescaledData = idfModel.transform(featurizedData);
+for (Row r : rescaledData.select("features", "label").take(3)) {
   Vector features = r.getAs(0);
   Double label = r.getDouble(1);
   System.out.println(features);
@@ -89,19 +101,22 @@ for (Row r : featurized.select("features", "label").take(3)) {
 
 <div data-lang="python" markdown="1">
 {% highlight python %}
-from pyspark.ml.feature import HashingTF, Tokenizer
+from pyspark.ml.feature import HashingTF, IDF, Tokenizer
 
-sentenceDataFrame = sqlContext.createDataFrame([
+sentenceData = sqlContext.createDataFrame([
   (0, "Hi I heard about Spark"),
   (0, "I wish Java could use case classes"),
   (1, "Logistic regression models are neat")
 ], ["label", "sentence"])
 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
-wordsDataFrame = tokenizer.transform(sentenceDataFrame)
-hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
-featurized = hashingTF.transform(wordsDataFrame)
-for features_label in featurized.select("features", "label").take(3):
-  print features_label
+wordsData = tokenizer.transform(sentenceData)
+hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
+featurizedData = hashingTF.transform(wordsData)
+idf = IDF(inputCol="rawFeatures", outputCol="features")
+idfModel = idf.fit(featurizedData)
+rescaledData = idfModel.transform(featurizedData)
+for features_label in rescaledData.select("features", "label").take(3):
+  print(features_label)
 {% endhighlight %}
 </div>
 </div>
@@ -267,11 +282,12 @@ sentenceDataFrame = sqlContext.createDataFrame([
 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
 wordsDataFrame = tokenizer.transform(sentenceDataFrame)
 for words_label in wordsDataFrame.select("words", "label").take(3):
-  print words_label
+  print(words_label)
 {% endhighlight %}
 </div>
 </div>
 
+
 ## Binarizer
 
 Binarization is the process of thresholding numerical features to binary features. As some probabilistic estimators make assumption that the input data is distributed according to [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution), a binarizer is useful for pre-processing the input data with continuous numerical features.
@@ -352,7 +368,7 @@ binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_fe
 binarizedDataFrame = binarizer.transform(continuousDataFrame)
 binarizedFeatures = binarizedDataFrame.select("binarized_feature")
 for binarized_feature, in binarizedFeatures.collect():
-  print binarized_feature
+  print(binarized_feature)
 {% endhighlight %}
 </div>
 </div>
@@ -618,5 +634,161 @@ indexedData = indexerModel.transform(data)
 </div>
 </div>
 
+
+## Normalizer
+
+`Normalizer` is a `Transformer` which transforms a dataset of `Vector` rows, normalizing each `Vector` to have unit norm.  It takes parameter `p`, which specifies the [p-norm](http://en.wikipedia.org/wiki/Norm_%28mathematics%29#p-norm) used for normalization.  ($p = 2$ by default.)  This normalization can help standardize your input data and improve the behavior of learning algorithms.
+
+The following example demonstrates how to load a dataset in libsvm format and then normalize each row to have unit $L^2$ norm and unit $L^\infty$ norm.
+
+<div class="codetabs">
+<div data-lang="scala">
+{% highlight scala %}
+import org.apache.spark.ml.feature.Normalizer
+import org.apache.spark.mllib.util.MLUtils
+
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+val dataFrame = sqlContext.createDataFrame(data)
+
+// Normalize each Vector using $L^1$ norm.
+val normalizer = new Normalizer()
+  .setInputCol("features")
+  .setOutputCol("normFeatures")
+  .setP(1.0)
+val l1NormData = normalizer.transform(dataFrame)
+
+// Normalize each Vector using $L^\infty$ norm.
+val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
+{% endhighlight %}
+</div>
+
+<div data-lang="java">
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.Normalizer;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.sql.DataFrame;
+
+JavaRDD<LabeledPoint> data =
+  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
+DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+
+// Normalize each Vector using $L^1$ norm.
+Normalizer normalizer = new Normalizer()
+  .setInputCol("features")
+  .setOutputCol("normFeatures")
+  .setP(1.0);
+DataFrame l1NormData = normalizer.transform(dataFrame);
+
+// Normalize each Vector using $L^\infty$ norm.
+DataFrame lInfNormData =
+  normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
+{% endhighlight %}
+</div>
+
+<div data-lang="python">
+{% highlight python %}
+from pyspark.mllib.util import MLUtils
+from pyspark.ml.feature import Normalizer
+
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+dataFrame = sqlContext.createDataFrame(data)
+
+# Normalize each Vector using $L^1$ norm.
+normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
+l1NormData = normalizer.transform(dataFrame)
+
+# Normalize each Vector using $L^\infty$ norm.
+lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+{% endhighlight %}
+</div>
+</div>
+
+
+## StandardScaler
+
+`StandardScaler` transforms a dataset of `Vector` rows, normalizing each feature to have unit standard deviation and/or zero mean.  It takes parameters:
+
+* `withStd`: True by default. Scales the data to unit standard deviation.
+* `withMean`: False by default. Centers the data with mean before scaling. It will build a dense output, so this does not work on sparse input and will raise an exception.
+
+`StandardScaler` is a `Model` which can be `fit` on a dataset to produce a `StandardScalerModel`; this amounts to computing summary statistics.  The model can then transform a `Vector` column in a dataset to have unit standard deviation and/or zero mean features.
+
+Note that if the standard deviation of a feature is zero, it will return default `0.0` value in the `Vector` for that feature.
+
+More details can be found in the API docs for
+[StandardScaler](api/scala/index.html#org.apache.spark.ml.feature.StandardScaler) and
+[StandardScalerModel](api/scala/index.html#org.apache.spark.ml.feature.StandardScalerModel).
+
+The following example demonstrates how to load a dataset in libsvm format and then normalize each feature to have unit standard deviation.
+
+<div class="codetabs">
+<div data-lang="scala">
+{% highlight scala %}
+import org.apache.spark.ml.feature.StandardScaler
+import org.apache.spark.mllib.util.MLUtils
+
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+val dataFrame = sqlContext.createDataFrame(data)
+val scaler = new StandardScaler()
+  .setInputCol("features")
+  .setOutputCol("scaledFeatures")
+  .setWithStd(true)
+  .setWithMean(false)
+
+// Compute summary statistics by fitting the StandardScaler
+val scalerModel = scaler.fit(dataFrame)
+
+// Normalize each feature to have unit standard deviation.
+val scaledData = scalerModel.transform(dataFrame)
+{% endhighlight %}
+</div>
+
+<div data-lang="java">
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StandardScaler;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.sql.DataFrame;
+
+JavaRDD<LabeledPoint> data =
+  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
+DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+StandardScaler scaler = new StandardScaler()
+  .setInputCol("features")
+  .setOutputCol("scaledFeatures")
+  .setWithStd(true)
+  .setWithMean(false);
+
+// Compute summary statistics by fitting the StandardScaler
+StandardScalerModel scalerModel = scaler.fit(dataFrame);
+
+// Normalize each feature to have unit standard deviation.
+DataFrame scaledData = scalerModel.transform(dataFrame);
+{% endhighlight %}
+</div>
+
+<div data-lang="python">
+{% highlight python %}
+from pyspark.mllib.util import MLUtils
+from pyspark.ml.feature import StandardScaler
+
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+dataFrame = sqlContext.createDataFrame(data)
+scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
+                        withStd=True, withMean=False)
+
+# Compute summary statistics by fitting the StandardScaler
+scalerModel = scaler.fit(dataFrame)
+
+# Normalize each feature to have unit standard deviation.
+scaledData = scalerModel.transform(dataFrame)
+{% endhighlight %}
+</div>
+</div>
+
+
 # Feature Selectors
 
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java
index 23463ab5fe848..da2218056307e 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java
@@ -63,17 +63,22 @@ public void hashingTF() {
       new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
       new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
     });
-    DataFrame sentenceDataFrame = jsql.createDataFrame(jrdd, schema);
 
-    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
-    DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
+    DataFrame sentenceData = jsql.createDataFrame(jrdd, schema);
+    Tokenizer tokenizer = new Tokenizer()
+      .setInputCol("sentence")
+      .setOutputCol("words");
+    DataFrame wordsData = tokenizer.transform(sentenceData);
     int numFeatures = 20;
     HashingTF hashingTF = new HashingTF()
       .setInputCol("words")
-      .setOutputCol("features")
+      .setOutputCol("rawFeatures")
       .setNumFeatures(numFeatures);
-    DataFrame featurized = hashingTF.transform(wordsDataFrame);
-    for (Row r : featurized.select("features", "words", "label").take(3)) {
+    DataFrame featurizedData = hashingTF.transform(wordsData);
+    IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
+    IDFModel idfModel = idf.fit(featurizedData);
+    DataFrame rescaledData = idfModel.transform(featurizedData);
+    for (Row r : rescaledData.select("features", "label").take(3)) {
       Vector features = r.getAs(0);
       Assert.assertEquals(features.size(), numFeatures);
     }
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaNormalizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaNormalizerSuite.java
new file mode 100644
index 0000000000000..d82f3b7e8c076
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaNormalizerSuite.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+
+public class JavaNormalizerSuite {
+  private transient JavaSparkContext jsc;
+  private transient SQLContext jsql;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaNormalizerSuite");
+    jsql = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    jsc = null;
+  }
+
+  @Test
+  public void normalizer() {
+    // The tests are to check Java compatibility.
+    List<VectorIndexerSuite.FeatureData> points = Lists.newArrayList(
+      new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)),
+      new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 3.0)),
+      new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 4.0))
+    );
+    DataFrame dataFrame = jsql.createDataFrame(jsc.parallelize(points, 2),
+      VectorIndexerSuite.FeatureData.class);
+    Normalizer normalizer = new Normalizer()
+      .setInputCol("features")
+      .setOutputCol("normFeatures");
+
+    // Normalize each Vector using $L^2$ norm.
+    DataFrame l2NormData = normalizer.transform(dataFrame, normalizer.p().w(2));
+    l2NormData.count();
+
+    // Normalize each Vector using $L^\infty$ norm.
+    DataFrame lInfNormData =
+      normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
+    lInfNormData.count();
+  }
+}
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStandardScalerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStandardScalerSuite.java
new file mode 100644
index 0000000000000..74eb2733f06ef
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStandardScalerSuite.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+
+public class JavaStandardScalerSuite {
+  private transient JavaSparkContext jsc;
+  private transient SQLContext jsql;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaStandardScalerSuite");
+    jsql = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    jsc = null;
+  }
+
+  @Test
+  public void standardScaler() {
+    // The tests are to check Java compatibility.
+    List<VectorIndexerSuite.FeatureData> points = Lists.newArrayList(
+      new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)),
+      new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 3.0)),
+      new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 4.0))
+    );
+    DataFrame dataFrame = jsql.createDataFrame(jsc.parallelize(points, 2),
+      VectorIndexerSuite.FeatureData.class);
+    StandardScaler scaler = new StandardScaler()
+      .setInputCol("features")
+      .setOutputCol("scaledFeatures")
+      .setWithStd(true)
+      .setWithMean(false);
+
+    // Compute summary statistics by fitting the StandardScaler
+    StandardScalerModel scalerModel = scaler.fit(dataFrame);
+
+    // Normalize each feature to have unit standard deviation.
+    DataFrame scaledData = scalerModel.transform(dataFrame);
+    scaledData.count();
+  }
+}

From f6f2eeb17910b5d446dfd61839e37dd698d0860f Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Fri, 22 May 2015 01:00:16 -0700
Subject: [PATCH 136/525] [SPARK-7322][SQL] Window functions in DataFrame

This closes #6104.

Author: Cheng Hao <hao.cheng@intel.com>
Author: Reynold Xin <rxin@databricks.com>

Closes #6343 from rxin/window-df and squashes the following commits:

026d587 [Reynold Xin] Address code review feedback.
dc448fe [Reynold Xin] Fixed Hive tests.
9794d9d [Reynold Xin] Moved Java test package.
9331605 [Reynold Xin] Refactored API.
3313e2a [Reynold Xin] Merge pull request #6104 from chenghao-intel/df_window
d625a64 [Cheng Hao] Update the dataframe window API as suggsted
c141fb1 [Cheng Hao] hide all of properties of the WindowFunctionDefinition
3b1865f [Cheng Hao] scaladoc typos
f3fd2d0 [Cheng Hao] polish the unit test
6847825 [Cheng Hao] Add additional analystcs functions
57e3bc0 [Cheng Hao] typos
24a08ec [Cheng Hao] scaladoc
28222ed [Cheng Hao] fix bug of range/row Frame
1d91865 [Cheng Hao] style issue
53f89f2 [Cheng Hao] remove the over from the functions.scala
964c013 [Cheng Hao] add more unit tests and window functions
64e18a7 [Cheng Hao] Add Window Function support for DataFrame
---
 .../scala/org/apache/spark/sql/Column.scala   |  20 +-
 .../org/apache/spark/sql/DataFrame.scala      |   9 +-
 .../apache/spark/sql/expressions/Window.scala |  81 +++++++
 .../spark/sql/expressions/WindowSpec.scala    | 175 ++++++++++++++
 .../org/apache/spark/sql/functions.scala      | 228 ++++++++++++++++++
 .../spark/sql/hive/JavaDataFrameSuite.java    |  78 ++++++
 .../hive/JavaMetastoreDataSourcesSuite.java   |   4 +-
 .../hive/execution/UDFIntegerToString.java    |   0
 .../sql/hive/execution/UDFListListInt.java    |   0
 .../sql/hive/execution/UDFListString.java     |   0
 .../sql/hive/execution/UDFStringString.java   |   0
 .../sql/hive/execution/UDFTwoListList.java    |   0
 .../sql/hive/HiveDataFrameWindowSuite.scala   | 219 +++++++++++++++++
 13 files changed, 807 insertions(+), 7 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala
 create mode 100644 sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java
 rename sql/hive/src/test/java/{ => test}/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java (98%)
 rename sql/hive/src/test/java/{ => test}/org/apache/spark/sql/hive/execution/UDFIntegerToString.java (100%)
 rename sql/hive/src/test/java/{ => test}/org/apache/spark/sql/hive/execution/UDFListListInt.java (100%)
 rename sql/hive/src/test/java/{ => test}/org/apache/spark/sql/hive/execution/UDFListString.java (100%)
 rename sql/hive/src/test/java/{ => test}/org/apache/spark/sql/hive/execution/UDFStringString.java (100%)
 rename sql/hive/src/test/java/{ => test}/org/apache/spark/sql/hive/execution/UDFTwoListList.java (100%)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index dc0aeea7c4aea..6895aa1010956 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -18,13 +18,13 @@
 package org.apache.spark.sql
 
 import scala.language.implicitConversions
-import scala.collection.JavaConversions._
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.Logging
+import org.apache.spark.sql.expressions.Window
 import org.apache.spark.sql.functions.lit
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.analysis.{MultiAlias, UnresolvedAttribute, UnresolvedStar, UnresolvedExtractValue}
+import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.types._
 
 
@@ -889,6 +889,22 @@ class Column(protected[sql] val expr: Expression) extends Logging {
    */
   def bitwiseXOR(other: Any): Column = BitwiseXor(expr, lit(other).expr)
 
+  /**
+   * Define a windowing column.
+   *
+   * {{{
+   *   val w = Window.partitionBy("name").orderBy("id")
+   *   df.select(
+   *     sum("price").over(w.rangeBetween(Long.MinValue, 2)),
+   *     avg("price").over(w.rowsBetween(0, 4))
+   *   )
+   * }}}
+   *
+   * @group expr_ops
+   * @since 1.4.0
+   */
+  def over(window: expressions.WindowSpec): Column = window.withAggregate(this)
+
 }
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index d78b4c2f8909c..3ec1c4a2f1027 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.analysis.{MultiAlias, ResolvedStar, Unresol
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
-import org.apache.spark.sql.catalyst.{expressions, CatalystTypeConverters, ScalaReflection, SqlParser}
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser}
 import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD}
 import org.apache.spark.sql.json.JacksonGenerator
 import org.apache.spark.sql.sources.CreateTableUsingAsSelect
@@ -411,7 +411,7 @@ class DataFrame private[sql](
         joined.left,
         joined.right,
         joinType = Inner,
-        Some(expressions.EqualTo(
+        Some(catalyst.expressions.EqualTo(
           joined.left.resolve(usingColumn),
           joined.right.resolve(usingColumn))))
     )
@@ -480,8 +480,9 @@ class DataFrame private[sql](
     // By the time we get here, since we have already run analysis, all attributes should've been
     // resolved and become AttributeReference.
     val cond = plan.condition.map { _.transform {
-      case expressions.EqualTo(a: AttributeReference, b: AttributeReference) if a.sameRef(b) =>
-        expressions.EqualTo(plan.left.resolve(a.name), plan.right.resolve(b.name))
+      case catalyst.expressions.EqualTo(a: AttributeReference, b: AttributeReference)
+          if a.sameRef(b) =>
+        catalyst.expressions.EqualTo(plan.left.resolve(a.name), plan.right.resolve(b.name))
     }}
     plan.copy(condition = cond)
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala
new file mode 100644
index 0000000000000..d4003b2d9cbf6
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.expressions
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.catalyst.expressions._
+
+/**
+ * :: Experimental ::
+ * Utility functions for defining window in DataFrames.
+ *
+ * {{{
+ *   // PARTITION BY country ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+ *   Window.partitionBy("country").orderBy("date").rowsBetween(Long.MinValue, 0)
+ *
+ *   // PARTITION BY country ORDER BY date ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING
+ *   Window.partitionBy("country").orderBy("date").rowsBetween(-3, 3)
+ * }}}
+ *
+ * @since 1.4.0
+ */
+@Experimental
+object Window {
+
+  /**
+   * Creates a [[WindowSpec]] with the partitioning defined.
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def partitionBy(colName: String, colNames: String*): WindowSpec = {
+    spec.partitionBy(colName, colNames : _*)
+  }
+
+  /**
+   * Creates a [[WindowSpec]] with the partitioning defined.
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def partitionBy(cols: Column*): WindowSpec = {
+    spec.partitionBy(cols : _*)
+  }
+
+  /**
+   * Creates a [[WindowSpec]] with the ordering defined.
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def orderBy(colName: String, colNames: String*): WindowSpec = {
+    spec.orderBy(colName, colNames : _*)
+  }
+
+  /**
+   * Creates a [[WindowSpec]] with the ordering defined.
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def orderBy(cols: Column*): WindowSpec = {
+    spec.orderBy(cols : _*)
+  }
+
+  private def spec: WindowSpec = {
+    new WindowSpec(Seq.empty, Seq.empty, UnspecifiedFrame)
+  }
+
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala
new file mode 100644
index 0000000000000..c3d2246297021
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.expressions
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.{Column, catalyst}
+import org.apache.spark.sql.catalyst.expressions._
+
+
+/**
+ * :: Experimental ::
+ * A window specification that defines the partitioning, ordering, and frame boundaries.
+ *
+ * Use the static methods in [[Window]] to create a [[WindowSpec]].
+ *
+ * @since 1.4.0
+ */
+@Experimental
+class WindowSpec private[sql](
+    partitionSpec: Seq[Expression],
+    orderSpec: Seq[SortOrder],
+    frame: catalyst.expressions.WindowFrame) {
+
+  /**
+   * Defines the partitioning columns in a [[WindowSpec]].
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def partitionBy(colName: String, colNames: String*): WindowSpec = {
+    partitionBy((colName +: colNames).map(Column(_)): _*)
+  }
+
+  /**
+   * Defines the partitioning columns in a [[WindowSpec]].
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def partitionBy(cols: Column*): WindowSpec = {
+    new WindowSpec(cols.map(_.expr), orderSpec, frame)
+  }
+
+  /**
+   * Defines the ordering columns in a [[WindowSpec]].
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def orderBy(colName: String, colNames: String*): WindowSpec = {
+    orderBy((colName +: colNames).map(Column(_)): _*)
+  }
+
+  /**
+   * Defines the ordering columns in a [[WindowSpec]].
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def orderBy(cols: Column*): WindowSpec = {
+    val sortOrder: Seq[SortOrder] = cols.map { col =>
+      col.expr match {
+        case expr: SortOrder =>
+          expr
+        case expr: Expression =>
+          SortOrder(expr, Ascending)
+      }
+    }
+    new WindowSpec(partitionSpec, sortOrder, frame)
+  }
+
+  /**
+   * Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive).
+   *
+   * Both `start` and `end` are relative positions from the current row. For example, "0" means
+   * "current row", while "-1" means the row before the current row, and "5" means the fifth row
+   * after the current row.
+   *
+   * @param start boundary start, inclusive.
+   *              The frame is unbounded if this is the minimum long value.
+   * @param end boundary end, inclusive.
+   *            The frame is unbounded if this is the maximum long value.
+   * @since 1.4.0
+   */
+  def rowsBetween(start: Long, end: Long): WindowSpec = {
+    between(RowFrame, start, end)
+  }
+
+  /**
+   * Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive).
+   *
+   * Both `start` and `end` are relative from the current row. For example, "0" means "current row",
+   * while "-1" means one off before the current row, and "5" means the five off after the
+   * current row.
+   *
+   * @param start boundary start, inclusive.
+   *              The frame is unbounded if this is the minimum long value.
+   * @param end boundary end, inclusive.
+   *            The frame is unbounded if this is the maximum long value.
+   * @since 1.4.0
+   */
+  def rangeBetween(start: Long, end: Long): WindowSpec = {
+    between(RangeFrame, start, end)
+  }
+
+  private def between(typ: FrameType, start: Long, end: Long): WindowSpec = {
+    val boundaryStart = start match {
+      case 0 => CurrentRow
+      case Long.MinValue => UnboundedPreceding
+      case x if x < 0 => ValuePreceding(-start.toInt)
+      case x if x > 0 => ValueFollowing(start.toInt)
+    }
+
+    val boundaryEnd = end match {
+      case 0 => CurrentRow
+      case Long.MaxValue => UnboundedFollowing
+      case x if x < 0 => ValuePreceding(-end.toInt)
+      case x if x > 0 => ValueFollowing(end.toInt)
+    }
+
+    new WindowSpec(
+      partitionSpec,
+      orderSpec,
+      SpecifiedWindowFrame(typ, boundaryStart, boundaryEnd))
+  }
+
+  /**
+   * Converts this [[WindowSpec]] into a [[Column]] with an aggregate expression.
+   */
+  private[sql] def withAggregate(aggregate: Column): Column = {
+    val windowExpr = aggregate.expr match {
+      case Average(child) => WindowExpression(
+        UnresolvedWindowFunction("avg", child :: Nil),
+        WindowSpecDefinition(partitionSpec, orderSpec, frame))
+      case Sum(child) => WindowExpression(
+        UnresolvedWindowFunction("sum", child :: Nil),
+        WindowSpecDefinition(partitionSpec, orderSpec, frame))
+      case Count(child) => WindowExpression(
+        UnresolvedWindowFunction("count", child :: Nil),
+        WindowSpecDefinition(partitionSpec, orderSpec, frame))
+      case First(child) => WindowExpression(
+        // TODO this is a hack for Hive UDAF first_value
+        UnresolvedWindowFunction("first_value", child :: Nil),
+        WindowSpecDefinition(partitionSpec, orderSpec, frame))
+      case Last(child) => WindowExpression(
+        // TODO this is a hack for Hive UDAF last_value
+        UnresolvedWindowFunction("last_value", child :: Nil),
+        WindowSpecDefinition(partitionSpec, orderSpec, frame))
+      case Min(child) => WindowExpression(
+        UnresolvedWindowFunction("min", child :: Nil),
+        WindowSpecDefinition(partitionSpec, orderSpec, frame))
+      case Max(child) => WindowExpression(
+        UnresolvedWindowFunction("max", child :: Nil),
+        WindowSpecDefinition(partitionSpec, orderSpec, frame))
+      case wf: WindowFunction => WindowExpression(
+        wf,
+        WindowSpecDefinition(partitionSpec, orderSpec, frame))
+      case x =>
+        throw new UnsupportedOperationException(s"$x is not supported in window operation.")
+    }
+    new Column(windowExpr)
+  }
+
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 6640631cf0719..8775be724e0f9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -37,6 +37,7 @@ import org.apache.spark.util.Utils
  * @groupname sort_funcs Sorting functions
  * @groupname normal_funcs Non-aggregate functions
  * @groupname math_funcs Math functions
+ * @groupname window_funcs Window functions
  * @groupname Ungrouped Support functions for DataFrames.
  * @since 1.3.0
  */
@@ -320,6 +321,233 @@ object functions {
    */
   def max(columnName: String): Column = max(Column(columnName))
 
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  // Window functions
+  //////////////////////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * Window function: returns the lag value of current row of the expression,
+   * null when the current row extends before the beginning of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lag(columnName: String): Column = {
+    lag(columnName, 1)
+  }
+
+  /**
+   * Window function: returns the lag value of current row of the column,
+   * null when the current row extends before the beginning of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lag(e: Column): Column = {
+    lag(e, 1)
+  }
+
+  /**
+   * Window function: returns the lag values of current row of the expression,
+   * null when the current row extends before the beginning of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lag(e: Column, count: Int): Column = {
+    lag(e, count, null)
+  }
+
+  /**
+   * Window function: returns the lag values of current row of the column,
+   * null when the current row extends before the beginning of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lag(columnName: String, count: Int): Column = {
+    lag(columnName, count, null)
+  }
+
+  /**
+   * Window function: returns the lag values of current row of the column,
+   * given default value when the current row extends before the beginning
+   * of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lag(columnName: String, count: Int, defaultValue: Any): Column = {
+    lag(Column(columnName), count, defaultValue)
+  }
+
+  /**
+   * Window function: returns the lag values of current row of the expression,
+   * given default value when the current row extends before the beginning
+   * of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lag(e: Column, count: Int, defaultValue: Any): Column = {
+    UnresolvedWindowFunction("lag", e.expr :: Literal(count) :: Literal(defaultValue) :: Nil)
+  }
+
+  /**
+   * Window function: returns the lead value of current row of the column,
+   * null when the current row extends before the end of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lead(columnName: String): Column = {
+    lead(columnName, 1)
+  }
+
+  /**
+   * Window function: returns the lead value of current row of the expression,
+   * null when the current row extends before the end of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lead(e: Column): Column = {
+    lead(e, 1)
+  }
+
+  /**
+   * Window function: returns the lead values of current row of the column,
+   * null when the current row extends before the end of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lead(columnName: String, count: Int): Column = {
+    lead(columnName, count, null)
+  }
+
+  /**
+   * Window function: returns the lead values of current row of the expression,
+   * null when the current row extends before the end of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lead(e: Column, count: Int): Column = {
+    lead(e, count, null)
+  }
+
+  /**
+   * Window function: returns the lead values of current row of the column,
+   * given default value when the current row extends before the end of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lead(columnName: String, count: Int, defaultValue: Any): Column = {
+    lead(Column(columnName), count, defaultValue)
+  }
+
+  /**
+   * Window function: returns the lead values of current row of the expression,
+   * given default value when the current row extends before the end of the window.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def lead(e: Column, count: Int, defaultValue: Any): Column = {
+    UnresolvedWindowFunction("lead", e.expr :: Literal(count) :: Literal(defaultValue) :: Nil)
+  }
+
+  /**
+   * NTILE for specified expression.
+   * NTILE allows easy calculation of tertiles, quartiles, deciles and other
+   * common summary statistics. This function divides an ordered partition into a specified
+   * number of groups called buckets and assigns a bucket number to each row in the partition.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def ntile(e: Column): Column = {
+    UnresolvedWindowFunction("ntile", e.expr :: Nil)
+  }
+
+  /**
+   * NTILE for specified column.
+   * NTILE allows easy calculation of tertiles, quartiles, deciles and other
+   * common summary statistics. This function divides an ordered partition into a specified
+   * number of groups called buckets and assigns a bucket number to each row in the partition.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def ntile(columnName: String): Column = {
+    ntile(Column(columnName))
+  }
+
+  /**
+   * Assigns a unique number (sequentially, starting from 1, as defined by ORDER BY) to each
+   * row within the partition.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def rowNumber(): Column = {
+    UnresolvedWindowFunction("row_number", Nil)
+  }
+
+  /**
+   * The difference between RANK and DENSE_RANK is that DENSE_RANK leaves no gaps in ranking
+   * sequence when there are ties. That is, if you were ranking a competition using DENSE_RANK
+   * and had three people tie for second place, you would say that all three were in second
+   * place and that the next person came in third.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def denseRank(): Column = {
+    UnresolvedWindowFunction("dense_rank", Nil)
+  }
+
+  /**
+   * The difference between RANK and DENSE_RANK is that DENSE_RANK leaves no gaps in ranking
+   * sequence when there are ties. That is, if you were ranking a competition using DENSE_RANK
+   * and had three people tie for second place, you would say that all three were in second
+   * place and that the next person came in third.
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def rank(): Column = {
+    UnresolvedWindowFunction("rank", Nil)
+  }
+
+  /**
+   * CUME_DIST (defined as the inverse of percentile in some statistical books) computes
+   * the position of a specified value relative to a set of values.
+   * To compute the CUME_DIST of a value x in a set S of size N, you use the formula:
+   * CUME_DIST(x) = number of values in S coming before and including x in the specified order / N
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def cumeDist(): Column = {
+    UnresolvedWindowFunction("cume_dist", Nil)
+  }
+
+  /**
+   * PERCENT_RANK is similar to CUME_DIST, but it uses rank values rather than row counts
+   * in its numerator.
+   * The formula:
+   * (rank of row in its partition - 1) / (number of rows in the partition - 1)
+   *
+   * @group window_funcs
+   * @since 1.4.0
+   */
+  def percentRank(): Column = {
+    UnresolvedWindowFunction("percent_rank", Nil)
+  }
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   // Non-aggregate functions
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java
new file mode 100644
index 0000000000000..c4828c4717643
--- /dev/null
+++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.sql.hive;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.hive.HiveContext;
+import org.apache.spark.sql.hive.test.TestHive$;
+
+public class JavaDataFrameSuite {
+  private transient JavaSparkContext sc;
+  private transient HiveContext hc;
+
+  DataFrame df;
+
+  private void checkAnswer(DataFrame actual, List<Row> expected) {
+    String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected);
+    if (errorMessage != null) {
+      Assert.fail(errorMessage);
+    }
+  }
+
+  @Before
+  public void setUp() throws IOException {
+    hc = TestHive$.MODULE$;
+    sc = new JavaSparkContext(hc.sparkContext());
+
+    List<String> jsonObjects = new ArrayList<String>(10);
+    for (int i = 0; i < 10; i++) {
+      jsonObjects.add("{\"key\":" + i + ", \"value\":\"str" + i + "\"}");
+    }
+    df = hc.jsonRDD(sc.parallelize(jsonObjects));
+    df.registerTempTable("window_table");
+  }
+
+  @After
+  public void tearDown() throws IOException {
+    // Clean up tables.
+    hc.sql("DROP TABLE IF EXISTS window_table");
+  }
+
+  @Test
+  public void saveTableAndQueryIt() {
+    checkAnswer(
+      df.select(functions.avg("key").over(
+        Window.partitionBy("value").orderBy("key").rowsBetween(-1, 1))),
+      hc.sql("SELECT avg(key) " +
+        "OVER (PARTITION BY value " +
+        "      ORDER BY key " +
+        "      ROWS BETWEEN 1 preceding and 1 following) " +
+        "FROM window_table").collectAsList());
+  }
+}
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java
similarity index 98%
rename from sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java
rename to sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java
index 58fe96adab17e..64d1ce92931eb 100644
--- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java
+++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java
@@ -14,7 +14,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.spark.sql.hive;
+
+package test.org.apache.spark.sql.hive;
 
 import java.io.File;
 import java.io.IOException;
@@ -36,6 +37,7 @@
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.QueryTest$;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.hive.HiveContext;
 import org.apache.spark.sql.hive.test.TestHive$;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructField;
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFIntegerToString.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFIntegerToString.java
similarity index 100%
rename from sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFIntegerToString.java
rename to sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFIntegerToString.java
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListListInt.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListListInt.java
similarity index 100%
rename from sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListListInt.java
rename to sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListListInt.java
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListString.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListString.java
similarity index 100%
rename from sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListString.java
rename to sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListString.java
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFStringString.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFStringString.java
similarity index 100%
rename from sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFStringString.java
rename to sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFStringString.java
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFTwoListList.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFTwoListList.java
similarity index 100%
rename from sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFTwoListList.java
rename to sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFTwoListList.java
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
new file mode 100644
index 0000000000000..6cea6776c8ca6
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import org.apache.spark.sql.{Row, QueryTest}
+import org.apache.spark.sql.expressions.Window
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.hive.test.TestHive._
+import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+class HiveDataFrameWindowSuite extends QueryTest {
+
+  test("reuse window partitionBy") {
+    val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    val w = Window.partitionBy("key").orderBy("value")
+
+    checkAnswer(
+      df.select(
+        lead("key").over(w),
+        lead("value").over(w)),
+      Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil)
+  }
+
+  test("reuse window orderBy") {
+    val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    val w = Window.orderBy("value").partitionBy("key")
+
+    checkAnswer(
+      df.select(
+        lead("key").over(w),
+        lead("value").over(w)),
+      Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil)
+  }
+
+  test("lead") {
+    val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    df.registerTempTable("window_table")
+
+    checkAnswer(
+      df.select(
+        lead("value").over(Window.partitionBy($"key").orderBy($"value"))),
+      sql(
+        """SELECT
+          | lead(value) OVER (PARTITION BY key ORDER BY value)
+          | FROM window_table""".stripMargin).collect())
+  }
+
+  test("lag") {
+    val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    df.registerTempTable("window_table")
+
+    checkAnswer(
+      df.select(
+        lag("value").over(
+          Window.partitionBy($"key")
+          .orderBy($"value"))),
+      sql(
+        """SELECT
+          | lag(value) OVER (PARTITION BY key ORDER BY value)
+          | FROM window_table""".stripMargin).collect())
+  }
+
+  test("lead with default value") {
+    val df = Seq((1, "1"), (1, "1"), (2, "2"), (1, "1"),
+                 (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    df.registerTempTable("window_table")
+    checkAnswer(
+      df.select(
+        lead("value", 2, "n/a").over(Window.partitionBy("key").orderBy("value"))),
+      sql(
+        """SELECT
+          | lead(value, 2, "n/a") OVER (PARTITION BY key ORDER BY value)
+          | FROM window_table""".stripMargin).collect())
+  }
+
+  test("lag with default value") {
+    val df = Seq((1, "1"), (1, "1"), (2, "2"), (1, "1"),
+                 (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    df.registerTempTable("window_table")
+    checkAnswer(
+      df.select(
+        lag("value", 2, "n/a").over(Window.partitionBy($"key").orderBy($"value"))),
+      sql(
+        """SELECT
+          | lag(value, 2, "n/a") OVER (PARTITION BY key ORDER BY value)
+          | FROM window_table""".stripMargin).collect())
+  }
+
+  test("rank functions in unspecific window") {
+    val df = Seq((1, "1"), (2, "2"), (1, "2"), (2, "2")).toDF("key", "value")
+    df.registerTempTable("window_table")
+    checkAnswer(
+      df.select(
+        $"key",
+        max("key").over(Window.partitionBy("value").orderBy("key")),
+        min("key").over(Window.partitionBy("value").orderBy("key")),
+        mean("key").over(Window.partitionBy("value").orderBy("key")),
+        count("key").over(Window.partitionBy("value").orderBy("key")),
+        sum("key").over(Window.partitionBy("value").orderBy("key")),
+        ntile("key").over(Window.partitionBy("value").orderBy("key")),
+        ntile($"key").over(Window.partitionBy("value").orderBy("key")),
+        rowNumber().over(Window.partitionBy("value").orderBy("key")),
+        denseRank().over(Window.partitionBy("value").orderBy("key")),
+        rank().over(Window.partitionBy("value").orderBy("key")),
+        cumeDist().over(Window.partitionBy("value").orderBy("key")),
+        percentRank().over(Window.partitionBy("value").orderBy("key"))),
+      sql(
+        s"""SELECT
+           |key,
+           |max(key) over (partition by value order by key),
+           |min(key) over (partition by value order by key),
+           |avg(key) over (partition by value order by key),
+           |count(key) over (partition by value order by key),
+           |sum(key) over (partition by value order by key),
+           |ntile(key) over (partition by value order by key),
+           |ntile(key) over (partition by value order by key),
+           |row_number() over (partition by value order by key),
+           |dense_rank() over (partition by value order by key),
+           |rank() over (partition by value order by key),
+           |cume_dist() over (partition by value order by key),
+           |percent_rank() over (partition by value order by key)
+           |FROM window_table""".stripMargin).collect())
+  }
+
+  test("aggregation and rows between") {
+    val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    df.registerTempTable("window_table")
+    checkAnswer(
+      df.select(
+        avg("key").over(Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 2))),
+      sql(
+        """SELECT
+          | avg(key) OVER
+          |   (PARTITION BY value ORDER BY key ROWS BETWEEN 1 preceding and 2 following)
+          | FROM window_table""".stripMargin).collect())
+  }
+
+  test("aggregation and range betweens") {
+    val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    df.registerTempTable("window_table")
+    checkAnswer(
+      df.select(
+        avg("key").over(Window.partitionBy($"value").orderBy($"key").rangeBetween(-1, 1))),
+      sql(
+        """SELECT
+          | avg(key) OVER
+          |   (PARTITION BY value ORDER BY key RANGE BETWEEN 1 preceding and 1 following)
+          | FROM window_table""".stripMargin).collect())
+  }
+
+  test("aggregation and rows betweens with unbounded") {
+    val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value")
+    df.registerTempTable("window_table")
+    checkAnswer(
+      df.select(
+        $"key",
+        last("value").over(
+          Window.partitionBy($"value").orderBy($"key").rowsBetween(0, Long.MaxValue)),
+        last("value").over(
+          Window.partitionBy($"value").orderBy($"key").rowsBetween(Long.MinValue, 0)),
+        last("value").over(Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 3))),
+      sql(
+        """SELECT
+          | key,
+          | last_value(value) OVER
+          |   (PARTITION BY value ORDER BY key ROWS between current row and unbounded following),
+          | last_value(value) OVER
+          |   (PARTITION BY value ORDER BY key ROWS between unbounded preceding and current row),
+          | last_value(value) OVER
+          |   (PARTITION BY value ORDER BY key ROWS between 1 preceding and 3 following)
+          | FROM window_table""".stripMargin).collect())
+  }
+
+  test("aggregation and range betweens with unbounded") {
+    val df = Seq((1, "1"), (2, "2"), (2, "2"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    df.registerTempTable("window_table")
+    checkAnswer(
+      df.select(
+        $"key",
+        last("value").over(
+          Window.partitionBy($"value").orderBy($"key").rangeBetween(1, Long.MaxValue))
+          .equalTo("2")
+          .as("last_v"),
+        avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(Long.MinValue, 1))
+          .as("avg_key1"),
+        avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(0, Long.MaxValue))
+          .as("avg_key2"),
+        avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(-1, 0))
+          .as("avg_key3")
+      ),
+      sql(
+        """SELECT
+          | key,
+          | last_value(value) OVER
+          |   (PARTITION BY value ORDER BY key RANGE 1 preceding) == "2",
+          | avg(key) OVER
+          |   (PARTITION BY value ORDER BY key RANGE BETWEEN unbounded preceding and 1 following),
+          | avg(key) OVER
+          |   (PARTITION BY value ORDER BY key RANGE BETWEEN current row and unbounded following),
+          | avg(key) OVER
+          |   (PARTITION BY value ORDER BY key RANGE BETWEEN 1 preceding and current row)
+          | FROM window_table""".stripMargin).collect())
+  }
+}

From 4e5220c3171b6a2f4970409bd16be2db930df65d Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Fri, 22 May 2015 16:25:52 +0800
Subject: [PATCH 137/525] [MINOR] [SQL] Ignores Thrift server UISeleniumSuite

This Selenium test case has been flaky for a while and led to frequent Jenkins build failure. Let's disable it temporarily until we figure out a proper solution.

Author: Cheng Lian <lian@databricks.com>

Closes #6345 from liancheng/ignore-selenium-test and squashes the following commits:

09996fe [Cheng Lian] Ignores Thrift server UISeleniumSuite
---
 .../hive/thriftserver/UISeleniumSuite.scala   | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
index 47541015a3611..a286dc5825f77 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
@@ -17,21 +17,18 @@
 
 package org.apache.spark.sql.hive.thriftserver
 
-
-
 import scala.util.Random
 
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.openqa.selenium.WebDriver
 import org.openqa.selenium.htmlunit.HtmlUnitDriver
-import org.scalatest.{Matchers, BeforeAndAfterAll}
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.selenium.WebBrowser
 import org.scalatest.time.SpanSugar._
+import org.scalatest.{BeforeAndAfterAll, Matchers}
 
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.spark.sql.hive.HiveContext
 
-
 class UISeleniumSuite
   extends HiveThriftJdbcTest
   with WebBrowser with Matchers with BeforeAndAfterAll {
@@ -75,9 +72,9 @@ class UISeleniumSuite
      """.stripMargin.split("\\s+").toSeq
   }
 
-  test("thrift server ui test") {
+  ignore("thrift server ui test") {
     withJdbcStatement(statement =>{
-      val baseURL = s"http://localhost:${uiPort}"
+      val baseURL = s"http://localhost:$uiPort"
 
       val queries = Seq(
         "CREATE TABLE test_map(key INT, value STRING)",
@@ -86,14 +83,14 @@ class UISeleniumSuite
       queries.foreach(statement.execute)
 
       eventually(timeout(10 seconds), interval(50 milliseconds)) {
-        go to (baseURL)
-        find(cssSelector("""ul li a[href*="ThriftServer"]""")) should not be(None)
+        go to baseURL
+        find(cssSelector("""ul li a[href*="ThriftServer"]""")) should not be None
       }
 
       eventually(timeout(10 seconds), interval(50 milliseconds)) {
         go to (baseURL + "/ThriftServer")
-        find(id("sessionstat")) should not be(None)
-        find(id("sqlstat")) should not be(None)
+        find(id("sessionstat")) should not be None
+        find(id("sqlstat")) should not be None
 
         // check whether statements exists
         queries.foreach { line =>

From 3b68cb0430067059e9c7b9a86dbea4865e29bf78 Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Fri, 22 May 2015 09:43:46 -0700
Subject: [PATCH 138/525] [SPARK-6743] [SQL] Fix empty projections of cached
 data

Author: Michael Armbrust <michael@databricks.com>

Closes #6165 from marmbrus/wrongColumn and squashes the following commits:

4fad158 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into wrongColumn
aad7eab [Michael Armbrust] rxins comments
f1e8df1 [Michael Armbrust] [SPARK-6743][SQL] Fix empty projections of cached data
---
 project/SparkBuild.scala                        |  1 +
 .../main/scala/org/apache/spark/sql/Row.scala   |  3 +++
 .../columnar/InMemoryColumnarTableScan.scala    |  2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala    | 17 +++++++++++++++--
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 1b87e4e98bd83..b9515a12bc573 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -324,6 +324,7 @@ object Hive {
         |import org.apache.spark.sql.functions._
         |import org.apache.spark.sql.hive._
         |import org.apache.spark.sql.hive.test.TestHive._
+        |import org.apache.spark.sql.hive.test.TestHive.implicits._
         |import org.apache.spark.sql.types._""".stripMargin,
     cleanupCommands in console := "sparkContext.stop()",
     // Some of our log4j jars make it impossible to submit jobs from this JVM to Hive Map/Reduce
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 4190b7ffe1c8f..0d460b634d9b0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -55,6 +55,9 @@ object Row {
     // TODO: Improve the performance of this if used in performance critical part.
     new GenericRow(rows.flatMap(_.toSeq).toArray)
   }
+
+  /** Returns an empty row. */
+  val empty = apply()
 }
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
index 0ded1cce68391..a59d42cdd6028 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
@@ -314,7 +314,7 @@ private[sql] case class InMemoryColumnarTableScan(
                 columnAccessors(i).extractTo(nextRow, i)
                 i += 1
               }
-              nextRow
+              if (attributes.isEmpty) Row.empty else nextRow
             }
 
             override def hasNext: Boolean = columnAccessors(0).hasNext
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index c5c4f448a7224..7c47fe454b6dc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -39,6 +39,19 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   import org.apache.spark.sql.test.TestSQLContext.implicits._
   val sqlCtx = TestSQLContext
 
+  test("SPARK-6743: no columns from cache") {
+    Seq(
+      (83, 0, 38),
+      (26, 0, 79),
+      (43, 81, 24)
+    ).toDF("a", "b", "c").registerTempTable("cachedData")
+
+    cacheTable("cachedData")
+    checkAnswer(
+      sql("SELECT t1.b FROM cachedData, cachedData t1 GROUP BY t1.b"),
+      Row(0) :: Row(81) :: Nil)
+  }
+
   test("self join with aliases") {
     Seq(1,2,3).map(i => (i, i.toString)).toDF("int", "str").registerTempTable("df")
 
@@ -142,7 +155,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
       sql("SELECT ABS(2.5)"),
       Row(2.5))
   }
-  
+
   test("aggregation with codegen") {
     val originalValue = conf.codegenEnabled
     setConf(SQLConf.CODEGEN_ENABLED, "true")
@@ -194,7 +207,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
       "SELECT value, sum(key) FROM testData3x GROUP BY value",
       (1 to 100).map(i => Row(i.toString, 3 * i)))
     testCodeGen(
-      "SELECT sum(key), SUM(CAST(key as Double)) FROM testData3x",      
+      "SELECT sum(key), SUM(CAST(key as Double)) FROM testData3x",
       Row(5050 * 3, 5050 * 3.0) :: Nil)
     // AVERAGE
     testCodeGen(

From f490b3b4c706c92aa65d000b9d885f4d160a5f39 Mon Sep 17 00:00:00 2001
From: Ram Sriharsha <rsriharsha@hw11853.local>
Date: Fri, 22 May 2015 09:59:44 -0700
Subject: [PATCH 139/525] [SPARK-7404] [ML] Add RegressionEvaluator to spark.ml

Author: Ram Sriharsha <rsriharsha@hw11853.local>

Closes #6344 from harsha2010/SPARK-7404 and squashes the following commits:

16b9d77 [Ram Sriharsha] consistent naming
7f100b6 [Ram Sriharsha] cleanup
c46044d [Ram Sriharsha] Merge with Master + Code Review Fixes
188fa0a [Ram Sriharsha] Merge branch 'master' into SPARK-7404
f5b6a4c [Ram Sriharsha] cleanup doc
97beca5 [Ram Sriharsha] update test to use R packages
32dd310 [Ram Sriharsha] fix indentation
f93b812 [Ram Sriharsha] fix test
1b6ebb3 [Ram Sriharsha] [SPARK-7404][ml] Add RegressionEvaluator to spark.ml
---
 .../ml/evaluation/RegressionEvaluator.scala   | 84 +++++++++++++++++++
 .../evaluation/RegressionEvaluatorSuite.scala | 71 ++++++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
new file mode 100644
index 0000000000000..ec493f8f1b504
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.evaluation
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.param.{Param, ParamValidators}
+import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.mllib.evaluation.RegressionMetrics
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.types.DoubleType
+
+/**
+ * :: AlphaComponent ::
+ *
+ * Evaluator for regression, which expects two input columns: prediction and label.
+ */
+@AlphaComponent
+class RegressionEvaluator(override val uid: String)
+  extends Evaluator with HasPredictionCol with HasLabelCol {
+
+  def this() = this(Identifiable.randomUID("regEval"))
+
+  /**
+   * param for metric name in evaluation
+   * @group param
+   */
+  val metricName: Param[String] = {
+    val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae"))
+    new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae)", allowedParams)
+  }
+
+  /** @group getParam */
+  def getMetricName: String = $(metricName)
+
+  /** @group setParam */
+  def setMetricName(value: String): this.type = set(metricName, value)
+
+  /** @group setParam */
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+  /** @group setParam */
+  def setLabelCol(value: String): this.type = set(labelCol, value)
+
+  setDefault(metricName -> "rmse")
+
+  override def evaluate(dataset: DataFrame): Double = {
+    val schema = dataset.schema
+    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
+    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
+
+    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
+      .map { case Row(prediction: Double, label: Double) =>
+        (prediction, label)
+      }
+    val metrics = new RegressionMetrics(predictionAndLabels)
+    val metric = $(metricName) match {
+      case "rmse" =>
+        metrics.rootMeanSquaredError
+      case "mse" =>
+        metrics.meanSquaredError
+      case "r2" =>
+        metrics.r2
+      case "mae" =>
+        metrics.meanAbsoluteError
+    }
+    metric
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
new file mode 100644
index 0000000000000..983f8b460b9c0
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.evaluation
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
+import org.apache.spark.mllib.util.TestingUtils._
+
+class RegressionEvaluatorSuite extends FunSuite with MLlibTestSparkContext {
+
+  test("Regression Evaluator: default params") {
+    /**
+     * Here is the instruction describing how to export the test data into CSV format
+     * so we can validate the metrics compared with R's mmetric package.
+     *
+     * import org.apache.spark.mllib.util.LinearDataGenerator
+     * val data = sc.parallelize(LinearDataGenerator.generateLinearInput(6.3,
+     *   Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1))
+     * data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1))
+     *   .saveAsTextFile("path")
+     */
+    val dataset = sqlContext.createDataFrame(
+      sc.parallelize(LinearDataGenerator.generateLinearInput(
+        6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
+    /**
+     * Using the following R code to load the data, train the model and evaluate metrics.
+     *
+     * > library("glmnet")
+     * > library("rminer")
+     * > data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE)
+     * > features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3)))
+     * > label <- as.numeric(data$V1)
+     * > model <- glmnet(features, label, family="gaussian", alpha = 0, lambda = 0)
+     * > rmse <- mmetric(label, predict(model, features), metric='RMSE')
+     * > mae <- mmetric(label, predict(model, features), metric='MAE')
+     * > r2 <- mmetric(label, predict(model, features), metric='R2')
+     */
+    val trainer = new LinearRegression
+    val model = trainer.fit(dataset)
+    val predictions = model.transform(dataset)
+
+    // default = rmse
+    val evaluator = new RegressionEvaluator()
+    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)
+
+    // r2 score
+    evaluator.setMetricName("r2")
+    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)
+
+    // mae
+    evaluator.setMetricName("mae")
+    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
+  }
+}

From c63036cd475cfc26093c296ca1be13802c51093a Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Fri, 22 May 2015 10:04:45 -0700
Subject: [PATCH 140/525] Revert "[BUILD] Always run SQL tests in master
 build."

This reverts commit 147b6be3b6d464dfc14836c08e690ab021a600de.
---
 dev/run-tests         | 41 +++++++++++++++++------------------------
 dev/run-tests-jenkins |  2 --
 2 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/dev/run-tests b/dev/run-tests
index b444e74706b65..44d802782c4a4 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -82,31 +82,24 @@ export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl"
 if [ -n "$AMPLAB_JENKINS" ]; then
   git fetch origin master:master
 
-  # AMP_JENKINS_PRB indicates if the current build is a pull request build.
-  if [ -n "$AMP_JENKINS_PRB" ]; then
-    # It is a pull request build.
-    sql_diffs=$(
-      git diff --name-only master \
-      | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
-    )
-
-    non_sql_diffs=$(
-      git diff --name-only master \
-      | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
-    )
-
-    if [ -n "$sql_diffs" ]; then
-      echo "[info] Detected changes in SQL. Will run Hive test suite."
-      _RUN_SQL_TESTS=true
-
-      if [ -z "$non_sql_diffs" ]; then
-        echo "[info] Detected no changes except in SQL. Will only run SQL tests."
-        _SQL_TESTS_ONLY=true
-      fi
-    fi
-  else
-    # It is a regular build. We should run SQL tests.
+  sql_diffs=$(
+    git diff --name-only master \
+    | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
+  )
+
+  non_sql_diffs=$(
+    git diff --name-only master \
+    | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
+  )
+
+  if [ -n "$sql_diffs" ]; then
+    echo "[info] Detected changes in SQL. Will run Hive test suite."
     _RUN_SQL_TESTS=true
+
+    if [ -z "$non_sql_diffs" ]; then
+      echo "[info] Detected no changes except in SQL. Will only run SQL tests."
+      _SQL_TESTS_ONLY=true
+    fi
   fi
 fi
 
diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins
index 8b2a44fd72ba5..f452ab66efcd8 100755
--- a/dev/run-tests-jenkins
+++ b/dev/run-tests-jenkins
@@ -185,8 +185,6 @@ done
 
 # run tests
 {
-  # Marks this build is a pull request build.
-  export AMP_JENKINS_PRB=true
   timeout "${TESTS_TIMEOUT}" ./dev/run-tests
   test_result="$?"
 

From 509d55ab416359fab0525189458e2ea96379cf14 Mon Sep 17 00:00:00 2001
From: Ram Sriharsha <rsriharsha@hw11853.local>
Date: Fri, 22 May 2015 13:18:08 -0700
Subject: [PATCH 141/525] [SPARK-7574] [ML] [DOC] User guide for OneVsRest

Including Iris Dataset (after shuffling and relabeling 3 -> 0 to confirm to 0 -> numClasses-1 labeling). Could not find an existing dataset in data/mllib for multiclass classification.

Author: Ram Sriharsha <rsriharsha@hw11853.local>

Closes #6296 from harsha2010/SPARK-7574 and squashes the following commits:

645427c [Ram Sriharsha] cleanup
46c41b1 [Ram Sriharsha] cleanup
2f76295 [Ram Sriharsha] Code Review Fixes
ebdf103 [Ram Sriharsha] Java Example
c026613 [Ram Sriharsha] Code Review fixes
4b7d1a6 [Ram Sriharsha] minor cleanup
13bed9c [Ram Sriharsha] add wikipedia link
bb9dbfa [Ram Sriharsha] Clean up naming
6f90db1 [Ram Sriharsha] [SPARK-7574][ml][doc] User guide for OneVsRest
---
 .../sample_multiclass_classification_data.txt | 150 ++++++++++++++++++
 docs/ml-ensembles.md                          | 129 +++++++++++++++
 docs/ml-guide.md                              |   3 +-
 3 files changed, 281 insertions(+), 1 deletion(-)
 create mode 100644 data/mllib/sample_multiclass_classification_data.txt
 create mode 100644 docs/ml-ensembles.md

diff --git a/data/mllib/sample_multiclass_classification_data.txt b/data/mllib/sample_multiclass_classification_data.txt
new file mode 100644
index 0000000000000..a0d7f90113919
--- /dev/null
+++ b/data/mllib/sample_multiclass_classification_data.txt
@@ -0,0 +1,150 @@
+1 1:-0.222222 2:0.5 3:-0.762712 4:-0.833333 
+1 1:-0.555556 2:0.25 3:-0.864407 4:-0.916667 
+1 1:-0.722222 2:-0.166667 3:-0.864407 4:-0.833333 
+1 1:-0.722222 2:0.166667 3:-0.694915 4:-0.916667 
+0 1:0.166667 2:-0.416667 3:0.457627 4:0.5 
+1 1:-0.833333 3:-0.864407 4:-0.916667 
+2 1:-1.32455e-07 2:-0.166667 3:0.220339 4:0.0833333 
+2 1:-1.32455e-07 2:-0.333333 3:0.0169491 4:-4.03573e-08 
+1 1:-0.5 2:0.75 3:-0.830508 4:-1 
+0 1:0.611111 3:0.694915 4:0.416667 
+0 1:0.222222 2:-0.166667 3:0.423729 4:0.583333 
+1 1:-0.722222 2:-0.166667 3:-0.864407 4:-1 
+1 1:-0.5 2:0.166667 3:-0.864407 4:-0.916667 
+2 1:-0.222222 2:-0.333333 3:0.0508474 4:-4.03573e-08 
+2 1:-0.0555556 2:-0.833333 3:0.0169491 4:-0.25 
+2 1:-0.166667 2:-0.416667 3:-0.0169491 4:-0.0833333 
+1 1:-0.944444 3:-0.898305 4:-0.916667 
+2 1:-0.277778 2:-0.583333 3:-0.0169491 4:-0.166667 
+0 1:0.111111 2:-0.333333 3:0.38983 4:0.166667 
+2 1:-0.222222 2:-0.166667 3:0.0847457 4:-0.0833333 
+0 1:0.166667 2:-0.333333 3:0.559322 4:0.666667 
+1 1:-0.611111 2:0.0833333 3:-0.864407 4:-0.916667 
+2 1:-0.333333 2:-0.583333 3:0.0169491 4:-4.03573e-08 
+0 1:0.555555 2:-0.166667 3:0.661017 4:0.666667 
+2 1:0.166667 3:0.186441 4:0.166667 
+2 1:0.111111 2:-0.75 3:0.152542 4:-4.03573e-08 
+2 1:0.166667 2:-0.25 3:0.118644 4:-4.03573e-08 
+0 1:-0.0555556 2:-0.833333 3:0.355932 4:0.166667 
+0 1:-0.277778 2:-0.333333 3:0.322034 4:0.583333 
+2 1:-0.222222 2:-0.5 3:-0.152542 4:-0.25 
+2 1:-0.111111 3:0.288136 4:0.416667 
+2 1:-0.0555556 2:-0.25 3:0.186441 4:0.166667 
+2 1:0.333333 2:-0.166667 3:0.355932 4:0.333333 
+1 1:-0.611111 2:0.25 3:-0.898305 4:-0.833333 
+0 1:0.166667 2:-0.333333 3:0.559322 4:0.75 
+0 1:0.111111 2:-0.25 3:0.559322 4:0.416667 
+0 1:0.833333 2:-0.166667 3:0.898305 4:0.666667 
+2 1:-0.277778 2:-0.166667 3:0.186441 4:0.166667 
+0 1:-0.666667 2:-0.583333 3:0.186441 4:0.333333 
+1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 
+1 1:-0.166667 2:0.666667 3:-0.932203 4:-0.916667 
+0 1:0.0555554 2:-0.333333 3:0.288136 4:0.416667 
+1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 
+1 1:-0.833333 2:0.166667 3:-0.864407 4:-0.833333 
+0 1:0.0555554 2:0.166667 3:0.491525 4:0.833333 
+0 1:0.722222 2:-0.333333 3:0.728813 4:0.5 
+2 1:-0.166667 2:-0.416667 3:0.0508474 4:-0.25 
+2 1:0.5 3:0.254237 4:0.0833333 
+0 1:0.111111 2:-0.583333 3:0.355932 4:0.5 
+1 1:-0.944444 2:-0.166667 3:-0.898305 4:-0.916667 
+2 1:0.277778 2:-0.25 3:0.220339 4:-4.03573e-08 
+0 1:0.666667 2:-0.25 3:0.79661 4:0.416667 
+0 1:0.111111 2:0.0833333 3:0.694915 4:1 
+0 1:0.444444 3:0.59322 4:0.833333 
+2 1:-0.0555556 2:0.166667 3:0.186441 4:0.25 
+1 1:-0.833333 2:0.333333 3:-1 4:-0.916667 
+1 1:-0.555556 2:0.416667 3:-0.830508 4:-0.75 
+2 1:-0.333333 2:-0.5 3:0.152542 4:-0.0833333 
+1 1:-1 2:-0.166667 3:-0.966102 4:-1 
+1 1:-0.333333 2:0.25 3:-0.898305 4:-0.916667 
+2 1:0.388889 2:-0.333333 3:0.288136 4:0.0833333 
+2 1:0.277778 2:-0.166667 3:0.152542 4:0.0833333 
+0 1:0.333333 2:0.0833333 3:0.59322 4:0.666667 
+1 1:-0.777778 3:-0.79661 4:-0.916667 
+1 1:-0.444444 2:0.416667 3:-0.830508 4:-0.916667 
+0 1:0.222222 2:-0.166667 3:0.627119 4:0.75 
+1 1:-0.555556 2:0.5 3:-0.79661 4:-0.916667 
+1 1:-0.555556 2:0.5 3:-0.694915 4:-0.75 
+2 1:-1.32455e-07 2:-0.25 3:0.254237 4:0.0833333 
+1 1:-0.5 2:0.25 3:-0.830508 4:-0.916667 
+0 1:0.166667 3:0.457627 4:0.833333 
+2 1:0.444444 2:-0.0833334 3:0.322034 4:0.166667 
+0 1:0.111111 2:0.166667 3:0.559322 4:0.916667 
+1 1:-0.611111 2:0.25 3:-0.79661 4:-0.583333 
+0 1:0.388889 3:0.661017 4:0.833333 
+1 1:-0.722222 2:0.166667 3:-0.79661 4:-0.916667 
+1 1:-0.722222 2:-0.0833334 3:-0.79661 4:-0.916667 
+1 1:-0.555556 2:0.166667 3:-0.830508 4:-0.916667 
+2 1:-0.666667 2:-0.666667 3:-0.220339 4:-0.25 
+2 1:-0.611111 2:-0.75 3:-0.220339 4:-0.25 
+2 1:0.0555554 2:-0.833333 3:0.186441 4:0.166667 
+0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5 
+0 1:0.611111 2:0.333333 3:0.728813 4:1 
+2 1:0.0555554 2:-0.25 3:0.118644 4:-4.03573e-08 
+1 1:-0.666667 2:-0.166667 3:-0.864407 4:-0.916667 
+1 1:-0.833333 2:-0.0833334 3:-0.830508 4:-0.916667 
+0 1:0.611111 2:-0.166667 3:0.627119 4:0.25 
+0 1:0.888889 2:0.5 3:0.932203 4:0.75 
+2 1:0.222222 2:-0.333333 3:0.220339 4:0.166667 
+1 1:-0.555556 2:0.25 3:-0.864407 4:-0.833333 
+0 1:-1.32455e-07 2:-0.166667 3:0.322034 4:0.416667 
+0 1:-1.32455e-07 2:-0.5 3:0.559322 4:0.0833333 
+1 1:-0.611111 3:-0.932203 4:-0.916667 
+1 1:-0.333333 2:0.833333 3:-0.864407 4:-0.916667 
+0 1:-0.166667 2:-0.333333 3:0.38983 4:0.916667 
+2 1:-0.333333 2:-0.666667 3:-0.0847458 4:-0.25 
+2 1:-0.0555556 2:-0.416667 3:0.38983 4:0.25 
+1 1:-0.388889 2:0.416667 3:-0.830508 4:-0.916667 
+0 1:0.444444 2:-0.0833334 3:0.38983 4:0.833333 
+1 1:-0.611111 2:0.333333 3:-0.864407 4:-0.916667 
+0 1:0.111111 2:-0.416667 3:0.322034 4:0.416667 
+0 1:0.166667 2:-0.0833334 3:0.525424 4:0.416667 
+2 1:0.333333 2:-0.0833334 3:0.152542 4:0.0833333 
+0 1:-0.0555556 2:-0.166667 3:0.288136 4:0.416667 
+0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5 
+1 1:-0.611111 2:0.166667 3:-0.830508 4:-0.916667 
+0 1:0.888889 2:-0.166667 3:0.728813 4:0.833333 
+2 1:-0.277778 2:-0.25 3:-0.118644 4:-4.03573e-08 
+2 1:-0.222222 2:-0.333333 3:0.186441 4:-4.03573e-08 
+0 1:0.333333 2:-0.583333 3:0.627119 4:0.416667 
+0 1:0.444444 2:-0.0833334 3:0.491525 4:0.666667 
+2 1:-0.222222 2:-0.25 3:0.0847457 4:-4.03573e-08 
+1 1:-0.611111 2:0.166667 3:-0.79661 4:-0.75 
+2 1:-0.277778 2:-0.166667 3:0.0508474 4:-4.03573e-08 
+0 1:1 2:0.5 3:0.830508 4:0.583333 
+2 1:-0.333333 2:-0.666667 3:-0.0508475 4:-0.166667 
+2 1:-0.277778 2:-0.416667 3:0.0847457 4:-4.03573e-08 
+0 1:0.888889 2:-0.333333 3:0.932203 4:0.583333 
+2 1:-0.111111 2:-0.166667 3:0.0847457 4:0.166667 
+2 1:0.111111 2:-0.583333 3:0.322034 4:0.166667 
+0 1:0.333333 2:0.0833333 3:0.59322 4:1 
+0 1:0.222222 2:-0.166667 3:0.525424 4:0.416667 
+1 1:-0.555556 2:0.5 3:-0.830508 4:-0.833333 
+0 1:-0.111111 2:-0.166667 3:0.38983 4:0.416667
+0 1:0.888889 2:-0.5 3:1 4:0.833333 
+1 1:-0.388889 2:0.583333 3:-0.898305 4:-0.75 
+2 1:0.111111 2:0.0833333 3:0.254237 4:0.25 
+0 1:0.333333 2:-0.166667 3:0.423729 4:0.833333 
+1 1:-0.388889 2:0.166667 3:-0.762712 4:-0.916667 
+0 1:0.333333 2:-0.0833334 3:0.559322 4:0.916667 
+2 1:-0.333333 2:-0.75 3:0.0169491 4:-4.03573e-08 
+1 1:-0.222222 2:1 3:-0.830508 4:-0.75 
+1 1:-0.388889 2:0.583333 3:-0.762712 4:-0.75 
+2 1:-0.611111 2:-1 3:-0.152542 4:-0.25 
+2 1:-1.32455e-07 2:-0.333333 3:0.254237 4:-0.0833333 
+2 1:-0.5 2:-0.416667 3:-0.0169491 4:0.0833333 
+1 1:-0.888889 2:-0.75 3:-0.898305 4:-0.833333 
+1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 
+2 1:-0.555556 2:-0.583333 3:-0.322034 4:-0.166667 
+2 1:-0.166667 2:-0.5 3:0.0169491 4:-0.0833333 
+1 1:-0.555556 2:0.0833333 3:-0.762712 4:-0.666667 
+1 1:-0.777778 3:-0.898305 4:-0.916667 
+0 1:0.388889 2:-0.166667 3:0.525424 4:0.666667 
+0 1:0.222222 3:0.38983 4:0.583333 
+2 1:0.333333 2:-0.0833334 3:0.254237 4:0.166667 
+2 1:-0.388889 2:-0.166667 3:0.186441 4:0.166667 
+0 1:-0.222222 2:-0.583333 3:0.355932 4:0.583333 
+1 1:-0.611111 2:-0.166667 3:-0.79661 4:-0.916667 
+1 1:-0.944444 2:-0.25 3:-0.864407 4:-0.916667 
+1 1:-0.388889 2:0.166667 3:-0.830508 4:-0.75 
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md
new file mode 100644
index 0000000000000..9ff50e95fc479
--- /dev/null
+++ b/docs/ml-ensembles.md
@@ -0,0 +1,129 @@
+---
+layout: global
+title: Ensembles
+displayTitle: <a href="ml-guide.html">ML</a> - Ensembles
+---
+
+**Table of Contents**
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+An [ensemble method](http://en.wikipedia.org/wiki/Ensemble_learning)
+is a learning algorithm which creates a model composed of a set of other base models.
+The Pipelines API supports the following ensemble algorithms: [`OneVsRest`](api/scala/index.html#org.apache.spark.ml.classifier.OneVsRest)
+
+## OneVsRest
+
+[OneVsRest](http://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is an example of a machine learning reduction for performing multiclass classification given a base classifier that can perform binary classification efficiently.
+
+`OneVsRest` is implemented as an `Estimator`. For the base classifier it takes instances of `Classifier` and creates a binary classification problem for each of the k classes. The classifier for class i is trained to predict whether the label is i or not, distinguishing class i from all other classes.
+
+Predictions are done by evaluating each binary classifier and the index of the most confident classifier is output as label.
+
+### Example
+
+The example below demonstrates how to load the
+[Iris dataset](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/iris.scale), parse it as a DataFrame and perform multiclass classification using `OneVsRest`. The test error is calculated to measure the algorithm accuracy.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
+import org.apache.spark.mllib.evaluation.MulticlassMetrics
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.sql.{Row, SQLContext}
+
+val sqlContext = new SQLContext(sc)
+
+// parse data into dataframe
+val data = MLUtils.loadLibSVMFile(sc, 
+  "data/mllib/sample_multiclass_classification_data.txt")
+val Array(train, test) = data.toDF().randomSplit(Array(0.7, 0.3))
+
+// instantiate multiclass learner and train
+val ovr = new OneVsRest().setClassifier(new LogisticRegression)
+
+val ovrModel = ovr.fit(train)
+
+// score model on test data
+val predictions = ovrModel.transform(test).select("prediction", "label")
+val predictionsAndLabels = predictions.map {case Row(p: Double, l: Double) => (p, l)}
+
+// compute confusion matrix
+val metrics = new MulticlassMetrics(predictionsAndLabels)
+println(metrics.confusionMatrix)
+
+// the Iris DataSet has three classes
+val numClasses = 3
+
+println("label\tfpr\n")
+(0 until numClasses).foreach { index =>
+  val label = index.toDouble
+  println(label + "\t" + metrics.falsePositiveRate(label))
+}
+{% endhighlight %}
+</div>
+<div data-lang="java" markdown="1">
+{% highlight java %}
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.classification.LogisticRegression;
+import org.apache.spark.ml.classification.OneVsRest;
+import org.apache.spark.ml.classification.OneVsRestModel;
+import org.apache.spark.mllib.evaluation.MulticlassMetrics;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+
+SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
+JavaSparkContext jsc = new JavaSparkContext(conf);
+SQLContext jsql = new SQLContext(jsc);
+
+RDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(),
+  "data/mllib/sample_multiclass_classification_data.txt");
+
+DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame[] splits = dataFrame.randomSplit(new double[]{0.7, 0.3}, 12345);
+DataFrame train = splits[0];
+DataFrame test = splits[1];
+
+// instantiate the One Vs Rest Classifier
+OneVsRest ovr = new OneVsRest().setClassifier(new LogisticRegression());
+
+// train the multiclass model
+OneVsRestModel ovrModel = ovr.fit(train.cache());
+
+// score the model on test data
+DataFrame predictions = ovrModel
+  .transform(test)
+  .select("prediction", "label");
+
+// obtain metrics
+MulticlassMetrics metrics = new MulticlassMetrics(predictions);
+Matrix confusionMatrix = metrics.confusionMatrix();
+
+// output the Confusion Matrix
+System.out.println("Confusion Matrix");
+System.out.println(confusionMatrix);
+
+// compute the false positive rate per label
+System.out.println();
+System.out.println("label\tfpr\n");
+
+// the Iris DataSet has three classes
+int numClasses = 3;
+for (int index = 0; index < numClasses; index++) {
+  double label = (double) index;
+  System.out.print(label);
+  System.out.print("\t");
+  System.out.print(metrics.falsePositiveRate(label));
+  System.out.println();
+}
+{% endhighlight %}
+</div>
+</div>
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index cac705683c8bc..c5f50ed7990f1 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -150,11 +150,12 @@ This is useful if there are two algorithms with the `maxIter` parameter in a `Pi
 
 # Algorithm Guides
 
-There are now several algorithms in the Pipelines API which are not in the lower-level MLlib API, so we link to documentation for them here.  These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines.
+There are now several algorithms in the Pipelines API which are not in the lower-level MLlib API, so we link to documentation for them here.  These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in the Pipelines.
 
 **Pipelines API Algorithm Guides**
 
 * [Feature Extraction, Transformation, and Selection](ml-features.html)
+* [Ensembles](ml-ensembles.html)
 
 
 # Code Examples

From eac00691da93a94e6cff5ae0f8952e5724e78094 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 22 May 2015 13:28:14 -0700
Subject: [PATCH 142/525] [SPARK-7766] KryoSerializerInstance reuse is unsafe
 when auto-reset is disabled

SPARK-3386 / #5606 modified the shuffle write path to re-use serializer instances across multiple calls to DiskBlockObjectWriter. It turns out that this introduced a very rare bug when using `KryoSerializer`: if auto-reset is disabled and reference-tracking is enabled, then we'll end up re-using the same serializer instance to write multiple output streams without calling `reset()` between write calls, which can lead to cases where objects in one file may contain references to objects that are in previous files, causing errors during deserialization.

This patch fixes this bug by calling `reset()` at the start of `serialize()` and `serializeStream()`. I also added a regression test which demonstrates that this problem only occurs when auto-reset is disabled and reference-tracking is enabled.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6293 from JoshRosen/kryo-instance-reuse-bug and squashes the following commits:

e19726d [Josh Rosen] Add fix for SPARK-7766.
71845e3 [Josh Rosen] Add failing regression test to trigger Kryo re-use bug
---
 .../spark/serializer/KryoSerializer.scala     |  2 ++
 .../serializer/KryoSerializerSuite.scala      | 33 +++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
index 64ba27f34d2f1..217957963437d 100644
--- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
@@ -177,6 +177,7 @@ private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends Serializ
 
   override def serialize[T: ClassTag](t: T): ByteBuffer = {
     output.clear()
+    kryo.reset() // We must reset in case this serializer instance was reused (see SPARK-7766)
     try {
       kryo.writeClassAndObject(output, t)
     } catch {
@@ -202,6 +203,7 @@ private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends Serializ
   }
 
   override def serializeStream(s: OutputStream): SerializationStream = {
+    kryo.reset() // We must reset in case this serializer instance was reused (see SPARK-7766)
     new KryoSerializationStream(kryo, s)
   }
 
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
index c7369de24b81f..0bd91a8dba2ab 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.serializer
 
+import java.io.ByteArrayOutputStream
+
 import scala.collection.mutable
 import scala.reflect.ClassTag
 
@@ -319,6 +321,37 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext {
     val ser2 = new KryoSerializer(conf).newInstance().asInstanceOf[KryoSerializerInstance]
     assert(!ser2.getAutoReset)
   }
+
+  private def testSerializerInstanceReuse(autoReset: Boolean, referenceTracking: Boolean): Unit = {
+    val conf = new SparkConf(loadDefaults = false)
+      .set("spark.kryo.referenceTracking", referenceTracking.toString)
+    if (!autoReset) {
+      conf.set("spark.kryo.registrator", classOf[RegistratorWithoutAutoReset].getName)
+    }
+    val ser = new KryoSerializer(conf)
+    val serInstance = ser.newInstance().asInstanceOf[KryoSerializerInstance]
+    assert (serInstance.getAutoReset() === autoReset)
+    val obj = ("Hello", "World")
+    def serializeObjects(): Array[Byte] = {
+      val baos = new ByteArrayOutputStream()
+      val serStream = serInstance.serializeStream(baos)
+      serStream.writeObject(obj)
+      serStream.writeObject(obj)
+      serStream.close()
+      baos.toByteArray
+    }
+    val output1: Array[Byte] = serializeObjects()
+    val output2: Array[Byte] = serializeObjects()
+    assert (output1 === output2)
+  }
+
+  // Regression test for SPARK-7766, an issue where disabling auto-reset and enabling
+  // reference-tracking would lead to corrupted output when serializer instances are re-used
+  for (referenceTracking <- Set(true, false); autoReset <- Set(true, false)) {
+    test(s"instance reuse with autoReset = $autoReset, referenceTracking = $referenceTracking") {
+      testSerializerInstanceReuse(autoReset = autoReset, referenceTracking = referenceTracking)
+    }
+  }
 }
 
 

From 31d5d463e76b6611c854c6cf27059fec8198adc9 Mon Sep 17 00:00:00 2001
From: WangTaoTheTonic <wangtao111@huawei.com>
Date: Fri, 22 May 2015 14:43:16 -0700
Subject: [PATCH 143/525] [SPARK-7758] [SQL] Override more configs to avoid
 failure when connect to a postgre sql

https://issues.apache.org/jira/browse/SPARK-7758

When initializing `executionHive`, we only masks
`javax.jdo.option.ConnectionURL` to override metastore location.  However,
other properties that relates to the actual Hive metastore data source are not
masked.  For example, when using Spark SQL with a PostgreSQL backed Hive
metastore, `executionHive` actually tries to use settings read from
`hive-site.xml`, which talks about PostgreSQL, to connect to the temporary
Derby metastore, thus causes error.

To fix this, we need to mask all metastore data source properties.
Specifically, according to the code of [Hive `ObjectStore.getDataSourceProps()`
method] [1], all properties whose name mentions "jdo" and "datanucleus" must be
included.

[1]: https://github.com/apache/hive/blob/release-0.13.1/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java#L288

Have tested using postgre sql as metastore, it worked fine.

Author: WangTaoTheTonic <wangtao111@huawei.com>

Closes #6314 from WangTaoTheTonic/SPARK-7758 and squashes the following commits:

ca7ae7c [WangTaoTheTonic] add comments
86caf2c [WangTaoTheTonic] delete unused import
e4f0feb [WangTaoTheTonic] block more data source related property
92a81fa [WangTaoTheTonic] fix style check
e3e683d [WangTaoTheTonic] override more configs to avoid failuer connecting to postgre sql
---
 .../scala/org/apache/spark/SparkContext.scala  |  2 +-
 .../apache/spark/sql/hive/HiveContext.scala    | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index ad78bdfde2dfb..ea6c0dea08e47 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1884,7 +1884,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    *
    * @param f the closure to clean
    * @param checkSerializable whether or not to immediately check <tt>f</tt> for serializability
-   * @throws <tt>SparkException<tt> if <tt>checkSerializable</tt> is set but <tt>f</tt> is not
+   * @throws SparkException if <tt>checkSerializable</tt> is set but <tt>f</tt> is not
    *   serializable
    */
   private[spark] def clean[F <: AnyRef](f: F, checkSerializable: Boolean = true): F = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 863a5db1bf98c..a8e8e70db0430 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -25,6 +25,7 @@ import org.apache.hadoop.hive.ql.parse.VariableSubstitution
 import org.apache.spark.sql.catalyst.ParserDialect
 
 import scala.collection.JavaConversions._
+import scala.collection.mutable.HashMap
 import scala.language.implicitConversions
 
 import org.apache.hadoop.fs.{FileSystem, Path}
@@ -153,7 +154,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    * Hive 13 as this is the version of Hive that is packaged with Spark SQL.  This copy of the
    * client is used for execution related tasks like registering temporary functions or ensuring
    * that the ThreadLocal SessionState is correctly populated.  This copy of Hive is *not* used
-   * for storing peristent metadata, and only point to a dummy metastore in a temporary directory.
+   * for storing persistent metadata, and only point to a dummy metastore in a temporary directory.
    */
   @transient
   protected[hive] lazy val executionHive: ClientWrapper = {
@@ -507,8 +508,19 @@ private[hive] object HiveContext {
   def newTemporaryConfiguration(): Map[String, String] = {
     val tempDir = Utils.createTempDir()
     val localMetastore = new File(tempDir, "metastore").getAbsolutePath
-    Map(
-      "javax.jdo.option.ConnectionURL" -> s"jdbc:derby:;databaseName=$localMetastore;create=true")
+    val propMap: HashMap[String, String] = HashMap()
+    // We have to mask all properties in hive-site.xml that relates to metastore data source
+    // as we used a local metastore here.
+    HiveConf.ConfVars.values().foreach { confvar  =>
+      if (confvar.varname.contains("datanucleus") || confvar.varname.contains("jdo")) {
+        propMap.put(confvar.varname, confvar.defaultVal)
+      }
+    }
+    propMap.put("javax.jdo.option.ConnectionURL",
+      s"jdbc:derby:;databaseName=$localMetastore;create=true")
+    propMap.put("datanucleus.rdbms.datastoreAdapterClassName",
+      "org.datanucleus.store.rdbms.adapter.DerbyAdapter")
+    propMap.toMap
   }
 
   protected val primitiveTypes =

From e4aef91fe70d6c9765d530b913a9d79103fc27ce Mon Sep 17 00:00:00 2001
From: "Santiago M. Mola" <santi@mola.io>
Date: Fri, 22 May 2015 15:10:27 -0700
Subject: [PATCH 144/525] [SPARK-7724] [SQL] Support Intersect/Except in
 Catalyst DSL.

Author: Santiago M. Mola <santi@mola.io>

Closes #6327 from smola/feature/catalyst-dsl-set-ops and squashes the following commits:

11db778 [Santiago M. Mola] [SPARK-7724] [SQL] Support Intersect/Except in Catalyst DSL.
---
 .../scala/org/apache/spark/sql/catalyst/dsl/package.scala     | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
index 4c0d70203c6f5..307a9ca9b0070 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
@@ -276,6 +276,10 @@ package object dsl {
 
     def unionAll(otherPlan: LogicalPlan): LogicalPlan = Union(logicalPlan, otherPlan)
 
+    def except(otherPlan: LogicalPlan): LogicalPlan = Except(logicalPlan, otherPlan)
+
+    def intersect(otherPlan: LogicalPlan): LogicalPlan = Intersect(logicalPlan, otherPlan)
+
     def sfilter[T1](arg1: Symbol)(udf: (T1) => Boolean): LogicalPlan =
       Filter(ScalaUdf(udf, BooleanType, Seq(UnresolvedAttribute(arg1.name))), logicalPlan)
 

From 126d7235de649ea5619dee6ad3a70970ee90df93 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Fri, 22 May 2015 15:39:58 -0700
Subject: [PATCH 145/525] [SPARK-7270] [SQL] Consider dynamic partition when
 inserting into hive table

JIRA: https://issues.apache.org/jira/browse/SPARK-7270

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #5864 from viirya/dyn_partition_insert and squashes the following commits:

b5627df [Liang-Chi Hsieh] For comments.
3b21e4b [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into dyn_partition_insert
8a4352d [Liang-Chi Hsieh] Consider dynamic partition when inserting into hive table.
---
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 18 ++++++++++++-----
 .../sql/hive/execution/HiveQuerySuite.scala   | 20 +++++++++++++++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 5b6840008f1ce..425a4005aa2c3 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -516,17 +516,19 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     def castChildOutput(p: InsertIntoTable, table: MetastoreRelation, child: LogicalPlan)
       : LogicalPlan = {
       val childOutputDataTypes = child.output.map(_.dataType)
+      val numDynamicPartitions = p.partition.values.count(_.isEmpty)
       val tableOutputDataTypes =
-        (table.attributes ++ table.partitionKeys).take(child.output.length).map(_.dataType)
+        (table.attributes ++ table.partitionKeys.takeRight(numDynamicPartitions))
+          .take(child.output.length).map(_.dataType)
 
       if (childOutputDataTypes == tableOutputDataTypes) {
-        p
+        InsertIntoHiveTable(table, p.partition, p.child, p.overwrite, p.ifNotExists)
       } else if (childOutputDataTypes.size == tableOutputDataTypes.size &&
         childOutputDataTypes.zip(tableOutputDataTypes)
           .forall { case (left, right) => left.sameType(right) }) {
         // If both types ignoring nullability of ArrayType, MapType, StructType are the same,
         // use InsertIntoHiveTable instead of InsertIntoTable.
-        InsertIntoHiveTable(p.table, p.partition, p.child, p.overwrite, p.ifNotExists)
+        InsertIntoHiveTable(table, p.partition, p.child, p.overwrite, p.ifNotExists)
       } else {
         // Only do the casting when child output data types differ from table output data types.
         val castedChildOutput = child.output.zip(table.output).map {
@@ -561,7 +563,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
  * because Hive table doesn't have nullability for ARRAY, MAP, STRUCT types.
  */
 private[hive] case class InsertIntoHiveTable(
-    table: LogicalPlan,
+    table: MetastoreRelation,
     partition: Map[String, Option[String]],
     child: LogicalPlan,
     overwrite: Boolean,
@@ -571,7 +573,13 @@ private[hive] case class InsertIntoHiveTable(
   override def children: Seq[LogicalPlan] = child :: Nil
   override def output: Seq[Attribute] = child.output
 
-  override lazy val resolved: Boolean = childrenResolved && child.output.zip(table.output).forall {
+  val numDynamicPartitions = partition.values.count(_.isEmpty)
+
+  // This is the expected schema of the table prepared to be inserted into,
+  // including dynamic partition columns.
+  val tableOutput = table.attributes ++ table.partitionKeys.takeRight(numDynamicPartitions)
+
+  override lazy val resolved: Boolean = childrenResolved && child.output.zip(tableOutput).forall {
     case (childAttr, tableAttr) => childAttr.dataType.sameType(tableAttr.dataType)
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 65c6ef03bf041..4af31d482ce42 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 
 import org.apache.spark.{SparkFiles, SparkException}
 import org.apache.spark.sql.{AnalysisException, DataFrame, Row}
+import org.apache.spark.sql.catalyst.expressions.Cast
 import org.apache.spark.sql.catalyst.plans.logical.Project
 import org.apache.spark.sql.hive._
 import org.apache.spark.sql.hive.test.TestHive
@@ -415,6 +416,25 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
       |SELECT * FROM createdtable;
     """.stripMargin)
 
+  test("SPARK-7270: consider dynamic partition when comparing table output") {
+    sql(s"CREATE TABLE test_partition (a STRING) PARTITIONED BY (b BIGINT, c STRING)")
+    sql(s"CREATE TABLE ptest (a STRING, b BIGINT, c STRING)")
+
+    val analyzedPlan = sql(
+      """
+        |INSERT OVERWRITE table test_partition PARTITION (b=1, c)
+        |SELECT 'a', 'c' from ptest
+      """.stripMargin).queryExecution.analyzed
+
+    assertResult(false, "Incorrect cast detected\n" + analyzedPlan) {
+      var hasCast = false
+      analyzedPlan.collect {
+        case p: Project => p.transformExpressionsUp { case c: Cast => hasCast = true; c }
+      }
+      hasCast
+    }
+  }
+
   createQueryTest("transform",
     "SELECT TRANSFORM (key) USING 'cat' AS (tKey) FROM src")
 

From 821254fb945c3e19540eb57fff1f656737ef484b Mon Sep 17 00:00:00 2001
From: Imran Rashid <irashid@cloudera.com>
Date: Fri, 22 May 2015 16:05:07 -0700
Subject: [PATCH 146/525] [SPARK-7760] add /json back into master & worker
 pages; add test

Author: Imran Rashid <irashid@cloudera.com>

Closes #6284 from squito/SPARK-7760 and squashes the following commits:

5e02d8a [Imran Rashid] style; increase timeout
9987399 [Imran Rashid] comment
8c7ed63 [Imran Rashid] add /json back into master & worker pages; add test
---
 .../spark/deploy/LocalSparkCluster.scala      |  6 +++-
 .../scala/org/apache/spark/ui/WebUI.scala     |  3 ++
 .../spark/deploy/master/MasterSuite.scala     | 31 +++++++++++++++++--
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala
index 860e1a24901b6..0550f00a172ab 100644
--- a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala
@@ -43,6 +43,8 @@ class LocalSparkCluster(
   private val localHostname = Utils.localHostName()
   private val masterActorSystems = ArrayBuffer[ActorSystem]()
   private val workerActorSystems = ArrayBuffer[ActorSystem]()
+  // exposed for testing
+  var masterWebUIPort = -1
 
   def start(): Array[String] = {
     logInfo("Starting a local Spark cluster with " + numWorkers + " workers.")
@@ -53,7 +55,9 @@ class LocalSparkCluster(
       .set("spark.shuffle.service.enabled", "false")
 
     /* Start the Master */
-    val (masterSystem, masterPort, _, _) = Master.startSystemAndActor(localHostname, 0, 0, _conf)
+    val (masterSystem, masterPort, webUiPort, _) =
+      Master.startSystemAndActor(localHostname, 0, 0, _conf)
+    masterWebUIPort = webUiPort
     masterActorSystems += masterSystem
     val masterUrl = "spark://" + Utils.localHostNameForURI() + ":" + masterPort
     val masters = Array(masterUrl)
diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala
index 1df9cd0fa18b4..594df15e9cc85 100644
--- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala
+++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala
@@ -77,7 +77,10 @@ private[spark] abstract class WebUI(
     val pagePath = "/" + page.prefix
     val renderHandler = createServletHandler(pagePath,
       (request: HttpServletRequest) => page.render(request), securityManager, basePath)
+    val renderJsonHandler = createServletHandler(pagePath.stripSuffix("/") + "/json",
+      (request: HttpServletRequest) => page.renderJson(request), securityManager, basePath)
     attachHandler(renderHandler)
+    attachHandler(renderJsonHandler)
     pageToHandlers.getOrElseUpdate(page, ArrayBuffer[ServletContextHandler]())
       .append(renderHandler)
   }
diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
index 0faa8f650e5e1..f97e5ff6db31d 100644
--- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
@@ -21,16 +21,20 @@ import java.util.Date
 
 import scala.concurrent.Await
 import scala.concurrent.duration._
+import scala.io.Source
 import scala.language.postfixOps
 
 import akka.actor.Address
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
 import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.concurrent.Eventually
 import other.supplier.{CustomPersistenceEngine, CustomRecoveryModeFactory}
 
-import org.apache.spark.deploy._
 import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.deploy._
 
-class MasterSuite extends FunSuite with Matchers {
+class MasterSuite extends FunSuite with Matchers with Eventually {
 
   test("toAkkaUrl") {
     val conf = new SparkConf(loadDefaults = false)
@@ -157,4 +161,27 @@ class MasterSuite extends FunSuite with Matchers {
     CustomRecoveryModeFactory.instantiationAttempts should be > instantiationAttempts
   }
 
+  test("Master & worker web ui available") {
+    implicit val formats = org.json4s.DefaultFormats
+    val conf = new SparkConf()
+    val localCluster = new LocalSparkCluster(2, 2, 512, conf)
+    localCluster.start()
+    try {
+      eventually(timeout(5 seconds), interval(100 milliseconds)) {
+        val json = Source.fromURL(s"http://localhost:${localCluster.masterWebUIPort}/json")
+          .getLines().mkString("\n")
+        val JArray(workers) = (parse(json) \ "workers")
+        workers.size should be (2)
+        workers.foreach { workerSummaryJson =>
+          val JString(workerWebUi) = workerSummaryJson \ "webuiaddress"
+          val workerResponse = parse(Source.fromURL(s"${workerWebUi}/json")
+            .getLines().mkString("\n"))
+          (workerResponse \ "cores").extract[Int] should be (2)
+        }
+      }
+    } finally {
+      localCluster.stop()
+    }
+  }
+
 }

From 3c1305107a2d6d2de862e8b41dbad0e85585b1ef Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Fri, 22 May 2015 17:23:12 -0700
Subject: [PATCH 147/525] [SPARK-7834] [SQL] Better window error messages

Author: Michael Armbrust <michael@databricks.com>

Closes #6363 from marmbrus/windowErrors and squashes the following commits:

516b02d [Michael Armbrust] [SPARK-7834] [SQL] Better window error messages
---
 .../spark/sql/catalyst/analysis/CheckAnalysis.scala |  5 +++++
 .../spark/sql/catalyst/analysis/AnalysisSuite.scala | 13 +++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 06a0504359f6e..193dc6b6546b5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -71,6 +71,11 @@ trait CheckAnalysis {
               s"invalid expression ${b.prettyString} " +
               s"between ${b.left.dataType.simpleString} and ${b.right.dataType.simpleString}")
 
+          case WindowExpression(UnresolvedWindowFunction(name, _), _) =>
+            failAnalysis(
+              s"Could not resolve window function '$name'. " +
+              "Note that, using window functions currently requires a HiveContext")
+
           case w @ WindowExpression(windowFunction, windowSpec) if windowSpec.validate.nonEmpty =>
             // The window spec is not valid.
             val reason = windowSpec.validate.get
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index e1d6ac462fbcc..939cefb71b817 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -166,6 +166,19 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter {
     }
   }
 
+  errorTest(
+    "unresolved window function",
+    testRelation2.select(
+      WindowExpression(
+        UnresolvedWindowFunction(
+          "lead",
+          UnresolvedAttribute("c") :: Nil),
+        WindowSpecDefinition(
+          UnresolvedAttribute("a") :: Nil,
+          SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil,
+          UnspecifiedFrame)).as('window)),
+      "lead" :: "window functions currently requires a HiveContext" :: Nil)
+
   errorTest(
     "too many generators",
     listRelation.select(Explode('list).as('a), Explode('list).as('b)),

From 3d8760d76eae41dcaab8e9aeda19619f3d5f1596 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 22 May 2015 17:37:38 -0700
Subject: [PATCH 148/525] [SPARK-7771] [SPARK-7779] Dynamic allocation: lower
 default timeouts further

The default add time of 5s is still too slow for small jobs. Also, the current default remove time of 10 minutes seem rather high. This patch lowers both and rephrases a few log messages.

Author: Andrew Or <andrew@databricks.com>

Closes #6301 from andrewor14/da-minor and squashes the following commits:

6d614a6 [Andrew Or] Lower log level
2811492 [Andrew Or] Log information when requests are canceled
5fcd3eb [Andrew Or] Fix tests
3320710 [Andrew Or] Lower timeouts + rephrase a few log messages
---
 .../spark/ExecutorAllocationManager.scala     | 26 +++++++++++++------
 docs/configuration.md                         |  4 +--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
index 66bda68088502..9514604752640 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
@@ -91,7 +91,7 @@ private[spark] class ExecutorAllocationManager(
 
   // How long there must be backlogged tasks for before an addition is triggered (seconds)
   private val schedulerBacklogTimeoutS = conf.getTimeAsSeconds(
-    "spark.dynamicAllocation.schedulerBacklogTimeout", "5s")
+    "spark.dynamicAllocation.schedulerBacklogTimeout", "1s")
 
   // Same as above, but used only after `schedulerBacklogTimeoutS` is exceeded
   private val sustainedSchedulerBacklogTimeoutS = conf.getTimeAsSeconds(
@@ -99,7 +99,7 @@ private[spark] class ExecutorAllocationManager(
 
   // How long an executor must be idle for before it is removed (seconds)
   private val executorIdleTimeoutS = conf.getTimeAsSeconds(
-    "spark.dynamicAllocation.executorIdleTimeout", "600s")
+    "spark.dynamicAllocation.executorIdleTimeout", "60s")
 
   // During testing, the methods to actually kill and add executors are mocked out
   private val testing = conf.getBoolean("spark.dynamicAllocation.testing", false)
@@ -268,6 +268,8 @@ private[spark] class ExecutorAllocationManager(
       numExecutorsTarget = math.max(maxNeeded, minNumExecutors)
       client.requestTotalExecutors(numExecutorsTarget)
       numExecutorsToAdd = 1
+      logInfo(s"Lowering target number of executors to $numExecutorsTarget because " +
+        s"not all requests are actually needed (previously $oldNumExecutorsTarget)")
       numExecutorsTarget - oldNumExecutorsTarget
     } else if (addTime != NOT_SET && now >= addTime) {
       val delta = addExecutors(maxNeeded)
@@ -292,9 +294,8 @@ private[spark] class ExecutorAllocationManager(
   private def addExecutors(maxNumExecutorsNeeded: Int): Int = {
     // Do not request more executors if it would put our target over the upper bound
     if (numExecutorsTarget >= maxNumExecutors) {
-      val numExecutorsPending = numExecutorsTarget - executorIds.size
-      logDebug(s"Not adding executors because there are already ${executorIds.size} registered " +
-        s"and ${numExecutorsPending} pending executor(s) (limit $maxNumExecutors)")
+      logDebug(s"Not adding executors because our current target total " +
+        s"is already $numExecutorsTarget (limit $maxNumExecutors)")
       numExecutorsToAdd = 1
       return 0
     }
@@ -310,10 +311,19 @@ private[spark] class ExecutorAllocationManager(
     // Ensure that our target fits within configured bounds:
     numExecutorsTarget = math.max(math.min(numExecutorsTarget, maxNumExecutors), minNumExecutors)
 
+    val delta = numExecutorsTarget - oldNumExecutorsTarget
+
+    // If our target has not changed, do not send a message
+    // to the cluster manager and reset our exponential growth
+    if (delta == 0) {
+      numExecutorsToAdd = 1
+      return 0
+    }
+
     val addRequestAcknowledged = testing || client.requestTotalExecutors(numExecutorsTarget)
     if (addRequestAcknowledged) {
-      val delta = numExecutorsTarget - oldNumExecutorsTarget
-      logInfo(s"Requesting $delta new executor(s) because tasks are backlogged" +
+      val executorsString = "executor" + { if (delta > 1) "s" else "" }
+      logInfo(s"Requesting $delta new $executorsString because tasks are backlogged" +
         s" (new desired total will be $numExecutorsTarget)")
       numExecutorsToAdd = if (delta == numExecutorsToAdd) {
         numExecutorsToAdd * 2
@@ -420,7 +430,7 @@ private[spark] class ExecutorAllocationManager(
    * This resets all variables used for adding executors.
    */
   private def onSchedulerQueueEmpty(): Unit = synchronized {
-    logDebug(s"Clearing timer to add executors because there are no more pending tasks")
+    logDebug("Clearing timer to add executors because there are no more pending tasks")
     addTime = NOT_SET
     numExecutorsToAdd = 1
   }
diff --git a/docs/configuration.md b/docs/configuration.md
index 0de824546c751..30508a617fdd8 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1194,7 +1194,7 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 <tr>
   <td><code>spark.dynamicAllocation.executorIdleTimeout</code></td>
-  <td>600s</td>
+  <td>60s</td>
   <td>
     If dynamic allocation is enabled and an executor has been idle for more than this duration, 
     the executor will be removed. For more detail, see this
@@ -1224,7 +1224,7 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 <tr>
   <td><code>spark.dynamicAllocation.schedulerBacklogTimeout</code></td>
-  <td>5s</td>
+  <td>1s</td>
   <td>
     If dynamic allocation is enabled and there have been pending tasks backlogged for more than
     this duration, new executors will be requested. For more detail, see this

From 1c388a9985999e043fa002618a357bc8f0a8b65a Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Fri, 22 May 2015 17:39:01 -0700
Subject: [PATCH 149/525] [SPARK-7788] Made KinesisReceiver.onStart()
 non-blocking

KinesisReceiver calls worker.run() which is a blocking call (while loop) as per source code of kinesis-client library - https://github.com/awslabs/amazon-kinesis-client/blob/v1.2.1/src/main/java/com/amazonaws/services/kinesis/clientlibrary/lib/worker/Worker.java.
This results in infinite loop while calling sparkStreamingContext.stop(stopSparkContext = false, stopGracefully = true) perhaps because ReceiverTracker is never able to register the receiver (it's receiverInfo field is a empty map) causing it to be stuck in infinite loop while waiting for running flag to be set to false.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6348 from tdas/SPARK-7788 and squashes the following commits:

2584683 [Tathagata Das] Added receiver id in thread name
6cf1cd4 [Tathagata Das] Made KinesisReceiver.onStart non-blocking
---
 .../streaming/kinesis/KinesisReceiver.scala   | 30 +++++++++++++++----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
index 800202e9fb86a..7dd8bfdc2a6db 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
@@ -18,6 +18,8 @@ package org.apache.spark.streaming.kinesis
 
 import java.util.UUID
 
+import scala.util.control.NonFatal
+
 import com.amazonaws.auth.{AWSCredentials, AWSCredentialsProvider, BasicAWSCredentials, DefaultAWSCredentialsProviderChain}
 import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorFactory}
 import com.amazonaws.services.kinesis.clientlibrary.lib.worker.{InitialPositionInStream, KinesisClientLibConfiguration, Worker}
@@ -98,6 +100,9 @@ private[kinesis] class KinesisReceiver(
    */
   private var worker: Worker = null
 
+  /** Thread running the worker */
+  private var workerThread: Thread = null
+
   /**
    * This is called when the KinesisReceiver starts and must be non-blocking.
    * The KCL creates and manages the receiving/processing thread pool through Worker.run().
@@ -126,8 +131,19 @@ private[kinesis] class KinesisReceiver(
     }
 
     worker = new Worker(recordProcessorFactory, kinesisClientLibConfiguration)
-    worker.run()
-
+    workerThread = new Thread() {
+      override def run(): Unit = {
+        try {
+          worker.run()
+        } catch {
+          case NonFatal(e) =>
+            restart("Error running the KCL worker in Receiver", e)
+        }
+      }
+    }
+    workerThread.setName(s"Kinesis Receiver ${streamId}")
+    workerThread.setDaemon(true)
+    workerThread.start()
     logInfo(s"Started receiver with workerId $workerId")
   }
 
@@ -137,10 +153,14 @@ private[kinesis] class KinesisReceiver(
    * The KCL will do its best to drain and checkpoint any in-flight records upon shutdown.
    */
   override def onStop() {
-    if (worker != null) {
-      worker.shutdown()
+    if (workerThread != null) {
+      if (worker != null) {
+        worker.shutdown()
+        worker = null
+      }
+      workerThread.join()
+      workerThread = null
       logInfo(s"Stopped receiver for workerId $workerId")
-      worker = null
     }
     workerId = null
   }

From 8014e1f6bb871d9fd4db74106eb4425d0c1e9dd6 Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Fri, 22 May 2015 17:48:09 -0700
Subject: [PATCH 150/525] [SPARK-7224] [SPARK-7306] mock repository generator
 for --packages tests without nio.Path

The previous PR for SPARK-7224 (#5790) broke JDK 6, because it used java.nio.Path, which was in jdk 7, and not in 6. This PR uses Guava's `Files` to handle directory creation, and etc...

The description from the previous PR:
> This patch contains an `IvyTestUtils` file, which dynamically generates jars and pom files to test the `--packages` feature without having to rely on the internet, and Maven Central.

cc pwendell

I also rand the flaky test about 20 times locally, it didn't fail a single time, but I think it may fail like once every 100 builds? I still haven't figured the cause yet, but the test before it, `--jars` was also failing after we turned off the `--packages` test in `SparkSubmitSuite`. It may be related to the launch of SparkSubmit.

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #5892 from brkyvz/maven-utils and squashes the following commits:

e9b1903 [Burak Yavuz] fix merge conflict
68214e0 [Burak Yavuz] remove ignore for test(neglect spark dependencies)
e632381 [Burak Yavuz] fix ignore
9ef1408 [Burak Yavuz] re-enable --packages test
22eea62 [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into maven-utils
05cd0de [Burak Yavuz] added mock repository generator
---
 .../scala/org/apache/spark/TestUtils.scala    |  27 +-
 .../org/apache/spark/deploy/SparkSubmit.scala | 128 +++++----
 .../apache/spark/deploy/IvyTestUtils.scala    | 261 ++++++++++++++++++
 .../spark/deploy/SparkSubmitSuite.scala       |  27 +-
 .../spark/deploy/SparkSubmitUtilsSuite.scala  |  61 ++--
 5 files changed, 404 insertions(+), 100 deletions(-)
 create mode 100644 core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala

diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala
index 398ca41e16151..fe6320b504e15 100644
--- a/core/src/main/scala/org/apache/spark/TestUtils.scala
+++ b/core/src/main/scala/org/apache/spark/TestUtils.scala
@@ -105,23 +105,18 @@ private[spark] object TestUtils {
     URI.create(s"string:///${name.replace(".", "/")}${SOURCE.extension}")
   }
 
-  private class JavaSourceFromString(val name: String, val code: String)
+  private[spark] class JavaSourceFromString(val name: String, val code: String)
     extends SimpleJavaFileObject(createURI(name), SOURCE) {
     override def getCharContent(ignoreEncodingErrors: Boolean): String = code
   }
 
-  /** Creates a compiled class with the given name. Class file will be placed in destDir. */
+  /** Creates a compiled class with the source file. Class file will be placed in destDir. */
   def createCompiledClass(
       className: String,
       destDir: File,
-      toStringValue: String = "",
-      baseClass: String = null,
-      classpathUrls: Seq[URL] = Seq()): File = {
+      sourceFile: JavaSourceFromString,
+      classpathUrls: Seq[URL]): File = {
     val compiler = ToolProvider.getSystemJavaCompiler
-    val extendsText = Option(baseClass).map { c => s" extends ${c}" }.getOrElse("")
-    val sourceFile = new JavaSourceFromString(className,
-      "public class " + className + extendsText + " implements java.io.Serializable {" +
-      "  @Override public String toString() { return \"" + toStringValue + "\"; }}")
 
     // Calling this outputs a class file in pwd. It's easier to just rename the file than
     // build a custom FileManager that controls the output location.
@@ -144,4 +139,18 @@ private[spark] object TestUtils {
     assert(out.exists(), "Destination file not moved: " + out.getAbsolutePath())
     out
   }
+
+  /** Creates a compiled class with the given name. Class file will be placed in destDir. */
+  def createCompiledClass(
+      className: String,
+      destDir: File,
+      toStringValue: String = "",
+      baseClass: String = null,
+      classpathUrls: Seq[URL] = Seq()): File = {
+    val extendsText = Option(baseClass).map { c => s" extends ${c}" }.getOrElse("")
+    val sourceFile = new JavaSourceFromString(className,
+      "public class " + className + extendsText + " implements java.io.Serializable {" +
+      "  @Override public String toString() { return \"" + toStringValue + "\"; }}")
+    createCompiledClass(className, destDir, sourceFile, classpathUrls)
+  }
 }
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 329fa06ba8ba5..198371b70f14f 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -753,7 +753,9 @@ private[spark] object SparkSubmitUtils {
    * @param artifactId the artifactId of the coordinate
    * @param version the version of the coordinate
    */
-  private[deploy] case class MavenCoordinate(groupId: String, artifactId: String, version: String)
+  private[deploy] case class MavenCoordinate(groupId: String, artifactId: String, version: String) {
+    override def toString: String = s"$groupId:$artifactId:$version"
+  }
 
 /**
  * Extracts maven coordinates from a comma-delimited string. Coordinates should be provided
@@ -776,6 +778,10 @@ private[spark] object SparkSubmitUtils {
     }
   }
 
+  /** Path of the local Maven cache. */
+  private[spark] def m2Path: File = new File(System.getProperty("user.home"),
+    ".m2" + File.separator + "repository" + File.separator)
+
   /**
    * Extracts maven coordinates from a comma-delimited string
    * @param remoteRepos Comma-delimited string of remote repositories
@@ -789,8 +795,7 @@ private[spark] object SparkSubmitUtils {
 
     val localM2 = new IBiblioResolver
     localM2.setM2compatible(true)
-    val m2Path = ".m2" + File.separator + "repository" + File.separator
-    localM2.setRoot(new File(System.getProperty("user.home"), m2Path).toURI.toString)
+    localM2.setRoot(m2Path.toURI.toString)
     localM2.setUsepoms(true)
     localM2.setName("local-m2-cache")
     cr.add(localM2)
@@ -915,69 +920,72 @@ private[spark] object SparkSubmitUtils {
       ""
     } else {
       val sysOut = System.out
-      // To prevent ivy from logging to system out
-      System.setOut(printStream)
-      val artifacts = extractMavenCoordinates(coordinates)
-      // Default configuration name for ivy
-      val ivyConfName = "default"
-      // set ivy settings for location of cache
-      val ivySettings: IvySettings = new IvySettings
-      // Directories for caching downloads through ivy and storing the jars when maven coordinates
-      // are supplied to spark-submit
-      val alternateIvyCache = ivyPath.getOrElse("")
-      val packagesDirectory: File =
-        if (alternateIvyCache.trim.isEmpty) {
-          new File(ivySettings.getDefaultIvyUserDir, "jars")
+      try {
+        // To prevent ivy from logging to system out
+        System.setOut(printStream)
+        val artifacts = extractMavenCoordinates(coordinates)
+        // Default configuration name for ivy
+        val ivyConfName = "default"
+        // set ivy settings for location of cache
+        val ivySettings: IvySettings = new IvySettings
+        // Directories for caching downloads through ivy and storing the jars when maven coordinates
+        // are supplied to spark-submit
+        val alternateIvyCache = ivyPath.getOrElse("")
+        val packagesDirectory: File =
+          if (alternateIvyCache.trim.isEmpty) {
+            new File(ivySettings.getDefaultIvyUserDir, "jars")
+          } else {
+            ivySettings.setDefaultIvyUserDir(new File(alternateIvyCache))
+            ivySettings.setDefaultCache(new File(alternateIvyCache, "cache"))
+            new File(alternateIvyCache, "jars")
+          }
+        printStream.println(
+          s"Ivy Default Cache set to: ${ivySettings.getDefaultCache.getAbsolutePath}")
+        printStream.println(s"The jars for the packages stored in: $packagesDirectory")
+        // create a pattern matcher
+        ivySettings.addMatcher(new GlobPatternMatcher)
+        // create the dependency resolvers
+        val repoResolver = createRepoResolvers(remoteRepos, ivySettings)
+        ivySettings.addResolver(repoResolver)
+        ivySettings.setDefaultResolver(repoResolver.getName)
+
+        val ivy = Ivy.newInstance(ivySettings)
+        // Set resolve options to download transitive dependencies as well
+        val resolveOptions = new ResolveOptions
+        resolveOptions.setTransitive(true)
+        val retrieveOptions = new RetrieveOptions
+        // Turn downloading and logging off for testing
+        if (isTest) {
+          resolveOptions.setDownload(false)
+          resolveOptions.setLog(LogOptions.LOG_QUIET)
+          retrieveOptions.setLog(LogOptions.LOG_QUIET)
         } else {
-          ivySettings.setDefaultIvyUserDir(new File(alternateIvyCache))
-          ivySettings.setDefaultCache(new File(alternateIvyCache, "cache"))
-          new File(alternateIvyCache, "jars")
+          resolveOptions.setDownload(true)
         }
-      printStream.println(
-        s"Ivy Default Cache set to: ${ivySettings.getDefaultCache.getAbsolutePath}")
-      printStream.println(s"The jars for the packages stored in: $packagesDirectory")
-      // create a pattern matcher
-      ivySettings.addMatcher(new GlobPatternMatcher)
-      // create the dependency resolvers
-      val repoResolver = createRepoResolvers(remoteRepos, ivySettings)
-      ivySettings.addResolver(repoResolver)
-      ivySettings.setDefaultResolver(repoResolver.getName)
-
-      val ivy = Ivy.newInstance(ivySettings)
-      // Set resolve options to download transitive dependencies as well
-      val resolveOptions = new ResolveOptions
-      resolveOptions.setTransitive(true)
-      val retrieveOptions = new RetrieveOptions
-      // Turn downloading and logging off for testing
-      if (isTest) {
-        resolveOptions.setDownload(false)
-        resolveOptions.setLog(LogOptions.LOG_QUIET)
-        retrieveOptions.setLog(LogOptions.LOG_QUIET)
-      } else {
-        resolveOptions.setDownload(true)
-      }
 
-      // A Module descriptor must be specified. Entries are dummy strings
-      val md = getModuleDescriptor
-      md.setDefaultConf(ivyConfName)
+        // A Module descriptor must be specified. Entries are dummy strings
+        val md = getModuleDescriptor
+        md.setDefaultConf(ivyConfName)
 
-      // Add exclusion rules for Spark and Scala Library
-      addExclusionRules(ivySettings, ivyConfName, md)
-      // add all supplied maven artifacts as dependencies
-      addDependenciesToIvy(md, artifacts, ivyConfName)
+        // Add exclusion rules for Spark and Scala Library
+        addExclusionRules(ivySettings, ivyConfName, md)
+        // add all supplied maven artifacts as dependencies
+        addDependenciesToIvy(md, artifacts, ivyConfName)
 
-      // resolve dependencies
-      val rr: ResolveReport = ivy.resolve(md, resolveOptions)
-      if (rr.hasError) {
-        throw new RuntimeException(rr.getAllProblemMessages.toString)
+        // resolve dependencies
+        val rr: ResolveReport = ivy.resolve(md, resolveOptions)
+        if (rr.hasError) {
+          throw new RuntimeException(rr.getAllProblemMessages.toString)
+        }
+        // retrieve all resolved dependencies
+        ivy.retrieve(rr.getModuleDescriptor.getModuleRevisionId,
+          packagesDirectory.getAbsolutePath + File.separator +
+            "[organization]_[artifact]-[revision].[ext]",
+          retrieveOptions.setConfs(Array(ivyConfName)))
+        resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory)
+      } finally {
+        System.setOut(sysOut)
       }
-      // retrieve all resolved dependencies
-      ivy.retrieve(rr.getModuleDescriptor.getModuleRevisionId,
-        packagesDirectory.getAbsolutePath + File.separator +
-          "[organization]_[artifact]-[revision].[ext]",
-        retrieveOptions.setConfs(Array(ivyConfName)))
-      System.setOut(sysOut)
-      resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory)
     }
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
new file mode 100644
index 0000000000000..7d39984424842
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy
+
+import java.io.{File, FileInputStream, FileOutputStream}
+import java.util.jar.{JarEntry, JarOutputStream}
+
+import com.google.common.io.{Files, ByteStreams}
+
+import org.apache.commons.io.FileUtils
+
+import org.apache.spark.TestUtils.{createCompiledClass, JavaSourceFromString}
+import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
+
+private[deploy] object IvyTestUtils {
+
+  /**
+   * Create the path for the jar and pom from the maven coordinate. Extension should be `jar`
+   * or `pom`.
+   */
+  private def pathFromCoordinate(
+      artifact: MavenCoordinate,
+      prefix: File,
+      ext: String,
+      useIvyLayout: Boolean): File = {
+    val groupDirs = artifact.groupId.replace(".", File.separator)
+    val artifactDirs = artifact.artifactId
+    val artifactPath =
+      if (!useIvyLayout) {
+        Seq(groupDirs, artifactDirs, artifact.version).mkString(File.separator)
+      } else {
+        Seq(groupDirs, artifactDirs, artifact.version, ext + "s").mkString(File.separator)
+      }
+    new File(prefix, artifactPath)
+  }
+
+  private def artifactName(artifact: MavenCoordinate, ext: String = ".jar"): String = {
+    s"${artifact.artifactId}-${artifact.version}$ext"
+  }
+
+  /** Write the contents to a file to the supplied directory. */
+  private def writeFile(dir: File, fileName: String, contents: String): File = {
+    val outputFile = new File(dir, fileName)
+    val outputStream = new FileOutputStream(outputFile)
+    outputStream.write(contents.toCharArray.map(_.toByte))
+    outputStream.close()
+    outputFile
+  }
+
+  /** Create an example Python file. */
+  private def createPythonFile(dir: File): File = {
+    val contents =
+      """def myfunc(x):
+        |   return x + 1
+      """.stripMargin
+    writeFile(dir, "mylib.py", contents)
+  }
+
+  /** Create a simple testable Class. */
+  private def createJavaClass(dir: File, className: String, packageName: String): File = {
+    val contents =
+      s"""package $packageName;
+        |
+        |import java.lang.Integer;
+        |
+        |class $className implements java.io.Serializable {
+        |
+        | public $className() {}
+        |
+        | public Integer myFunc(Integer x) {
+        |   return x + 1;
+        | }
+        |}
+      """.stripMargin
+    val sourceFile =
+      new JavaSourceFromString(new File(dir, className + ".java").getAbsolutePath, contents)
+    createCompiledClass(className, dir, sourceFile, Seq.empty)
+  }
+
+  /** Helper method to write artifact information in the pom. */
+  private def pomArtifactWriter(artifact: MavenCoordinate, tabCount: Int = 1): String = {
+    var result = "\n" + "  " * tabCount + s"<groupId>${artifact.groupId}</groupId>"
+    result += "\n" + "  " * tabCount + s"<artifactId>${artifact.artifactId}</artifactId>"
+    result += "\n" + "  " * tabCount + s"<version>${artifact.version}</version>"
+    result
+  }
+
+  /** Create a pom file for this artifact. */
+  private def createPom(
+      dir: File,
+      artifact: MavenCoordinate,
+      dependencies: Option[Seq[MavenCoordinate]]): File = {
+    var content = """
+                    |<?xml version="1.0" encoding="UTF-8"?>
+                    |<project xmlns="http://maven.apache.org/POM/4.0.0"
+                    |       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                    |       xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+                    |       http://maven.apache.org/xsd/maven-4.0.0.xsd">
+                    |   <modelVersion>4.0.0</modelVersion>
+                  """.stripMargin.trim
+    content += pomArtifactWriter(artifact)
+    content += dependencies.map { deps =>
+      val inside = deps.map { dep =>
+        "\t<dependency>" + pomArtifactWriter(dep, 3) + "\n\t</dependency>"
+      }.mkString("\n")
+      "\n  <dependencies>\n" + inside + "\n  </dependencies>"
+    }.getOrElse("")
+    content += "\n</project>"
+    writeFile(dir, artifactName(artifact, ".pom"), content.trim)
+  }
+
+  /** Create the jar for the given maven coordinate, using the supplied files. */
+  private def packJar(
+      dir: File,
+      artifact: MavenCoordinate,
+      files: Seq[(String, File)]): File = {
+    val jarFile = new File(dir, artifactName(artifact))
+    val jarFileStream = new FileOutputStream(jarFile)
+    val jarStream = new JarOutputStream(jarFileStream, new java.util.jar.Manifest())
+
+    for (file <- files) {
+      val jarEntry = new JarEntry(file._1)
+      jarStream.putNextEntry(jarEntry)
+
+      val in = new FileInputStream(file._2)
+      ByteStreams.copy(in, jarStream)
+      in.close()
+    }
+    jarStream.close()
+    jarFileStream.close()
+
+    jarFile
+  }
+
+  /**
+   * Creates a jar and pom file, mocking a Maven repository. The root path can be supplied with
+   * `tempDir`, dependencies can be created into the same repo, and python files can also be packed
+   * inside the jar.
+   *
+   * @param artifact The maven coordinate to generate the jar and pom for.
+   * @param dependencies List of dependencies this artifact might have to also create jars and poms.
+   * @param tempDir The root folder of the repository
+   * @param useIvyLayout whether to mock the Ivy layout for local repository testing
+   * @param withPython Whether to pack python files inside the jar for extensive testing.
+   * @return Root path of the repository
+   */
+  private def createLocalRepository(
+      artifact: MavenCoordinate,
+      dependencies: Option[Seq[MavenCoordinate]] = None,
+      tempDir: Option[File] = None,
+      useIvyLayout: Boolean = false,
+      withPython: Boolean = false): File = {
+    // Where the root of the repository exists, and what Ivy will search in
+    val tempPath = tempDir.getOrElse(Files.createTempDir())
+    // Create directory if it doesn't exist
+    Files.createParentDirs(tempPath)
+    // Where to create temporary class files and such
+    val root = new File(tempPath, tempPath.hashCode().toString)
+    Files.createParentDirs(new File(root, "dummy"))
+    try {
+      val jarPath = pathFromCoordinate(artifact, tempPath, "jar", useIvyLayout)
+      Files.createParentDirs(new File(jarPath, "dummy"))
+      val className = "MyLib"
+
+      val javaClass = createJavaClass(root, className, artifact.groupId)
+      // A tuple of files representation in the jar, and the file
+      val javaFile = (artifact.groupId.replace(".", "/") + "/" + javaClass.getName, javaClass)
+      val allFiles =
+        if (withPython) {
+          val pythonFile = createPythonFile(root)
+          Seq(javaFile, (pythonFile.getName, pythonFile))
+        } else {
+          Seq(javaFile)
+        }
+      val jarFile = packJar(jarPath, artifact, allFiles)
+      assert(jarFile.exists(), "Problem creating Jar file")
+      val pomPath = pathFromCoordinate(artifact, tempPath, "pom", useIvyLayout)
+      Files.createParentDirs(new File(pomPath, "dummy"))
+      val pomFile = createPom(pomPath, artifact, dependencies)
+      assert(pomFile.exists(), "Problem creating Pom file")
+    } finally {
+      FileUtils.deleteDirectory(root)
+    }
+    tempPath
+  }
+
+  /**
+   * Creates a suite of jars and poms, with or without dependencies, mocking a maven repository.
+   * @param artifact The main maven coordinate to generate the jar and pom for.
+   * @param dependencies List of dependencies this artifact might have to also create jars and poms.
+   * @param rootDir The root folder of the repository (like `~/.m2/repositories`)
+   * @param useIvyLayout whether to mock the Ivy layout for local repository testing
+   * @param withPython Whether to pack python files inside the jar for extensive testing.
+   * @return Root path of the repository. Will be `rootDir` if supplied.
+   */
+  private[deploy] def createLocalRepositoryForTests(
+      artifact: MavenCoordinate,
+      dependencies: Option[String],
+      rootDir: Option[File],
+      useIvyLayout: Boolean = false,
+      withPython: Boolean = false): File = {
+    val deps = dependencies.map(SparkSubmitUtils.extractMavenCoordinates)
+    val mainRepo = createLocalRepository(artifact, deps, rootDir, useIvyLayout, withPython)
+    deps.foreach { seq => seq.foreach { dep =>
+      createLocalRepository(dep, None, Some(mainRepo), useIvyLayout, withPython = false)
+    }}
+    mainRepo
+  }
+
+  /**
+   * Creates a repository for a test, and cleans it up afterwards.
+   *
+   * @param artifact The main maven coordinate to generate the jar and pom for.
+   * @param dependencies List of dependencies this artifact might have to also create jars and poms.
+   * @param rootDir The root folder of the repository (like `~/.m2/repositories`)
+   * @param useIvyLayout whether to mock the Ivy layout for local repository testing
+   * @param withPython Whether to pack python files inside the jar for extensive testing.
+   * @return Root path of the repository. Will be `rootDir` if supplied.
+   */
+  private[deploy] def withRepository(
+      artifact: MavenCoordinate,
+      dependencies: Option[String],
+      rootDir: Option[File],
+      useIvyLayout: Boolean = false,
+      withPython: Boolean = false)(f: String => Unit): Unit = {
+    val repo = createLocalRepositoryForTests(artifact, dependencies, rootDir, useIvyLayout,
+      withPython)
+    try {
+      f(repo.toURI.toString)
+    } finally {
+      // Clean up
+      if (repo.toString.contains(".m2") || repo.toString.contains(".ivy2")) {
+        FileUtils.deleteDirectory(new File(repo,
+          artifact.groupId.replace(".", File.separator) + File.separator + artifact.artifactId))
+        dependencies.map(SparkSubmitUtils.extractMavenCoordinates).foreach { seq =>
+          seq.foreach { dep =>
+            FileUtils.deleteDirectory(new File(repo,
+              dep.artifactId.replace(".", File.separator)))
+          }
+        }
+      } else {
+        FileUtils.deleteDirectory(repo)
+      }
+    }
+  }
+}
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 61c95419aedcf..8f64ab5e42108 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -30,6 +30,7 @@ import org.scalatest.time.SpanSugar._
 
 import org.apache.spark._
 import org.apache.spark.deploy.SparkSubmit._
+import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
 import org.apache.spark.util.{ResetSystemProperties, Utils}
 
 // Note: this suite mixes in ResetSystemProperties because SparkSubmit.main() sets a bunch
@@ -334,18 +335,22 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties
     runSparkSubmit(args)
   }
 
-  ignore("includes jars passed in through --packages") {
+  test("includes jars passed in through --packages") {
     val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
-    val packagesString = "com.databricks:spark-csv_2.10:0.1,com.databricks:spark-avro_2.10:0.1"
-    val args = Seq(
-      "--class", JarCreationTest.getClass.getName.stripSuffix("$"),
-      "--name", "testApp",
-      "--master", "local-cluster[2,1,512]",
-      "--packages", packagesString,
-      "--conf", "spark.ui.enabled=false",
-      unusedJar.toString,
-      "com.databricks.spark.csv.DefaultSource", "com.databricks.spark.avro.DefaultSource")
-    runSparkSubmit(args)
+    val main = MavenCoordinate("my.great.lib", "mylib", "0.1")
+    val dep = MavenCoordinate("my.great.dep", "mylib", "0.1")
+    IvyTestUtils.withRepository(main, Some(dep.toString), None) { repo =>
+      val args = Seq(
+        "--class", JarCreationTest.getClass.getName.stripSuffix("$"),
+        "--name", "testApp",
+        "--master", "local-cluster[2,1,512]",
+        "--packages", Seq(main, dep).mkString(","),
+        "--repositories", repo,
+        "--conf", "spark.ui.enabled=false",
+        unusedJar.toString,
+        "my.great.lib.MyLib", "my.great.dep.MyLib")
+      runSparkSubmit(args)
+    }
   }
 
   test("resolves command line argument paths correctly") {
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index da9578478bed9..088ca3cb93b49 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -17,17 +17,17 @@
 
 package org.apache.spark.deploy
 
-import java.io.{PrintStream, OutputStream, File}
-
-import org.apache.ivy.core.settings.IvySettings
+import java.io.{File, PrintStream, OutputStream}
 
 import scala.collection.mutable.ArrayBuffer
-
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
 
 import org.apache.ivy.core.module.descriptor.MDArtifact
+import org.apache.ivy.core.settings.IvySettings
 import org.apache.ivy.plugins.resolver.IBiblioResolver
 
+import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
+
 class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll {
 
   private val noOpOutputStream = new OutputStream {
@@ -89,7 +89,7 @@ class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll {
   }
 
   test("ivy path works correctly") {
-    val ivyPath = "dummy/ivy"
+    val ivyPath = "dummy" + File.separator +  "ivy"
     val md = SparkSubmitUtils.getModuleDescriptor
     val artifacts = for (i <- 0 until 3) yield new MDArtifact(md, s"jar-$i", "jar", "jar")
     var jPaths = SparkSubmitUtils.resolveDependencyPaths(artifacts.toArray, new File(ivyPath))
@@ -98,17 +98,38 @@ class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll {
       assert(index >= 0)
       jPaths = jPaths.substring(index + ivyPath.length)
     }
-    // end to end
-    val jarPath = SparkSubmitUtils.resolveMavenCoordinates(
-      "com.databricks:spark-csv_2.10:0.1", None, Option(ivyPath), true)
-    assert(jarPath.indexOf(ivyPath) >= 0, "should use non-default ivy path")
+    val main = MavenCoordinate("my.awesome.lib", "mylib", "0.1")
+    IvyTestUtils.withRepository(main, None, None) { repo =>
+      // end to end
+      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, Option(repo),
+        Option(ivyPath), true)
+      assert(jarPath.indexOf(ivyPath) >= 0, "should use non-default ivy path")
+    }
   }
 
-  ignore("search for artifact at other repositories") {
-    val path = SparkSubmitUtils.resolveMavenCoordinates("com.agimatec:agimatec-validation:0.9.3",
-      Option("https://oss.sonatype.org/content/repositories/agimatec/"), None, true)
-    assert(path.indexOf("agimatec-validation") >= 0, "should find package. If it doesn't, check" +
-      "if package still exists. If it has been removed, replace the example in this test.")
+  test("search for artifact at local repositories") {
+    val main = new MavenCoordinate("my.awesome.lib", "mylib", "0.1")
+    // Local M2 repository
+    IvyTestUtils.withRepository(main, None, Some(SparkSubmitUtils.m2Path)) { repo =>
+      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None, None, true)
+      assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
+    }
+    // Local Ivy Repository
+    val settings = new IvySettings
+    val ivyLocal = new File(settings.getDefaultIvyUserDir, "local" + File.separator)
+    IvyTestUtils.withRepository(main, None, Some(ivyLocal), true) { repo =>
+      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None, None, true)
+      assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
+    }
+    // Local ivy repository with modified home
+    val dummyIvyPath = "dummy" + File.separator + "ivy"
+    val dummyIvyLocal = new File(dummyIvyPath, "local" + File.separator)
+    IvyTestUtils.withRepository(main, None, Some(dummyIvyLocal), true) { repo =>
+      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None,
+        Some(dummyIvyPath), true)
+      assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
+      assert(jarPath.indexOf(dummyIvyPath) >= 0, "should be in new ivy path")
+    }
   }
 
   test("dependency not found throws RuntimeException") {
@@ -117,7 +138,7 @@ class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll {
     }
   }
 
-  ignore("neglects Spark and Spark's dependencies") {
+  test("neglects Spark and Spark's dependencies") {
     val components = Seq("bagel_", "catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_",
       "sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_")
 
@@ -127,11 +148,11 @@ class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll {
 
     val path = SparkSubmitUtils.resolveMavenCoordinates(coordinates, None, None, true)
     assert(path === "", "should return empty path")
-    // Should not exclude the following dependency. Will throw an error, because it doesn't exist,
-    // but the fact that it is checking means that it wasn't excluded.
-    intercept[RuntimeException] {
-      SparkSubmitUtils.resolveMavenCoordinates(coordinates +
-        ",org.apache.spark:spark-streaming-kafka-assembly_2.10:1.2.0", None, None, true)
+    val main = MavenCoordinate("org.apache.spark", "spark-streaming-kafka-assembly_2.10", "1.2.0")
+    IvyTestUtils.withRepository(main, None, None) { repo =>
+      val files = SparkSubmitUtils.resolveMavenCoordinates(coordinates + "," + main.toString,
+        Some(repo), None, true)
+      assert(files.indexOf(main.artifactId) >= 0, "Did not return artifact")
     }
   }
 }

From 63a5ce75eac48a297751ac505d70ce4d47daf903 Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Fri, 22 May 2015 18:03:12 -0700
Subject: [PATCH 151/525] [SPARK-7830] [DOCS] [MLLIB] Adding logistic
 regression to the list of Multiclass Classification Supported Methods
 documentation

Added logistic regression to the list of Multiclass Classification Supported Methods in the MLlib Classification and Regression documentation, as it was missing.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6357 from dusenberrymw/Add_LR_To_List_Of_Multiclass_Classification_Methods and squashes the following commits:

7918650 [Mike Dusenberry] Updating broken link due to the "Binary Classification" section on the Linear Methods page being renamed to "Classification".
3005dc2 [Mike Dusenberry] Adding logistic regression to the list of Multiclass Classification Supported Methods in the MLlib Classification and Regression documentation, as it was missing.
---
 docs/mllib-classification-regression.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/mllib-classification-regression.md b/docs/mllib-classification-regression.md
index 8e91d62f4a907..0210950b89906 100644
--- a/docs/mllib-classification-regression.md
+++ b/docs/mllib-classification-regression.md
@@ -20,7 +20,7 @@ the supported algorithms for each type of problem.
       <td>Binary Classification</td><td>linear SVMs, logistic regression, decision trees, random forests, gradient-boosted trees, naive Bayes</td>
     </tr>
     <tr>
-      <td>Multiclass Classification</td><td>decision trees, random forests, naive Bayes</td>
+      <td>Multiclass Classification</td><td>logistic regression, decision trees, random forests, naive Bayes</td>
     </tr>
     <tr>
       <td>Regression</td><td>linear least squares, Lasso, ridge regression, decision trees, random forests, gradient-boosted trees, isotonic regression</td>
@@ -31,7 +31,7 @@ the supported algorithms for each type of problem.
 More details for these methods can be found here:
 
 * [Linear models](mllib-linear-methods.html)
-  * [binary classification (SVMs, logistic regression)](mllib-linear-methods.html#binary-classification)
+  * [classification (SVMs, logistic regression)](mllib-linear-methods.html#classification)
   * [linear regression (least squares, Lasso, ridge)](mllib-linear-methods.html#linear-least-squares-lasso-and-ridge-regression)
 * [Decision trees](mllib-decision-tree.html)
 * [Ensembles of decision trees](mllib-ensembles.html)

From a16357413d2823bcc1d1bf55b4da191dc9b1b69a Mon Sep 17 00:00:00 2001
From: Akshat Aranya <aaranya@quantcast.com>
Date: Fri, 22 May 2015 22:03:31 -0700
Subject: [PATCH 152/525] [SPARK-7795] [CORE] Speed up task scheduling in
 standalone mode by reusing serializer

My experiments with scheduling very short tasks in standalone cluster mode indicated that a significant amount of time was being spent in scheduling the tasks (>500ms for 256 tasks).  I found that most of the time was being spent in creating a new instance of serializer for each task.  Changing this to just one serializer brought down the scheduling time to 8ms.

Author: Akshat Aranya <aaranya@quantcast.com>

Closes #6323 from coolfrood/master and squashes the following commits:

12d8c9e [Akshat Aranya] Reduce visibility of serializer
bd4a5dd [Akshat Aranya] Style fix
0b8ca93 [Akshat Aranya] Incorporate review comments
fe530cd [Akshat Aranya] Speed up task scheduling in standalone mode by reusing serializer instead of creating a new one for each task.
---
 .../cluster/CoarseGrainedSchedulerBackend.scala      | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index f107148f3b8c6..c5bc6294a5577 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -69,6 +69,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
   class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
     extends ThreadSafeRpcEndpoint with Logging {
 
+    // If this DriverEndpoint is changed to support multiple threads,
+    // then this may need to be changed so that we don't share the serializer
+    // instance across threads
+    private val ser = SparkEnv.get.closureSerializer.newInstance()
+
     override protected def log = CoarseGrainedSchedulerBackend.this.log
 
     private val addressToExecutorId = new HashMap[RpcAddress, String]
@@ -163,7 +168,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
     }
 
     // Make fake resource offers on all executors
-    def makeOffers() {
+    private def makeOffers() {
       launchTasks(scheduler.resourceOffers(executorDataMap.map { case (id, executorData) =>
         new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
       }.toSeq))
@@ -175,16 +180,15 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
     }
 
     // Make fake resource offers on just one executor
-    def makeOffers(executorId: String) {
+    private def makeOffers(executorId: String) {
       val executorData = executorDataMap(executorId)
       launchTasks(scheduler.resourceOffers(
         Seq(new WorkerOffer(executorId, executorData.executorHost, executorData.freeCores))))
     }
 
     // Launch tasks returned by a set of resource offers
-    def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
+    private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
       for (task <- tasks.flatten) {
-        val ser = SparkEnv.get.closureSerializer.newInstance()
         val serializedTask = ser.serialize(task)
         if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
           val taskSetId = scheduler.taskIdToTaskSetId(task.taskId)

From 017b3404a50bd4b04ed73c5a69acb7b19a929822 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Fri, 22 May 2015 22:33:49 -0700
Subject: [PATCH 153/525] [MINOR] Add SparkR to create-release script

Enables the SparkR profiles for all the binary builds we create

cc pwendell

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6371 from shivaram/sparkr-create-release and squashes the following commits:

ca5a0b2 [Shivaram Venkataraman] Add -Psparkr to create-release.sh
---
 dev/create-release/create-release.sh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh
index af4f00054997c..54274a83f6d66 100755
--- a/dev/create-release/create-release.sh
+++ b/dev/create-release/create-release.sh
@@ -228,14 +228,14 @@ if [[ ! "$@" =~ --skip-package ]]; then
 
   # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds
   # share the same Zinc server.
-  make_binary_release "hadoop1" "-Phadoop-1 -Phive -Phive-thriftserver" "3030" &
-  make_binary_release "hadoop1-scala2.11" "-Phadoop-1 -Phive -Dscala-2.11" "3031" &
-  make_binary_release "cdh4" "-Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
-  make_binary_release "hadoop2.3" "-Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" &
-  make_binary_release "hadoop2.4" "-Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" &
-  make_binary_release "mapr3" "-Pmapr3 -Phive -Phive-thriftserver" "3035" &
-  make_binary_release "mapr4" "-Pmapr4 -Pyarn -Phive -Phive-thriftserver" "3036" &
-  make_binary_release "hadoop2.4-without-hive" "-Phadoop-2.4 -Pyarn" "3037" &
+  make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" &
+  make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" &
+  make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
+  make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" &
+  make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" &
+  make_binary_release "mapr3" "-Pmapr3 -Psparkr -Phive -Phive-thriftserver" "3035" &
+  make_binary_release "mapr4" "-Pmapr4 -Psparkr -Pyarn -Phive -Phive-thriftserver" "3036" &
+  make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" &
   wait
   rm -rf spark-$RELEASE_VERSION-bin-*/
 

From baa89838cca96fa091c9e5ce62be01e1a265d820 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Fri, 22 May 2015 23:05:54 -0700
Subject: [PATCH 154/525] [SPARK-7838] [STREAMING] Set scope for kinesis stream

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6369 from tdas/SPARK-7838 and squashes the following commits:

87d1c7f [Tathagata Das] Addressed comment
37775d8 [Tathagata Das] set scope for kinesis stream
---
 .../apache/spark/streaming/kinesis/KinesisUtils.scala    | 9 ++++++---
 .../org/apache/spark/streaming/StreamingContext.scala    | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
index b114bcff92d0f..2531aebe7813c 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
@@ -63,9 +63,12 @@ object KinesisUtils {
       checkpointInterval: Duration,
       storageLevel: StorageLevel
     ): ReceiverInputDStream[Array[Byte]] = {
-    ssc.receiverStream(
-      new KinesisReceiver(kinesisAppName, streamName, endpointUrl, validateRegion(regionName),
-        initialPositionInStream, checkpointInterval, storageLevel, None))
+    // Setting scope to override receiver stream's scope of "receiver stream"
+    ssc.withNamedScope("kinesis stream") {
+      ssc.receiverStream(
+        new KinesisReceiver(kinesisAppName, streamName, endpointUrl, validateRegion(regionName),
+          initialPositionInStream, checkpointInterval, storageLevel, None))
+    }
   }
 
   /**
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 7b77d447ce6df..5e58ed714829e 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -262,7 +262,7 @@ class StreamingContext private[streaming] (
    *
    * Note: Return statements are NOT allowed in the given body.
    */
-  private def withNamedScope[U](name: String)(body: => U): U = {
+  private[streaming] def withNamedScope[U](name: String)(body: => U): U = {
     RDDOperationScope.withScope(sc, name, allowNesting = false, ignoreParent = false)(body)
   }
 

From 368b8c2b5ed8b06b00ac87059f75915b13ba3b8d Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Fri, 22 May 2015 23:07:56 -0700
Subject: [PATCH 155/525] [HOTFIX] Add tests for SparkListenerApplicationStart
 with Driver Logs.

#6166 added the driver logs to `SparkListenerApplicationStart`. This  adds tests in `JsonProtocolSuite` to ensure we don't regress.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #6368 from harishreedharan/jsonprotocol-test and squashes the following commits:

dc9eafc [Hari Shreedharan] [HOTFIX] Add tests for SparkListenerApplicationStart with Driver Logs.
---
 .../apache/spark/util/JsonProtocolSuite.scala | 25 +++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
index 0c5221d10d79d..0d9126f23ccc5 100644
--- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
@@ -75,10 +75,12 @@ class JsonProtocolSuite extends FunSuite {
     val blockManagerRemoved = SparkListenerBlockManagerRemoved(2L,
       BlockManagerId("Scarce", "to be counted...", 100))
     val unpersistRdd = SparkListenerUnpersistRDD(12345)
+    val logUrlMap = Map("stderr" -> "mystderr", "stdout" -> "mystdout").toMap
     val applicationStart = SparkListenerApplicationStart("The winner of all", Some("appId"),
       42L, "Garfield", Some("appAttempt"))
+    val applicationStartWithLogs = SparkListenerApplicationStart("The winner of all", Some("appId"),
+      42L, "Garfield", Some("appAttempt"), Some(logUrlMap))
     val applicationEnd = SparkListenerApplicationEnd(42L)
-    val logUrlMap = Map("stderr" -> "mystderr", "stdout" -> "mystdout").toMap
     val executorAdded = SparkListenerExecutorAdded(executorAddedTime, "exec1",
       new ExecutorInfo("Hostee.awesome.com", 11, logUrlMap))
     val executorRemoved = SparkListenerExecutorRemoved(executorRemovedTime, "exec2", "test reason")
@@ -97,6 +99,7 @@ class JsonProtocolSuite extends FunSuite {
     testEvent(blockManagerRemoved, blockManagerRemovedJsonString)
     testEvent(unpersistRdd, unpersistRDDJsonString)
     testEvent(applicationStart, applicationStartJsonString)
+    testEvent(applicationStartWithLogs, applicationStartJsonWithLogUrlsString)
     testEvent(applicationEnd, applicationEndJsonString)
     testEvent(executorAdded, executorAddedJsonString)
     testEvent(executorRemoved, executorRemovedJsonString)
@@ -277,10 +280,12 @@ class JsonProtocolSuite extends FunSuite {
   test("SparkListenerApplicationStart backwards compatibility") {
     // SparkListenerApplicationStart in Spark 1.0.0 do not have an "appId" property.
     // SparkListenerApplicationStart pre-Spark 1.4 does not have "appAttemptId".
-    val applicationStart = SparkListenerApplicationStart("test", None, 1L, "user", None)
+    // SparkListenerApplicationStart pre-Spark 1.5 does not have "driverLogs
+    val applicationStart = SparkListenerApplicationStart("test", None, 1L, "user", None, None)
     val oldEvent = JsonProtocol.applicationStartToJson(applicationStart)
       .removeField({ _._1 == "App ID" })
       .removeField({ _._1 == "App Attempt ID" })
+      .removeField({ _._1 == "Driver Logs"})
     assert(applicationStart === JsonProtocol.applicationStartFromJson(oldEvent))
   }
 
@@ -1544,6 +1549,22 @@ class JsonProtocolSuite extends FunSuite {
       |}
     """
 
+  private val applicationStartJsonWithLogUrlsString =
+    """
+      |{
+      |  "Event": "SparkListenerApplicationStart",
+      |  "App Name": "The winner of all",
+      |  "App ID": "appId",
+      |  "Timestamp": 42,
+      |  "User": "Garfield",
+      |  "App Attempt ID": "appAttempt",
+      |  "Driver Logs" : {
+      |      "stderr" : "mystderr",
+      |      "stdout" : "mystdout"
+      |  }
+      |}
+    """
+
   private val applicationEndJsonString =
     """
       |{

From 4583cf4be17155c68178155acf6866d7cc8f7df0 Mon Sep 17 00:00:00 2001
From: GenTang <gen.tang86@gmail.com>
Date: Fri, 22 May 2015 23:37:03 -0700
Subject: [PATCH 156/525] [SPARK-5090] [EXAMPLES] The improvement of python
 converter for hbase

Hi,

Following the discussion in http://apache-spark-developers-list.1001551.n3.nabble.com/python-converter-in-HBaseConverter-scala-spark-examples-td10001.html. I made some modification in three files in package examples:
1. HBaseConverters.scala: the new converter will converts all the records in an hbase results into a single string
2. hbase_input.py: as the value string may contain several records, we can use ast package to convert the string into dict
3. HBaseTest.scala: as the package examples use hbase 0.98.7 the original constructor HTableDescriptor is deprecated. The updation to new constructor is made

Author: GenTang <gen.tang86@gmail.com>

Closes #3920 from GenTang/master and squashes the following commits:

d2153df [GenTang] import JSONObject precisely
4802481 [GenTang] dump the result into a singl String
62df7f0 [GenTang] remove the comment
21de653 [GenTang] return the string in json format
15b1fe3 [GenTang] the modification of comments
5cbbcfc [GenTang] the improvement of pythonconverter
ceb31c5 [GenTang] the modification for adapting updation of hbase
3253b61 [GenTang] the modification accompanying the improvement of pythonconverter
---
 examples/src/main/python/hbase_inputformat.py | 21 ++++++++++++-------
 .../pythonconverters/HBaseConverters.scala    | 20 +++++++++++++++---
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/examples/src/main/python/hbase_inputformat.py b/examples/src/main/python/hbase_inputformat.py
index 5b82a14fba413..c5ae5d043b8ea 100644
--- a/examples/src/main/python/hbase_inputformat.py
+++ b/examples/src/main/python/hbase_inputformat.py
@@ -18,6 +18,7 @@
 from __future__ import print_function
 
 import sys
+import json
 
 from pyspark import SparkContext
 
@@ -27,24 +28,24 @@
 hbase(main):016:0> create 'test', 'f1'
 0 row(s) in 1.0430 seconds
 
-hbase(main):017:0> put 'test', 'row1', 'f1', 'value1'
+hbase(main):017:0> put 'test', 'row1', 'f1:a', 'value1'
 0 row(s) in 0.0130 seconds
 
-hbase(main):018:0> put 'test', 'row2', 'f1', 'value2'
+hbase(main):018:0> put 'test', 'row1', 'f1:b', 'value2'
 0 row(s) in 0.0030 seconds
 
-hbase(main):019:0> put 'test', 'row3', 'f1', 'value3'
+hbase(main):019:0> put 'test', 'row2', 'f1', 'value3'
 0 row(s) in 0.0050 seconds
 
-hbase(main):020:0> put 'test', 'row4', 'f1', 'value4'
+hbase(main):020:0> put 'test', 'row3', 'f1', 'value4'
 0 row(s) in 0.0110 seconds
 
 hbase(main):021:0> scan 'test'
 ROW                           COLUMN+CELL
- row1                         column=f1:, timestamp=1401883411986, value=value1
- row2                         column=f1:, timestamp=1401883415212, value=value2
- row3                         column=f1:, timestamp=1401883417858, value=value3
- row4                         column=f1:, timestamp=1401883420805, value=value4
+ row1                         column=f1:a, timestamp=1401883411986, value=value1
+ row1                         column=f1:b, timestamp=1401883415212, value=value2
+ row2                         column=f1:, timestamp=1401883417858, value=value3
+ row3                         column=f1:, timestamp=1401883420805, value=value4
 4 row(s) in 0.0240 seconds
 """
 if __name__ == "__main__":
@@ -64,6 +65,8 @@
     table = sys.argv[2]
     sc = SparkContext(appName="HBaseInputFormat")
 
+    # Other options for configuring scan behavior are available. More information available at
+    # https://github.com/apache/hbase/blob/master/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormat.java
     conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": table}
     if len(sys.argv) > 3:
         conf = {"hbase.zookeeper.quorum": host, "zookeeper.znode.parent": sys.argv[3],
@@ -78,6 +81,8 @@
         keyConverter=keyConv,
         valueConverter=valueConv,
         conf=conf)
+    hbase_rdd = hbase_rdd.flatMapValues(lambda v: v.split("\n")).mapValues(json.loads)
+
     output = hbase_rdd.collect()
     for (k, v) in output:
         print((k, v))
diff --git a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/HBaseConverters.scala b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/HBaseConverters.scala
index 273bee0a8b30f..90d48a64106c7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/HBaseConverters.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/HBaseConverters.scala
@@ -18,20 +18,34 @@
 package org.apache.spark.examples.pythonconverters
 
 import scala.collection.JavaConversions._
+import scala.util.parsing.json.JSONObject
 
 import org.apache.spark.api.python.Converter
 import org.apache.hadoop.hbase.client.{Put, Result}
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
+import org.apache.hadoop.hbase.KeyValue.Type
+import org.apache.hadoop.hbase.CellUtil
 
 /**
- * Implementation of [[org.apache.spark.api.python.Converter]] that converts an
- * HBase Result to a String
+ * Implementation of [[org.apache.spark.api.python.Converter]] that converts all
+ * the records in an HBase Result to a String
  */
 class HBaseResultToStringConverter extends Converter[Any, String] {
   override def convert(obj: Any): String = {
+    import collection.JavaConverters._
     val result = obj.asInstanceOf[Result]
-    Bytes.toStringBinary(result.value())
+    val output = result.listCells.asScala.map(cell =>
+        Map(
+          "row" -> Bytes.toStringBinary(CellUtil.cloneRow(cell)),
+          "columnFamily" -> Bytes.toStringBinary(CellUtil.cloneFamily(cell)),
+          "qualifier" -> Bytes.toStringBinary(CellUtil.cloneQualifier(cell)),
+          "timestamp" -> cell.getTimestamp.toString,
+          "type" -> Type.codeToType(cell.getTypeByte).toString,
+          "value" -> Bytes.toStringBinary(CellUtil.cloneValue(cell))
+        )
+    )
+    output.map(JSONObject(_).toString()).mkString("\n")
   }
 }
 

From 7af3818c6b2bf35bfa531ab7cc3a4a714385015e Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Sat, 23 May 2015 00:00:30 -0700
Subject: [PATCH 157/525] [SPARK-6806] [SPARKR] [DOCS] Fill in SparkR examples
 in programming guide

sqlCtx -> sqlContext

You can check the docs by:

```
$ cd docs
$ SKIP_SCALADOC=1 jekyll serve
```
cc shivaram

Author: Davies Liu <davies@databricks.com>

Closes #5442 from davies/r_docs and squashes the following commits:

7a12ec6 [Davies Liu] remove rdd in R docs
8496b26 [Davies Liu] remove the docs related to RDD
e23b9d6 [Davies Liu] delete R docs for RDD API
222e4ff [Davies Liu] Merge branch 'master' into r_docs
89684ce [Davies Liu] Merge branch 'r_docs' of github.com:davies/spark into r_docs
f0a10e1 [Davies Liu] address comments from @shivaram
f61de71 [Davies Liu] Update pairRDD.R
3ef7cf3 [Davies Liu] use + instead of function(a,b) a+b
2f10a77 [Davies Liu] address comments from @cafreeman
9c2a062 [Davies Liu] mention R api together with Python API
23f751a [Davies Liu] Fill in SparkR examples in programming guide
---
 R/README.md                      |   4 +-
 R/pkg/R/DataFrame.R              | 176 +++++++--------
 R/pkg/R/RDD.R                    |   2 +-
 R/pkg/R/SQLContext.R             | 165 +++++++-------
 R/pkg/R/pairRDD.R                |   4 +-
 R/pkg/R/sparkR.R                 |  10 +-
 R/pkg/inst/profile/shell.R       |   6 +-
 R/pkg/inst/tests/test_sparkSQL.R | 156 ++++++-------
 docs/_plugins/copy_api_dirs.rb   |  68 +++---
 docs/api.md                      |   3 +-
 docs/index.md                    |  23 +-
 docs/programming-guide.md        |  21 +-
 docs/quick-start.md              |  18 +-
 docs/sql-programming-guide.md    | 373 ++++++++++++++++++++++++++++++-
 14 files changed, 706 insertions(+), 323 deletions(-)

diff --git a/R/README.md b/R/README.md
index a6970e39b55f3..d7d65b4f0eca5 100644
--- a/R/README.md
+++ b/R/README.md
@@ -52,7 +52,7 @@ The SparkR documentation (Rd files and HTML files) are not a part of the source
 SparkR comes with several sample programs in the `examples/src/main/r` directory.
 To run one of them, use `./bin/sparkR <filename> <args>`. For example:
 
-    ./bin/sparkR examples/src/main/r/pi.R local[2]
+    ./bin/sparkR examples/src/main/r/dataframe.R
 
 You can also run the unit-tests for SparkR by running (you need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first):
 
@@ -63,5 +63,5 @@ You can also run the unit-tests for SparkR by running (you need to install the [
 The `./bin/spark-submit` and `./bin/sparkR` can also be used to submit jobs to YARN clusters. You will need to set YARN conf dir before doing so. For example on CDH you can run
 ```
 export YARN_CONF_DIR=/etc/hadoop/conf
-./bin/spark-submit --master yarn examples/src/main/r/pi.R 4
+./bin/spark-submit --master yarn examples/src/main/r/dataframe.R
 ```
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index a7fa32e291fb1..ed8093c80d360 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -65,9 +65,9 @@ dataFrame <- function(sdf, isCached = FALSE) {
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' printSchema(df)
 #'}
 setMethod("printSchema",
@@ -88,9 +88,9 @@ setMethod("printSchema",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' dfSchema <- schema(df)
 #'}
 setMethod("schema",
@@ -110,9 +110,9 @@ setMethod("schema",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' explain(df, TRUE)
 #'}
 setMethod("explain",
@@ -139,9 +139,9 @@ setMethod("explain",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' isLocal(df)
 #'}
 setMethod("isLocal",
@@ -162,9 +162,9 @@ setMethod("isLocal",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' showDF(df)
 #'}
 setMethod("showDF",
@@ -185,9 +185,9 @@ setMethod("showDF",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' df
 #'}
 setMethod("show", "DataFrame",
@@ -210,9 +210,9 @@ setMethod("show", "DataFrame",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' dtypes(df)
 #'}
 setMethod("dtypes",
@@ -234,9 +234,9 @@ setMethod("dtypes",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' columns(df)
 #'}
 setMethod("columns",
@@ -267,11 +267,11 @@ setMethod("names",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' registerTempTable(df, "json_df")
-#' new_df <- sql(sqlCtx, "SELECT * FROM json_df")
+#' new_df <- sql(sqlContext, "SELECT * FROM json_df")
 #'}
 setMethod("registerTempTable",
           signature(x = "DataFrame", tableName = "character"),
@@ -293,9 +293,9 @@ setMethod("registerTempTable",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' df <- read.df(sqlCtx, path, "parquet")
-#' df2 <- read.df(sqlCtx, path2, "parquet")
+#' sqlContext <- sparkRSQL.init(sc)
+#' df <- read.df(sqlContext, path, "parquet")
+#' df2 <- read.df(sqlContext, path2, "parquet")
 #' registerTempTable(df, "table1")
 #' insertInto(df2, "table1", overwrite = TRUE)
 #'}
@@ -316,9 +316,9 @@ setMethod("insertInto",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' cache(df)
 #'}
 setMethod("cache",
@@ -341,9 +341,9 @@ setMethod("cache",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' persist(df, "MEMORY_AND_DISK")
 #'}
 setMethod("persist",
@@ -366,9 +366,9 @@ setMethod("persist",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' persist(df, "MEMORY_AND_DISK")
 #' unpersist(df)
 #'}
@@ -391,9 +391,9 @@ setMethod("unpersist",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' newDF <- repartition(df, 2L)
 #'}
 setMethod("repartition",
@@ -415,9 +415,9 @@ setMethod("repartition",
 # @examples
 #\dontrun{
 # sc <- sparkR.init()
-# sqlCtx <- sparkRSQL.init(sc)
+# sqlContext <- sparkRSQL.init(sc)
 # path <- "path/to/file.json"
-# df <- jsonFile(sqlCtx, path)
+# df <- jsonFile(sqlContext, path)
 # newRDD <- toJSON(df)
 #}
 setMethod("toJSON",
@@ -440,9 +440,9 @@ setMethod("toJSON",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' saveAsParquetFile(df, "/tmp/sparkr-tmp/")
 #'}
 setMethod("saveAsParquetFile",
@@ -461,9 +461,9 @@ setMethod("saveAsParquetFile",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' distinctDF <- distinct(df)
 #'}
 setMethod("distinct",
@@ -486,9 +486,9 @@ setMethod("distinct",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' collect(sample(df, FALSE, 0.5)) 
 #' collect(sample(df, TRUE, 0.5))
 #'}
@@ -523,9 +523,9 @@ setMethod("sample_frac",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' count(df)
 #' }
 setMethod("count",
@@ -545,9 +545,9 @@ setMethod("count",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' collected <- collect(df)
 #' firstName <- collected[[1]]$name
 #' }
@@ -580,9 +580,9 @@ setMethod("collect",
 #' @examples
 #' \dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' limitedDF <- limit(df, 10)
 #' }
 setMethod("limit",
@@ -599,9 +599,9 @@ setMethod("limit",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' take(df, 2)
 #' }
 setMethod("take",
@@ -626,9 +626,9 @@ setMethod("take",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' head(df)
 #' }
 setMethod("head",
@@ -647,9 +647,9 @@ setMethod("head",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' first(df)
 #' }
 setMethod("first",
@@ -669,9 +669,9 @@ setMethod("first",
 # @examples
 #\dontrun{
 # sc <- sparkR.init()
-# sqlCtx <- sparkRSQL.init(sc)
+# sqlContext <- sparkRSQL.init(sc)
 # path <- "path/to/file.json"
-# df <- jsonFile(sqlCtx, path)
+# df <- jsonFile(sqlContext, path)
 # rdd <- toRDD(df)
 # }
 setMethod("toRDD",
@@ -938,9 +938,9 @@ setMethod("select",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' selectExpr(df, "col1", "(col2 * 5) as newCol")
 #' }
 setMethod("selectExpr",
@@ -964,9 +964,9 @@ setMethod("selectExpr",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' newDF <- withColumn(df, "newCol", df$col1 * 5)
 #' }
 setMethod("withColumn",
@@ -988,9 +988,9 @@ setMethod("withColumn",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' newDF <- mutate(df, newCol = df$col1 * 5, newCol2 = df$col1 * 2)
 #' names(newDF) # Will contain newCol, newCol2
 #' }
@@ -1024,9 +1024,9 @@ setMethod("mutate",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' newDF <- withColumnRenamed(df, "col1", "newCol1")
 #' }
 setMethod("withColumnRenamed",
@@ -1055,9 +1055,9 @@ setMethod("withColumnRenamed",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' newDF <- rename(df, col1 = df$newCol1)
 #' }
 setMethod("rename",
@@ -1095,9 +1095,9 @@ setClassUnion("characterOrColumn", c("character", "Column"))
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' arrange(df, df$col1)
 #' arrange(df, "col1")
 #' arrange(df, asc(df$col1), desc(abs(df$col2)))
@@ -1137,9 +1137,9 @@ setMethod("orderBy",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' filter(df, "col1 > 0")
 #' filter(df, df$col2 != "abcdefg")
 #' }
@@ -1177,9 +1177,9 @@ setMethod("where",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlCtx, path)
-#' df2 <- jsonFile(sqlCtx, path2)
+#' sqlContext <- sparkRSQL.init(sc)
+#' df1 <- jsonFile(sqlContext, path)
+#' df2 <- jsonFile(sqlContext, path2)
 #' join(df1, df2) # Performs a Cartesian
 #' join(df1, df2, df1$col1 == df2$col2) # Performs an inner join based on expression
 #' join(df1, df2, df1$col1 == df2$col2, "right_outer")
@@ -1219,9 +1219,9 @@ setMethod("join",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlCtx, path)
-#' df2 <- jsonFile(sqlCtx, path2)
+#' sqlContext <- sparkRSQL.init(sc)
+#' df1 <- jsonFile(sqlContext, path)
+#' df2 <- jsonFile(sqlContext, path2)
 #' unioned <- unionAll(df, df2)
 #' }
 setMethod("unionAll",
@@ -1244,9 +1244,9 @@ setMethod("unionAll",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlCtx, path)
-#' df2 <- jsonFile(sqlCtx, path2)
+#' sqlContext <- sparkRSQL.init(sc)
+#' df1 <- jsonFile(sqlContext, path)
+#' df2 <- jsonFile(sqlContext, path2)
 #' intersectDF <- intersect(df, df2)
 #' }
 setMethod("intersect",
@@ -1269,9 +1269,9 @@ setMethod("intersect",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlCtx, path)
-#' df2 <- jsonFile(sqlCtx, path2)
+#' sqlContext <- sparkRSQL.init(sc)
+#' df1 <- jsonFile(sqlContext, path)
+#' df2 <- jsonFile(sqlContext, path2)
 #' exceptDF <- except(df, df2)
 #' }
 #' @rdname except
@@ -1308,9 +1308,9 @@ setMethod("except",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' write.df(df, "myfile", "parquet", "overwrite")
 #' }
 setMethod("write.df",
@@ -1318,8 +1318,8 @@ setMethod("write.df",
                     mode = 'character'),
           function(df, path = NULL, source = NULL, mode = "append", ...){
             if (is.null(source)) {
-              sqlCtx <- get(".sparkRSQLsc", envir = .sparkREnv)
-              source <- callJMethod(sqlCtx, "getConf", "spark.sql.sources.default",
+              sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
+              source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
                                     "org.apache.spark.sql.parquet")
             }
             allModes <- c("append", "overwrite", "error", "ignore")
@@ -1371,9 +1371,9 @@ setMethod("saveDF",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' saveAsTable(df, "myfile")
 #' }
 setMethod("saveAsTable",
@@ -1381,8 +1381,8 @@ setMethod("saveAsTable",
                     mode = 'character'),
           function(df, tableName, source = NULL, mode="append", ...){
             if (is.null(source)) {
-              sqlCtx <- get(".sparkRSQLsc", envir = .sparkREnv)
-              source <- callJMethod(sqlCtx, "getConf", "spark.sql.sources.default",
+              sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
+              source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
                                     "org.apache.spark.sql.parquet")
             }
             allModes <- c("append", "overwrite", "error", "ignore")
@@ -1408,9 +1408,9 @@ setMethod("saveAsTable",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' describe(df)
 #' describe(df, "col1")
 #' describe(df, "col1", "col2")
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index d3a68fff780ce..0513299515644 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -239,7 +239,7 @@ setMethod("cache",
 # @aliases persist,RDD-method
 setMethod("persist",
           signature(x = "RDD", newLevel = "character"),
-          function(x, newLevel) {
+          function(x, newLevel = "MEMORY_ONLY") {
             callJMethod(getJRDD(x), "persist", getStorageLevel(newLevel))
             x@env$isCached <- TRUE
             x
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 531442e8459e4..36cc612875879 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -69,7 +69,7 @@ infer_type <- function(x) {
 #'
 #' Converts an RDD to a DataFrame by infer the types.
 #'
-#' @param sqlCtx A SQLContext
+#' @param sqlContext A SQLContext
 #' @param data An RDD or list or data.frame
 #' @param schema a list of column names or named list (StructType), optional
 #' @return an DataFrame
@@ -77,13 +77,13 @@ infer_type <- function(x) {
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' rdd <- lapply(parallelize(sc, 1:10), function(x) list(a=x, b=as.character(x)))
-#' df <- createDataFrame(sqlCtx, rdd)
+#' df <- createDataFrame(sqlContext, rdd)
 #' }
 
 # TODO(davies): support sampling and infer type from NA
-createDataFrame <- function(sqlCtx, data, schema = NULL, samplingRatio = 1.0) {
+createDataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0) {
   if (is.data.frame(data)) {
       # get the names of columns, they will be put into RDD
       schema <- names(data)
@@ -102,7 +102,7 @@ createDataFrame <- function(sqlCtx, data, schema = NULL, samplingRatio = 1.0) {
       })
   }
   if (is.list(data)) {
-    sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sqlCtx)
+    sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sqlContext)
     rdd <- parallelize(sc, data)
   } else if (inherits(data, "RDD")) {
     rdd <- data
@@ -146,7 +146,7 @@ createDataFrame <- function(sqlCtx, data, schema = NULL, samplingRatio = 1.0) {
   jrdd <- getJRDD(lapply(rdd, function(x) x), "row")
   srdd <- callJMethod(jrdd, "rdd")
   sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "createDF",
-                     srdd, schema$jobj, sqlCtx)
+                     srdd, schema$jobj, sqlContext)
   dataFrame(sdf)
 }
 
@@ -161,7 +161,7 @@ createDataFrame <- function(sqlCtx, data, schema = NULL, samplingRatio = 1.0) {
 # @examples
 #\dontrun{
 # sc <- sparkR.init()
-# sqlCtx <- sparkRSQL.init(sc)
+# sqlContext <- sparkRSQL.init(sc)
 # rdd <- lapply(parallelize(sc, 1:10), function(x) list(a=x, b=as.character(x)))
 # df <- toDF(rdd)
 # }
@@ -170,14 +170,14 @@ setGeneric("toDF", function(x, ...) { standardGeneric("toDF") })
 
 setMethod("toDF", signature(x = "RDD"),
           function(x, ...) {
-            sqlCtx <- if (exists(".sparkRHivesc", envir = .sparkREnv)) {
+            sqlContext <- if (exists(".sparkRHivesc", envir = .sparkREnv)) {
               get(".sparkRHivesc", envir = .sparkREnv)
             } else if (exists(".sparkRSQLsc", envir = .sparkREnv)) {
               get(".sparkRSQLsc", envir = .sparkREnv)
             } else {
               stop("no SQL context available")
             }
-            createDataFrame(sqlCtx, x, ...)
+            createDataFrame(sqlContext, x, ...)
           })
 
 #' Create a DataFrame from a JSON file.
@@ -185,24 +185,24 @@ setMethod("toDF", signature(x = "RDD"),
 #' Loads a JSON file (one object per line), returning the result as a DataFrame 
 #' It goes through the entire dataset once to determine the schema.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param path Path of file to read. A vector of multiple paths is allowed.
 #' @return DataFrame
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' }
 
-jsonFile <- function(sqlCtx, path) {
+jsonFile <- function(sqlContext, path) {
   # Allow the user to have a more flexible definiton of the text file path
   path <- normalizePath(path)
   # Convert a string vector of paths to a string containing comma separated paths
   path <- paste(path, collapse = ",")
-  sdf <- callJMethod(sqlCtx, "jsonFile", path)
+  sdf <- callJMethod(sqlContext, "jsonFile", path)
   dataFrame(sdf)
 }
 
@@ -211,7 +211,7 @@ jsonFile <- function(sqlCtx, path) {
 #
 # Loads an RDD storing one JSON object per string as a DataFrame.
 #
-# @param sqlCtx SQLContext to use
+# @param sqlContext SQLContext to use
 # @param rdd An RDD of JSON string
 # @param schema A StructType object to use as schema
 # @param samplingRatio The ratio of simpling used to infer the schema
@@ -220,16 +220,16 @@ jsonFile <- function(sqlCtx, path) {
 # @examples
 #\dontrun{
 # sc <- sparkR.init()
-# sqlCtx <- sparkRSQL.init(sc)
+# sqlContext <- sparkRSQL.init(sc)
 # rdd <- texFile(sc, "path/to/json")
-# df <- jsonRDD(sqlCtx, rdd)
+# df <- jsonRDD(sqlContext, rdd)
 # }
 
 # TODO: support schema
-jsonRDD <- function(sqlCtx, rdd, schema = NULL, samplingRatio = 1.0) {
+jsonRDD <- function(sqlContext, rdd, schema = NULL, samplingRatio = 1.0) {
   rdd <- serializeToString(rdd)
   if (is.null(schema)) {
-    sdf <- callJMethod(sqlCtx, "jsonRDD", callJMethod(getJRDD(rdd), "rdd"), samplingRatio)
+    sdf <- callJMethod(sqlContext, "jsonRDD", callJMethod(getJRDD(rdd), "rdd"), samplingRatio)
     dataFrame(sdf)
   } else {
     stop("not implemented")
@@ -241,64 +241,63 @@ jsonRDD <- function(sqlCtx, rdd, schema = NULL, samplingRatio = 1.0) {
 #' 
 #' Loads a Parquet file, returning the result as a DataFrame.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param ... Path(s) of parquet file(s) to read.
 #' @return DataFrame
 #' @export
 
 # TODO: Implement saveasParquetFile and write examples for both
-parquetFile <- function(sqlCtx, ...) {
+parquetFile <- function(sqlContext, ...) {
   # Allow the user to have a more flexible definiton of the text file path
   paths <- lapply(list(...), normalizePath)
-  sdf <- callJMethod(sqlCtx, "parquetFile", paths)
+  sdf <- callJMethod(sqlContext, "parquetFile", paths)
   dataFrame(sdf)
 }
 
 #' SQL Query
-#' 
+#'
 #' Executes a SQL query using Spark, returning the result as a DataFrame.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param sqlQuery A character vector containing the SQL query
 #' @return DataFrame
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' registerTempTable(df, "table")
-#' new_df <- sql(sqlCtx, "SELECT * FROM table")
+#' new_df <- sql(sqlContext, "SELECT * FROM table")
 #' }
 
-sql <- function(sqlCtx, sqlQuery) {
-  sdf <- callJMethod(sqlCtx, "sql", sqlQuery)
-  dataFrame(sdf)
+sql <- function(sqlContext, sqlQuery) {
+ sdf <- callJMethod(sqlContext, "sql", sqlQuery)
+ dataFrame(sdf)
 }
 
-
 #' Create a DataFrame from a SparkSQL Table
 #' 
 #' Returns the specified Table as a DataFrame.  The Table must have already been registered
 #' in the SQLContext.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param tableName The SparkSQL Table to convert to a DataFrame.
 #' @return DataFrame
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' registerTempTable(df, "table")
-#' new_df <- table(sqlCtx, "table")
+#' new_df <- table(sqlContext, "table")
 #' }
 
-table <- function(sqlCtx, tableName) {
-  sdf <- callJMethod(sqlCtx, "table", tableName)
+table <- function(sqlContext, tableName) {
+  sdf <- callJMethod(sqlContext, "table", tableName)
   dataFrame(sdf) 
 }
 
@@ -307,22 +306,22 @@ table <- function(sqlCtx, tableName) {
 #'
 #' Returns a DataFrame containing names of tables in the given database.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param databaseName name of the database
 #' @return a DataFrame
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' tables(sqlCtx, "hive")
+#' sqlContext <- sparkRSQL.init(sc)
+#' tables(sqlContext, "hive")
 #' }
 
-tables <- function(sqlCtx, databaseName = NULL) {
+tables <- function(sqlContext, databaseName = NULL) {
   jdf <- if (is.null(databaseName)) {
-    callJMethod(sqlCtx, "tables")
+    callJMethod(sqlContext, "tables")
   } else {
-    callJMethod(sqlCtx, "tables", databaseName)
+    callJMethod(sqlContext, "tables", databaseName)
   }
   dataFrame(jdf)
 }
@@ -332,22 +331,22 @@ tables <- function(sqlCtx, databaseName = NULL) {
 #'
 #' Returns the names of tables in the given database as an array.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param databaseName name of the database
 #' @return a list of table names
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' tableNames(sqlCtx, "hive")
+#' sqlContext <- sparkRSQL.init(sc)
+#' tableNames(sqlContext, "hive")
 #' }
 
-tableNames <- function(sqlCtx, databaseName = NULL) {
+tableNames <- function(sqlContext, databaseName = NULL) {
   if (is.null(databaseName)) {
-    callJMethod(sqlCtx, "tableNames")
+    callJMethod(sqlContext, "tableNames")
   } else {
-    callJMethod(sqlCtx, "tableNames", databaseName)
+    callJMethod(sqlContext, "tableNames", databaseName)
   }
 }
 
@@ -356,58 +355,58 @@ tableNames <- function(sqlCtx, databaseName = NULL) {
 #' 
 #' Caches the specified table in-memory.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param tableName The name of the table being cached
 #' @return DataFrame
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' registerTempTable(df, "table")
-#' cacheTable(sqlCtx, "table")
+#' cacheTable(sqlContext, "table")
 #' }
 
-cacheTable <- function(sqlCtx, tableName) {
-  callJMethod(sqlCtx, "cacheTable", tableName)  
+cacheTable <- function(sqlContext, tableName) {
+  callJMethod(sqlContext, "cacheTable", tableName)  
 }
 
 #' Uncache Table
 #' 
 #' Removes the specified table from the in-memory cache.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param tableName The name of the table being uncached
 #' @return DataFrame
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- jsonFile(sqlContext, path)
 #' registerTempTable(df, "table")
-#' uncacheTable(sqlCtx, "table")
+#' uncacheTable(sqlContext, "table")
 #' }
 
-uncacheTable <- function(sqlCtx, tableName) {
-  callJMethod(sqlCtx, "uncacheTable", tableName)
+uncacheTable <- function(sqlContext, tableName) {
+  callJMethod(sqlContext, "uncacheTable", tableName)
 }
 
 #' Clear Cache
 #'
 #' Removes all cached tables from the in-memory cache.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @examples
 #' \dontrun{
-#' clearCache(sqlCtx)
+#' clearCache(sqlContext)
 #' }
 
-clearCache <- function(sqlCtx) {
-  callJMethod(sqlCtx, "clearCache")
+clearCache <- function(sqlContext) {
+  callJMethod(sqlContext, "clearCache")
 }
 
 #' Drop Temporary Table
@@ -415,22 +414,22 @@ clearCache <- function(sqlCtx) {
 #' Drops the temporary table with the given table name in the catalog.
 #' If the table has been cached/persisted before, it's also unpersisted.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param tableName The name of the SparkSQL table to be dropped.
 #' @examples
 #' \dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' df <- read.df(sqlCtx, path, "parquet")
+#' sqlContext <- sparkRSQL.init(sc)
+#' df <- read.df(sqlContext, path, "parquet")
 #' registerTempTable(df, "table")
-#' dropTempTable(sqlCtx, "table")
+#' dropTempTable(sqlContext, "table")
 #' }
 
-dropTempTable <- function(sqlCtx, tableName) {
+dropTempTable <- function(sqlContext, tableName) {
   if (class(tableName) != "character") {
     stop("tableName must be a string.")
   }
-  callJMethod(sqlCtx, "dropTempTable", tableName)
+  callJMethod(sqlContext, "dropTempTable", tableName)
 }
 
 #' Load an DataFrame
@@ -441,7 +440,7 @@ dropTempTable <- function(sqlCtx, tableName) {
 #' If `source` is not specified, the default data source configured by
 #' "spark.sql.sources.default" will be used.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param path The path of files to load
 #' @param source the name of external data source
 #' @return DataFrame
@@ -449,24 +448,24 @@ dropTempTable <- function(sqlCtx, tableName) {
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' df <- read.df(sqlCtx, "path/to/file.json", source = "json")
+#' sqlContext <- sparkRSQL.init(sc)
+#' df <- read.df(sqlContext, "path/to/file.json", source = "json")
 #' }
 
-read.df <- function(sqlCtx, path = NULL, source = NULL, ...) {
+read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
     options[['path']] <- path
   }
-  sdf <- callJMethod(sqlCtx, "load", source, options)
+  sdf <- callJMethod(sqlContext, "load", source, options)
   dataFrame(sdf)
 }
 
 #' @aliases loadDF
 #' @export
 
-loadDF <- function(sqlCtx, path = NULL, source = NULL, ...) {
-  read.df(sqlCtx, path, source, ...)
+loadDF <- function(sqlContext, path = NULL, source = NULL, ...) {
+  read.df(sqlContext, path, source, ...)
 }
 
 #' Create an external table
@@ -478,7 +477,7 @@ loadDF <- function(sqlCtx, path = NULL, source = NULL, ...) {
 #' If `source` is not specified, the default data source configured by
 #' "spark.sql.sources.default" will be used.
 #'
-#' @param sqlCtx SQLContext to use
+#' @param sqlContext SQLContext to use
 #' @param tableName A name of the table
 #' @param path The path of files to load
 #' @param source the name of external data source
@@ -487,15 +486,15 @@ loadDF <- function(sqlCtx, path = NULL, source = NULL, ...) {
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' df <- sparkRSQL.createExternalTable(sqlCtx, "myjson", path="path/to/json", source="json")
+#' sqlContext <- sparkRSQL.init(sc)
+#' df <- sparkRSQL.createExternalTable(sqlContext, "myjson", path="path/to/json", source="json")
 #' }
 
-createExternalTable <- function(sqlCtx, tableName, path = NULL, source = NULL, ...) {
+createExternalTable <- function(sqlContext, tableName, path = NULL, source = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
     options[['path']] <- path
   }
-  sdf <- callJMethod(sqlCtx, "createExternalTable", tableName, source, options)
+  sdf <- callJMethod(sqlContext, "createExternalTable", tableName, source, options)
   dataFrame(sdf)
 }
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
index 7694652856da5..1e24286dbcae2 100644
--- a/R/pkg/R/pairRDD.R
+++ b/R/pkg/R/pairRDD.R
@@ -329,7 +329,7 @@ setMethod("reduceByKey",
               convertEnvsToList(keys, vals)
             }
             locallyReduced <- lapplyPartition(x, reduceVals)
-            shuffled <- partitionBy(locallyReduced, numPartitions)
+            shuffled <- partitionBy(locallyReduced, numToInt(numPartitions))
             lapplyPartition(shuffled, reduceVals)
           })
 
@@ -436,7 +436,7 @@ setMethod("combineByKey",
               convertEnvsToList(keys, combiners)
             }
             locallyCombined <- lapplyPartition(x, combineLocally)
-            shuffled <- partitionBy(locallyCombined, numPartitions)
+            shuffled <- partitionBy(locallyCombined, numToInt(numPartitions))
             mergeAfterShuffle <- function(part) {
               combiners <- new.env()
               keys <- new.env()
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index bc82df01f0fff..68387f0f5365d 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -222,7 +222,7 @@ sparkR.init <- function(
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
+#' sqlContext <- sparkRSQL.init(sc)
 #'}
 
 sparkRSQL.init <- function(jsc) {
@@ -230,11 +230,11 @@ sparkRSQL.init <- function(jsc) {
     return(get(".sparkRSQLsc", envir = .sparkREnv))
   }
 
-  sqlCtx <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
+  sqlContext <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                         "createSQLContext",
                         jsc)
-  assign(".sparkRSQLsc", sqlCtx, envir = .sparkREnv)
-  sqlCtx
+  assign(".sparkRSQLsc", sqlContext, envir = .sparkREnv)
+  sqlContext
 }
 
 #' Initialize a new HiveContext.
@@ -246,7 +246,7 @@ sparkRSQL.init <- function(jsc) {
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' sqlCtx <- sparkRHive.init(sc)
+#' sqlContext <- sparkRHive.init(sc)
 #'}
 
 sparkRHive.init <- function(jsc) {
diff --git a/R/pkg/inst/profile/shell.R b/R/pkg/inst/profile/shell.R
index 33478d9e29995..ca94f1d4e7fd5 100644
--- a/R/pkg/inst/profile/shell.R
+++ b/R/pkg/inst/profile/shell.R
@@ -26,8 +26,8 @@
 
   sc <- SparkR::sparkR.init(Sys.getenv("MASTER", unset = ""))
   assign("sc", sc, envir=.GlobalEnv)
-  sqlCtx <- SparkR::sparkRSQL.init(sc)
-  assign("sqlCtx", sqlCtx, envir=.GlobalEnv)
+  sqlContext <- SparkR::sparkRSQL.init(sc)
+  assign("sqlContext", sqlContext, envir=.GlobalEnv)
   cat("\n Welcome to SparkR!")
-  cat("\n Spark context is available as sc, SQL context is available as sqlCtx\n")
+  cat("\n Spark context is available as sc, SQL context is available as sqlContext\n")
 }
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 1768c57fd02e4..1857e636e8577 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -23,7 +23,7 @@ context("SparkSQL functions")
 
 sc <- sparkR.init()
 
-sqlCtx <- sparkRSQL.init(sc)
+sqlContext <- sparkRSQL.init(sc)
 
 mockLines <- c("{\"name\":\"Michael\"}",
                "{\"name\":\"Andy\", \"age\":30}",
@@ -67,25 +67,25 @@ test_that("structType and structField", {
 
 test_that("create DataFrame from RDD", {
   rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
-  df <- createDataFrame(sqlCtx, rdd, list("a", "b"))
+  df <- createDataFrame(sqlContext, rdd, list("a", "b"))
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 10)
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
 
-  df <- createDataFrame(sqlCtx, rdd)
+  df <- createDataFrame(sqlContext, rdd)
   expect_true(inherits(df, "DataFrame"))
   expect_equal(columns(df), c("_1", "_2"))
 
   schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
                         structField(x = "b", type = "string", nullable = TRUE))
-  df <- createDataFrame(sqlCtx, rdd, schema)
+  df <- createDataFrame(sqlContext, rdd, schema)
   expect_true(inherits(df, "DataFrame"))
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
 
   rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
-  df <- createDataFrame(sqlCtx, rdd)
+  df <- createDataFrame(sqlContext, rdd)
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 10)
   expect_equal(columns(df), c("a", "b"))
@@ -121,17 +121,17 @@ test_that("toDF", {
 
 test_that("create DataFrame from list or data.frame", {
   l <- list(list(1, 2), list(3, 4))
-  df <- createDataFrame(sqlCtx, l, c("a", "b"))
+  df <- createDataFrame(sqlContext, l, c("a", "b"))
   expect_equal(columns(df), c("a", "b"))
 
   l <- list(list(a=1, b=2), list(a=3, b=4))
-  df <- createDataFrame(sqlCtx, l)
+  df <- createDataFrame(sqlContext, l)
   expect_equal(columns(df), c("a", "b"))
 
   a <- 1:3
   b <- c("a", "b", "c")
   ldf <- data.frame(a, b)
-  df <- createDataFrame(sqlCtx, ldf)
+  df <- createDataFrame(sqlContext, ldf)
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
   expect_equal(count(df), 3)
@@ -142,7 +142,7 @@ test_that("create DataFrame from list or data.frame", {
 test_that("create DataFrame with different data types", {
   l <- list(a = 1L, b = 2, c = TRUE, d = "ss", e = as.Date("2012-12-13"),
             f = as.POSIXct("2015-03-15 12:13:14.056"))
-  df <- createDataFrame(sqlCtx, list(l))
+  df <- createDataFrame(sqlContext, list(l))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "double"), c("c", "boolean"),
                                 c("d", "string"), c("e", "date"), c("f", "timestamp")))
   expect_equal(count(df), 1)
@@ -154,7 +154,7 @@ test_that("create DataFrame with different data types", {
 #  e <- new.env()
 #  assign("n", 3L, envir = e)
 #  l <- list(1:10, list("a", "b"), e, list(a="aa", b=3L))
-#  df <- createDataFrame(sqlCtx, list(l), c("a", "b", "c", "d"))
+#  df <- createDataFrame(sqlContext, list(l), c("a", "b", "c", "d"))
 #  expect_equal(dtypes(df), list(c("a", "array<int>"), c("b", "array<string>"),
 #                                c("c", "map<string,int>"), c("d", "struct<a:string,b:int>")))
 #  expect_equal(count(df), 1)
@@ -163,7 +163,7 @@ test_that("create DataFrame with different data types", {
 #})
 
 test_that("jsonFile() on a local file returns a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 3)
 })
@@ -171,88 +171,88 @@ test_that("jsonFile() on a local file returns a DataFrame", {
 test_that("jsonRDD() on a RDD with json string", {
   rdd <- parallelize(sc, mockLines)
   expect_true(count(rdd) == 3)
-  df <- jsonRDD(sqlCtx, rdd)
+  df <- jsonRDD(sqlContext, rdd)
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 3)
 
   rdd2 <- flatMap(rdd, function(x) c(x, x))
-  df <- jsonRDD(sqlCtx, rdd2)
+  df <- jsonRDD(sqlContext, rdd2)
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 6)
 })
 
 test_that("test cache, uncache and clearCache", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   registerTempTable(df, "table1")
-  cacheTable(sqlCtx, "table1")
-  uncacheTable(sqlCtx, "table1")
-  clearCache(sqlCtx)
-  dropTempTable(sqlCtx, "table1")
+  cacheTable(sqlContext, "table1")
+  uncacheTable(sqlContext, "table1")
+  clearCache(sqlContext)
+  dropTempTable(sqlContext, "table1")
 })
 
 test_that("test tableNames and tables", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   registerTempTable(df, "table1")
-  expect_true(length(tableNames(sqlCtx)) == 1)
-  df <- tables(sqlCtx)
+  expect_true(length(tableNames(sqlContext)) == 1)
+  df <- tables(sqlContext)
   expect_true(count(df) == 1)
-  dropTempTable(sqlCtx, "table1")
+  dropTempTable(sqlContext, "table1")
 })
 
 test_that("registerTempTable() results in a queryable table and sql() results in a new DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   registerTempTable(df, "table1")
-  newdf <- sql(sqlCtx, "SELECT * FROM table1 where name = 'Michael'")
+  newdf <- sql(sqlContext, "SELECT * FROM table1 where name = 'Michael'")
   expect_true(inherits(newdf, "DataFrame"))
   expect_true(count(newdf) == 1)
-  dropTempTable(sqlCtx, "table1")
+  dropTempTable(sqlContext, "table1")
 })
 
 test_that("insertInto() on a registered table", {
-  df <- read.df(sqlCtx, jsonPath, "json")
+  df <- read.df(sqlContext, jsonPath, "json")
   write.df(df, parquetPath, "parquet", "overwrite")
-  dfParquet <- read.df(sqlCtx, parquetPath, "parquet")
+  dfParquet <- read.df(sqlContext, parquetPath, "parquet")
 
   lines <- c("{\"name\":\"Bob\", \"age\":24}",
              "{\"name\":\"James\", \"age\":35}")
   jsonPath2 <- tempfile(pattern="jsonPath2", fileext=".tmp")
   parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
   writeLines(lines, jsonPath2)
-  df2 <- read.df(sqlCtx, jsonPath2, "json")
+  df2 <- read.df(sqlContext, jsonPath2, "json")
   write.df(df2, parquetPath2, "parquet", "overwrite")
-  dfParquet2 <- read.df(sqlCtx, parquetPath2, "parquet")
+  dfParquet2 <- read.df(sqlContext, parquetPath2, "parquet")
 
   registerTempTable(dfParquet, "table1")
   insertInto(dfParquet2, "table1")
-  expect_true(count(sql(sqlCtx, "select * from table1")) == 5)
-  expect_true(first(sql(sqlCtx, "select * from table1 order by age"))$name == "Michael")
-  dropTempTable(sqlCtx, "table1")
+  expect_true(count(sql(sqlContext, "select * from table1")) == 5)
+  expect_true(first(sql(sqlContext, "select * from table1 order by age"))$name == "Michael")
+  dropTempTable(sqlContext, "table1")
 
   registerTempTable(dfParquet, "table1")
   insertInto(dfParquet2, "table1", overwrite = TRUE)
-  expect_true(count(sql(sqlCtx, "select * from table1")) == 2)
-  expect_true(first(sql(sqlCtx, "select * from table1 order by age"))$name == "Bob")
-  dropTempTable(sqlCtx, "table1")
+  expect_true(count(sql(sqlContext, "select * from table1")) == 2)
+  expect_true(first(sql(sqlContext, "select * from table1 order by age"))$name == "Bob")
+  dropTempTable(sqlContext, "table1")
 })
 
 test_that("table() returns a new DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   registerTempTable(df, "table1")
-  tabledf <- table(sqlCtx, "table1")
+  tabledf <- table(sqlContext, "table1")
   expect_true(inherits(tabledf, "DataFrame"))
   expect_true(count(tabledf) == 3)
-  dropTempTable(sqlCtx, "table1")
+  dropTempTable(sqlContext, "table1")
 })
 
 test_that("toRDD() returns an RRDD", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   testRDD <- toRDD(df)
   expect_true(inherits(testRDD, "RDD"))
   expect_true(count(testRDD) == 3)
 })
 
 test_that("union on two RDDs created from DataFrames returns an RRDD", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   RDD1 <- toRDD(df)
   RDD2 <- toRDD(df)
   unioned <- unionRDD(RDD1, RDD2)
@@ -274,7 +274,7 @@ test_that("union on mixed serialization types correctly returns a byte RRDD", {
   writeLines(textLines, textPath)
   textRDD <- textFile(sc, textPath)
 
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   dfRDD <- toRDD(df)
 
   unionByte <- unionRDD(rdd, dfRDD)
@@ -292,7 +292,7 @@ test_that("union on mixed serialization types correctly returns a byte RRDD", {
 
 test_that("objectFile() works with row serialization", {
   objectPath <- tempfile(pattern="spark-test", fileext=".tmp")
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   dfRDD <- toRDD(df)
   saveAsObjectFile(coalesce(dfRDD, 1L), objectPath)
   objectIn <- objectFile(sc, objectPath)
@@ -303,7 +303,7 @@ test_that("objectFile() works with row serialization", {
 })
 
 test_that("lapply() on a DataFrame returns an RDD with the correct columns", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   testRDD <- lapply(df, function(row) {
     row$newCol <- row$age + 5
     row
@@ -315,7 +315,7 @@ test_that("lapply() on a DataFrame returns an RDD with the correct columns", {
 })
 
 test_that("collect() returns a data.frame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   rdf <- collect(df)
   expect_true(is.data.frame(rdf))
   expect_true(names(rdf)[1] == "age")
@@ -324,20 +324,20 @@ test_that("collect() returns a data.frame", {
 })
 
 test_that("limit() returns DataFrame with the correct number of rows", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   dfLimited <- limit(df, 2)
   expect_true(inherits(dfLimited, "DataFrame"))
   expect_true(count(dfLimited) == 2)
 })
 
 test_that("collect() and take() on a DataFrame return the same number of rows and columns", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   expect_true(nrow(collect(df)) == nrow(take(df, 10)))
   expect_true(ncol(collect(df)) == ncol(take(df, 10)))
 })
 
 test_that("multiple pipeline transformations starting with a DataFrame result in an RDD with the correct values", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   first <- lapply(df, function(row) {
     row$age <- row$age + 5
     row
@@ -354,7 +354,7 @@ test_that("multiple pipeline transformations starting with a DataFrame result in
 })
 
 test_that("cache(), persist(), and unpersist() on a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   expect_false(df@env$isCached)
   cache(df)
   expect_true(df@env$isCached)
@@ -373,7 +373,7 @@ test_that("cache(), persist(), and unpersist() on a DataFrame", {
 })
 
 test_that("schema(), dtypes(), columns(), names() return the correct values/format", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   testSchema <- schema(df)
   expect_true(length(testSchema$fields()) == 2)
   expect_true(testSchema$fields()[[1]]$dataType.toString() == "LongType")
@@ -394,7 +394,7 @@ test_that("schema(), dtypes(), columns(), names() return the correct values/form
 })
 
 test_that("head() and first() return the correct data", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   testHead <- head(df)
   expect_true(nrow(testHead) == 3)
   expect_true(ncol(testHead) == 2)
@@ -415,14 +415,14 @@ test_that("distinct() on DataFrames", {
   jsonPathWithDup <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(lines, jsonPathWithDup)
 
-  df <- jsonFile(sqlCtx, jsonPathWithDup)
+  df <- jsonFile(sqlContext, jsonPathWithDup)
   uniques <- distinct(df)
   expect_true(inherits(uniques, "DataFrame"))
   expect_true(count(uniques) == 3)
 })
 
 test_that("sample on a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   sampled <- sample(df, FALSE, 1.0)
   expect_equal(nrow(collect(sampled)), count(df))
   expect_true(inherits(sampled, "DataFrame"))
@@ -435,7 +435,7 @@ test_that("sample on a DataFrame", {
 })
 
 test_that("select operators", {
-  df <- select(jsonFile(sqlCtx, jsonPath), "name", "age")
+  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
   expect_true(inherits(df$name, "Column"))
   expect_true(inherits(df[[2]], "Column"))
   expect_true(inherits(df[["age"]], "Column"))
@@ -461,7 +461,7 @@ test_that("select operators", {
 })
 
 test_that("select with column", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   df1 <- select(df, "name")
   expect_true(columns(df1) == c("name"))
   expect_true(count(df1) == 3)
@@ -472,7 +472,7 @@ test_that("select with column", {
 })
 
 test_that("selectExpr() on a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   selected <- selectExpr(df, "age * 2")
   expect_true(names(selected) == "(age * 2)")
   expect_equal(collect(selected), collect(select(df, df$age * 2L)))
@@ -483,7 +483,7 @@ test_that("selectExpr() on a DataFrame", {
 })
 
 test_that("column calculation", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   d <- collect(select(df, alias(df$age + 1, "age2")))
   expect_true(names(d) == c("age2"))
   df2 <- select(df, lower(df$name), abs(df$age))
@@ -492,15 +492,15 @@ test_that("column calculation", {
 })
 
 test_that("read.df() from json file", {
-  df <- read.df(sqlCtx, jsonPath, "json")
+  df <- read.df(sqlContext, jsonPath, "json")
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 3)
 })
 
 test_that("write.df() as parquet file", {
-  df <- read.df(sqlCtx, jsonPath, "json")
+  df <- read.df(sqlContext, jsonPath, "json")
   write.df(df, parquetPath, "parquet", mode="overwrite")
-  df2 <- read.df(sqlCtx, parquetPath, "parquet")
+  df2 <- read.df(sqlContext, parquetPath, "parquet")
   expect_true(inherits(df2, "DataFrame"))
   expect_true(count(df2) == 3)
 })
@@ -553,7 +553,7 @@ test_that("column binary mathfunctions", {
              "{\"a\":4, \"b\":8}")
   jsonPathWithDup <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(lines, jsonPathWithDup)
-  df <- jsonFile(sqlCtx, jsonPathWithDup)
+  df <- jsonFile(sqlContext, jsonPathWithDup)
   expect_equal(collect(select(df, atan2(df$a, df$b)))[1, "ATAN2(a, b)"], atan2(1, 5))
   expect_equal(collect(select(df, atan2(df$a, df$b)))[2, "ATAN2(a, b)"], atan2(2, 6))
   expect_equal(collect(select(df, atan2(df$a, df$b)))[3, "ATAN2(a, b)"], atan2(3, 7))
@@ -565,7 +565,7 @@ test_that("column binary mathfunctions", {
 })
 
 test_that("string operators", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   expect_equal(count(where(df, like(df$name, "A%"))), 1)
   expect_equal(count(where(df, startsWith(df$name, "A"))), 1)
   expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi")
@@ -573,7 +573,7 @@ test_that("string operators", {
 })
 
 test_that("group by", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   df1 <- agg(df, name = "max", age = "sum")
   expect_true(1 == count(df1))
   df1 <- agg(df, age2 = max(df$age))
@@ -610,7 +610,7 @@ test_that("group by", {
 })
 
 test_that("arrange() and orderBy() on a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   sorted <- arrange(df, df$age)
   expect_true(collect(sorted)[1,2] == "Michael")
 
@@ -627,7 +627,7 @@ test_that("arrange() and orderBy() on a DataFrame", {
 })
 
 test_that("filter() on a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   filtered <- filter(df, "age > 20")
   expect_true(count(filtered) == 1)
   expect_true(collect(filtered)$name == "Andy")
@@ -637,7 +637,7 @@ test_that("filter() on a DataFrame", {
 })
 
 test_that("join() on a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
 
   mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}",
                   "{\"name\":\"Andy\",  \"test\": \"no\"}",
@@ -645,7 +645,7 @@ test_that("join() on a DataFrame", {
                   "{\"name\":\"Bob\", \"test\": \"yes\"}")
   jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(mockLines2, jsonPath2)
-  df2 <- jsonFile(sqlCtx, jsonPath2)
+  df2 <- jsonFile(sqlContext, jsonPath2)
 
   joined <- join(df, df2)
   expect_equal(names(joined), c("age", "name", "name", "test"))
@@ -668,7 +668,7 @@ test_that("join() on a DataFrame", {
 })
 
 test_that("toJSON() returns an RDD of the correct values", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   testRDD <- toJSON(df)
   expect_true(inherits(testRDD, "RDD"))
   expect_true(SparkR:::getSerializedMode(testRDD) == "string")
@@ -676,25 +676,25 @@ test_that("toJSON() returns an RDD of the correct values", {
 })
 
 test_that("showDF()", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   s <- capture.output(showDF(df))
   expect_output(s , "+----+-------+\n| age|   name|\n+----+-------+\n|null|Michael|\n|  30|   Andy|\n|  19| Justin|\n+----+-------+\n")
 })
 
 test_that("isLocal()", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   expect_false(isLocal(df))
 })
 
 test_that("unionAll(), except(), and intersect() on a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
 
   lines <- c("{\"name\":\"Bob\", \"age\":24}",
              "{\"name\":\"Andy\", \"age\":30}",
              "{\"name\":\"James\", \"age\":35}")
   jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(lines, jsonPath2)
-  df2 <- read.df(sqlCtx, jsonPath2, "json")
+  df2 <- read.df(sqlContext, jsonPath2, "json")
 
   unioned <- arrange(unionAll(df, df2), df$age)
   expect_true(inherits(unioned, "DataFrame"))
@@ -713,7 +713,7 @@ test_that("unionAll(), except(), and intersect() on a DataFrame", {
 })
 
 test_that("withColumn() and withColumnRenamed()", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   newDF <- withColumn(df, "newAge", df$age + 2)
   expect_true(length(columns(newDF)) == 3)
   expect_true(columns(newDF)[3] == "newAge")
@@ -725,7 +725,7 @@ test_that("withColumn() and withColumnRenamed()", {
 })
 
 test_that("mutate() and rename()", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   newDF <- mutate(df, newAge = df$age + 2)
   expect_true(length(columns(newDF)) == 3)
   expect_true(columns(newDF)[3] == "newAge")
@@ -737,25 +737,25 @@ test_that("mutate() and rename()", {
 })
 
 test_that("write.df() on DataFrame and works with parquetFile", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   write.df(df, parquetPath, "parquet", mode="overwrite")
-  parquetDF <- parquetFile(sqlCtx, parquetPath)
+  parquetDF <- parquetFile(sqlContext, parquetPath)
   expect_true(inherits(parquetDF, "DataFrame"))
   expect_equal(count(df), count(parquetDF))
 })
 
 test_that("parquetFile works with multiple input paths", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   write.df(df, parquetPath, "parquet", mode="overwrite")
   parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
   write.df(df, parquetPath2, "parquet", mode="overwrite")
-  parquetDF <- parquetFile(sqlCtx, parquetPath, parquetPath2)
+  parquetDF <- parquetFile(sqlContext, parquetPath, parquetPath2)
   expect_true(inherits(parquetDF, "DataFrame"))
   expect_true(count(parquetDF) == count(df)*2)
 })
 
 test_that("describe() on a DataFrame", {
-  df <- jsonFile(sqlCtx, jsonPath)
+  df <- jsonFile(sqlContext, jsonPath)
   stats <- describe(df, "age")
   expect_equal(collect(stats)[1, "summary"], "count")
   expect_equal(collect(stats)[2, "age"], "24.5")
diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb
index 0ea3f8eab461b..6073b3626c45b 100644
--- a/docs/_plugins/copy_api_dirs.rb
+++ b/docs/_plugins/copy_api_dirs.rb
@@ -18,50 +18,52 @@
 require 'fileutils'
 include FileUtils
 
-if not (ENV['SKIP_API'] == '1' or ENV['SKIP_SCALADOC'] == '1')
-  # Build Scaladoc for Java/Scala
+if not (ENV['SKIP_API'] == '1')
+  if not (ENV['SKIP_SCALADOC'] == '1')
+    # Build Scaladoc for Java/Scala
 
-  puts "Moving to project root and building API docs."
-  curr_dir = pwd
-  cd("..")
+    puts "Moving to project root and building API docs."
+    curr_dir = pwd
+    cd("..")
 
-  puts "Running 'build/sbt -Pkinesis-asl compile unidoc' from " + pwd + "; this may take a few minutes..."
-  puts `build/sbt -Pkinesis-asl compile unidoc`
+    puts "Running 'build/sbt -Pkinesis-asl compile unidoc' from " + pwd + "; this may take a few minutes..."
+    puts `build/sbt -Pkinesis-asl compile unidoc`
 
-  puts "Moving back into docs dir."
-  cd("docs")
+    puts "Moving back into docs dir."
+    cd("docs")
 
-  # Copy over the unified ScalaDoc for all projects to api/scala.
-  # This directory will be copied over to _site when `jekyll` command is run.
-  source = "../target/scala-2.10/unidoc"
-  dest = "api/scala"
+    # Copy over the unified ScalaDoc for all projects to api/scala.
+    # This directory will be copied over to _site when `jekyll` command is run.
+    source = "../target/scala-2.10/unidoc"
+    dest = "api/scala"
 
-  puts "Making directory " + dest
-  mkdir_p dest
+    puts "Making directory " + dest
+    mkdir_p dest
 
-  # From the rubydoc: cp_r('src', 'dest') makes src/dest, but this doesn't.
-  puts "cp -r " + source + "/. " + dest
-  cp_r(source + "/.", dest)
+    # From the rubydoc: cp_r('src', 'dest') makes src/dest, but this doesn't.
+    puts "cp -r " + source + "/. " + dest
+    cp_r(source + "/.", dest)
 
-  # Append custom JavaScript
-  js = File.readlines("./js/api-docs.js")
-  js_file = dest + "/lib/template.js"
-  File.open(js_file, 'a') { |f| f.write("\n" + js.join()) }
+    # Append custom JavaScript
+    js = File.readlines("./js/api-docs.js")
+    js_file = dest + "/lib/template.js"
+    File.open(js_file, 'a') { |f| f.write("\n" + js.join()) }
 
-  # Append custom CSS
-  css = File.readlines("./css/api-docs.css")
-  css_file = dest + "/lib/template.css"
-  File.open(css_file, 'a') { |f| f.write("\n" + css.join()) }
+    # Append custom CSS
+    css = File.readlines("./css/api-docs.css")
+    css_file = dest + "/lib/template.css"
+    File.open(css_file, 'a') { |f| f.write("\n" + css.join()) }
 
-  # Copy over the unified JavaDoc for all projects to api/java.
-  source = "../target/javaunidoc"
-  dest = "api/java"
+    # Copy over the unified JavaDoc for all projects to api/java.
+    source = "../target/javaunidoc"
+    dest = "api/java"
 
-  puts "Making directory " + dest
-  mkdir_p dest
+    puts "Making directory " + dest
+    mkdir_p dest
 
-  puts "cp -r " + source + "/. " + dest
-  cp_r(source + "/.", dest)
+    puts "cp -r " + source + "/. " + dest
+    cp_r(source + "/.", dest)
+  end
 
   # Build Sphinx docs for Python
 
diff --git a/docs/api.md b/docs/api.md
index 03460383335e8..45df77ac05f78 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -7,4 +7,5 @@ Here you can API docs for Spark and its submodules.
 
 - [Spark Scala API (Scaladoc)](api/scala/index.html)
 - [Spark Java API (Javadoc)](api/java/index.html)
-- [Spark Python API (Epydoc)](api/python/index.html)
+- [Spark Python API (Sphinx)](api/python/index.html)
+- [Spark R API (Roxygen2)](api/R/index.html)
diff --git a/docs/index.md b/docs/index.md
index b5b016e34795e..5ef6d983c45a5 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -6,7 +6,7 @@ description: Apache Spark SPARK_VERSION_SHORT documentation homepage
 ---
 
 Apache Spark is a fast and general-purpose cluster computing system.
-It provides high-level APIs in Java, Scala and Python,
+It provides high-level APIs in Java, Scala, Python and R,
 and an optimized engine that supports general execution graphs.
 It also supports a rich set of higher-level tools including [Spark SQL](sql-programming-guide.html) for SQL and structured data processing, [MLlib](mllib-guide.html) for machine learning, [GraphX](graphx-programming-guide.html) for graph processing, and [Spark Streaming](streaming-programming-guide.html).
 
@@ -20,13 +20,13 @@ Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS). It's easy
 locally on one machine --- all you need is to have `java` installed on your system `PATH`,
 or the `JAVA_HOME` environment variable pointing to a Java installation.
 
-Spark runs on Java 6+ and Python 2.6+. For the Scala API, Spark {{site.SPARK_VERSION}} uses
+Spark runs on Java 6+, Python 2.6+ and R 3.1+. For the Scala API, Spark {{site.SPARK_VERSION}} uses
 Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version 
 ({{site.SCALA_BINARY_VERSION}}.x).
 
 # Running the Examples and Shell
 
-Spark comes with several sample programs.  Scala, Java and Python examples are in the
+Spark comes with several sample programs.  Scala, Java, Python and R examples are in the
 `examples/src/main` directory. To run one of the Java or Scala sample programs, use
 `bin/run-example <class> [params]` in the top-level Spark directory. (Behind the scenes, this
 invokes the more general
@@ -54,6 +54,15 @@ Example applications are also provided in Python. For example,
 
     ./bin/spark-submit examples/src/main/python/pi.py 10
 
+Spark also provides an experimental R API since 1.4 (only DataFrames APIs included).
+To run Spark interactively in a R interpreter, use `bin/sparkR`:
+
+    ./bin/sparkR --master local[2]
+
+Example applications are also provided in R. For example,
+    
+    ./bin/spark-submit examples/src/main/r/dataframe.R
+
 # Launching on a Cluster
 
 The Spark [cluster mode overview](cluster-overview.html) explains the key concepts in running on a cluster.
@@ -71,7 +80,7 @@ options for deployment:
 
 * [Quick Start](quick-start.html): a quick introduction to the Spark API; start here!
 * [Spark Programming Guide](programming-guide.html): detailed overview of Spark
-  in all supported languages (Scala, Java, Python)
+  in all supported languages (Scala, Java, Python, R)
 * Modules built on Spark:
   * [Spark Streaming](streaming-programming-guide.html): processing real-time data streams
   * [Spark SQL and DataFrames](sql-programming-guide.html): support for structured data and relational queries
@@ -83,7 +92,8 @@ options for deployment:
 
 * [Spark Scala API (Scaladoc)](api/scala/index.html#org.apache.spark.package)
 * [Spark Java API (Javadoc)](api/java/index.html)
-* [Spark Python API (Epydoc)](api/python/index.html)
+* [Spark Python API (Sphinx)](api/python/index.html)
+* [Spark R API (Roxygen2)](api/R/index.html)
 
 **Deployment Guides:**
 
@@ -124,4 +134,5 @@ options for deployment:
   available online for free.
 * [Code Examples](http://spark.apache.org/examples.html): more are also available in the `examples` subfolder of Spark ([Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples),
  [Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples),
- [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python))
+ [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python),
+ [R]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/r))
diff --git a/docs/programming-guide.md b/docs/programming-guide.md
index 07a4d29fe7104..5d9df282efed8 100644
--- a/docs/programming-guide.md
+++ b/docs/programming-guide.md
@@ -98,9 +98,9 @@ to your version of HDFS. Some common HDFS version tags are listed on the
 [Prebuilt packages](http://spark.apache.org/downloads.html) are also available on the Spark homepage
 for common HDFS versions.
 
-Finally, you need to import some Spark classes into your program. Add the following lines:
+Finally, you need to import some Spark classes into your program. Add the following line:
 
-{% highlight scala %}
+{% highlight python %}
 from pyspark import SparkContext, SparkConf
 {% endhighlight %}
 
@@ -478,7 +478,6 @@ the [Converter examples]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main
 for examples of using Cassandra / HBase ```InputFormat``` and ```OutputFormat``` with custom converters.
 
 </div>
-
 </div>
 
 ## RDD Operations
@@ -915,7 +914,8 @@ The following table lists some of the common transformations supported by Spark.
 RDD API doc
 ([Scala](api/scala/index.html#org.apache.spark.rdd.RDD),
  [Java](api/java/index.html?org/apache/spark/api/java/JavaRDD.html),
- [Python](api/python/pyspark.html#pyspark.RDD))
+ [Python](api/python/pyspark.html#pyspark.RDD),
+ [R](api/R/index.html))
 and pair RDD functions doc
 ([Scala](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions),
  [Java](api/java/index.html?org/apache/spark/api/java/JavaPairRDD.html))
@@ -1028,7 +1028,9 @@ The following table lists some of the common actions supported by Spark. Refer t
 RDD API doc
 ([Scala](api/scala/index.html#org.apache.spark.rdd.RDD),
  [Java](api/java/index.html?org/apache/spark/api/java/JavaRDD.html),
- [Python](api/python/pyspark.html#pyspark.RDD))
+ [Python](api/python/pyspark.html#pyspark.RDD),
+ [R](api/R/index.html))
+ 
 and pair RDD functions doc
 ([Scala](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions),
  [Java](api/java/index.html?org/apache/spark/api/java/JavaPairRDD.html))
@@ -1565,7 +1567,8 @@ You can see some [example Spark programs](http://spark.apache.org/examples.html)
 In addition, Spark includes several samples in the `examples` directory
 ([Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples),
  [Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples),
- [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python)).
+ [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python),
+ [R]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/r)).
 You can run Java and Scala examples by passing the class name to Spark's `bin/run-example` script; for instance:
 
     ./bin/run-example SparkPi
@@ -1574,6 +1577,10 @@ For Python examples, use `spark-submit` instead:
 
     ./bin/spark-submit examples/src/main/python/pi.py
 
+For R examples, use `spark-submit` instead:
+
+    ./bin/spark-submit examples/src/main/r/dataframe.R
+
 For help on optimizing your programs, the [configuration](configuration.html) and
 [tuning](tuning.html) guides provide information on best practices. They are especially important for
 making sure that your data is stored in memory in an efficient format.
@@ -1581,4 +1588,4 @@ For help on deploying, the [cluster mode overview](cluster-overview.html) descri
 in distributed operation and supported cluster managers.
 
 Finally, full API documentation is available in
-[Scala](api/scala/#org.apache.spark.package), [Java](api/java/) and [Python](api/python/).
+[Scala](api/scala/#org.apache.spark.package), [Java](api/java/), [Python](api/python/) and [R](api/R/).
diff --git a/docs/quick-start.md b/docs/quick-start.md
index 81143da865cf0..bb39e4111f244 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -184,10 +184,10 @@ scala> linesWithSpark.cache()
 res7: spark.RDD[String] = spark.FilteredRDD@17e51082
 
 scala> linesWithSpark.count()
-res8: Long = 15
+res8: Long = 19
 
 scala> linesWithSpark.count()
-res9: Long = 15
+res9: Long = 19
 {% endhighlight %}
 
 It may seem silly to use Spark to explore and cache a 100-line text file. The interesting part is
@@ -202,10 +202,10 @@ a cluster, as described in the [programming guide](programming-guide.html#initia
 >>> linesWithSpark.cache()
 
 >>> linesWithSpark.count()
-15
+19
 
 >>> linesWithSpark.count()
-15
+19
 {% endhighlight %}
 
 It may seem silly to use Spark to explore and cache a 100-line text file. The interesting part is
@@ -423,14 +423,14 @@ dependencies to `spark-submit` through its `--py-files` argument by packaging th
 
 We can run this application using the `bin/spark-submit` script:
 
-{% highlight python %}
+{% highlight bash %}
 # Use spark-submit to run your application
 $ YOUR_SPARK_HOME/bin/spark-submit \
   --master local[4] \
   SimpleApp.py
 ...
 Lines with a: 46, Lines with b: 23
-{% endhighlight python %}
+{% endhighlight %}
 
 </div>
 </div>
@@ -444,7 +444,8 @@ Congratulations on running your first Spark application!
 * Finally, Spark includes several samples in the `examples` directory
 ([Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples),
  [Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples),
- [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python)).
+ [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python),
+ [R]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/r)).
 You can run them as follows:
 
 {% highlight bash %}
@@ -453,4 +454,7 @@ You can run them as follows:
 
 # For Python examples, use spark-submit directly:
 ./bin/spark-submit examples/src/main/python/pi.py
+
+# For R examples, use spark-submit directly:
+./bin/spark-submit examples/src/main/r/dataframe.R
 {% endhighlight %}
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 78b8e8ad515a0..5b41c0ee6e430 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -16,9 +16,9 @@ Spark SQL is a Spark module for structured data processing. It provides a progra
 
 A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. DataFrames can be constructed from a wide array of sources such as: structured data files, tables in Hive, external databases, or existing RDDs.
 
-The DataFrame API is available in [Scala](api/scala/index.html#org.apache.spark.sql.DataFrame), [Java](api/java/index.html?org/apache/spark/sql/DataFrame.html), and [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame).
+The DataFrame API is available in [Scala](api/scala/index.html#org.apache.spark.sql.DataFrame), [Java](api/java/index.html?org/apache/spark/sql/DataFrame.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame), and [R](api/R/index.html).
 
-All of the examples on this page use sample data included in the Spark distribution and can be run in the `spark-shell` or the `pyspark` shell.
+All of the examples on this page use sample data included in the Spark distribution and can be run in the `spark-shell`, `pyspark` shell, or `sparkR` shell.
 
 
 ## Starting Point: `SQLContext`
@@ -64,6 +64,17 @@ from pyspark.sql import SQLContext
 sqlContext = SQLContext(sc)
 {% endhighlight %}
 
+</div>
+
+<div data-lang="r"  markdown="1">
+
+The entry point into all relational functionality in Spark is the
+`SQLContext` class, or one of its decedents.  To create a basic `SQLContext`, all you need is a SparkContext.
+
+{% highlight r %}
+sqlContext <- sparkRSQL.init(sc)
+{% endhighlight %}
+
 </div>
 </div>
 
@@ -130,6 +141,19 @@ df.show()
 {% endhighlight %}
 
 </div>
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+sqlContext <- SQLContext(sc)
+
+df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
+
+# Displays the content of the DataFrame to stdout
+showDF(df)
+{% endhighlight %}
+
+</div>
+
 </div>
 
 
@@ -296,6 +320,57 @@ df.groupBy("age").count().show()
 {% endhighlight %}
 
 </div>
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+sqlContext <- sparkRSQL.init(sc)
+
+# Create the DataFrame
+df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
+
+# Show the content of the DataFrame
+showDF(df)
+## age  name
+## null Michael
+## 30   Andy
+## 19   Justin
+
+# Print the schema in a tree format
+printSchema(df)
+## root
+## |-- age: long (nullable = true)
+## |-- name: string (nullable = true)
+
+# Select only the "name" column
+showDF(select(df, "name"))
+## name
+## Michael
+## Andy
+## Justin
+
+# Select everybody, but increment the age by 1
+showDF(select(df, df$name, df$age + 1))
+## name    (age + 1)
+## Michael null
+## Andy    31
+## Justin  20
+
+# Select people older than 21
+showDF(where(df, df$age > 21))
+## age name
+## 30  Andy
+
+# Count people by age
+showDF(count(groupBy(df, "age")))
+## age  count
+## null 1
+## 19   1
+## 30   1
+
+{% endhighlight %}
+
+</div>
+
 </div>
 
 
@@ -325,6 +400,14 @@ sqlContext = SQLContext(sc)
 df = sqlContext.sql("SELECT * FROM table")
 {% endhighlight %}
 </div>
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+sqlContext <- sparkRSQL.init(sc)
+df <- sql(sqlContext, "SELECT * FROM table")
+{% endhighlight %}
+</div>
+
 </div>
 
 
@@ -719,6 +802,15 @@ df.select("name", "favorite_color").save("namesAndFavColors.parquet")
 
 {% endhighlight %}
 
+</div>
+
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+df <- loadDF(sqlContext, "people.parquet")
+saveDF(select(df, "name", "age"), "namesAndAges.parquet")
+{% endhighlight %}
+
 </div>
 </div>
 
@@ -760,6 +852,16 @@ df.select("name", "age").save("namesAndAges.parquet", "parquet")
 
 {% endhighlight %}
 
+</div>
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+
+df <- loadDF(sqlContext, "people.json", "json")
+saveDF(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
+
+{% endhighlight %}
+
 </div>
 </div>
 
@@ -908,6 +1010,31 @@ for teenName in teenNames.collect():
 
 </div>
 
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+# sqlContext from the previous example is used in this example.
+
+schemaPeople # The DataFrame from the previous example.
+
+# DataFrames can be saved as Parquet files, maintaining the schema information.
+saveAsParquetFile(schemaPeople, "people.parquet")
+
+# Read in the Parquet file created above.  Parquet files are self-describing so the schema is preserved.
+# The result of loading a parquet file is also a DataFrame.
+parquetFile <- parquetFile(sqlContext, "people.parquet")
+
+# Parquet files can also be registered as tables and then used in SQL statements.
+registerTempTable(parquetFile, "parquetFile");
+teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
+teenNames <- map(teenagers, function(p) { paste("Name:", p$name)})
+for (teenName in collect(teenNames)) {
+  cat(teenName, "\n")
+} 
+{% endhighlight %}
+
+</div>
+
 <div data-lang="sql"  markdown="1">
 
 {% highlight sql %}
@@ -1033,7 +1160,7 @@ df2 = sqlContext.createDataFrame(sc.parallelize(range(6, 11))
 df2.save("data/test_table/key=2", "parquet")
 
 # Read the partitioned table
-df3 = sqlContext.parquetFile("data/test_table")
+df3 = sqlContext.load("data/test_table", "parquet")
 df3.printSchema()
 
 # The final schema consists of all 3 columns in the Parquet files together
@@ -1047,6 +1174,33 @@ df3.printSchema()
 
 </div>
 
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+# sqlContext from the previous example is used in this example.
+
+# Create a simple DataFrame, stored into a partition directory
+saveDF(df1, "data/test_table/key=1", "parquet", "overwrite")
+
+# Create another DataFrame in a new partition directory,
+# adding a new column and dropping an existing column
+saveDF(df2, "data/test_table/key=2", "parquet", "overwrite")
+
+# Read the partitioned table
+df3 <- loadDF(sqlContext, "data/test_table", "parquet")
+printSchema(df3)
+
+# The final schema consists of all 3 columns in the Parquet files together
+# with the partiioning column appeared in the partition directory paths.
+# root
+# |-- single: int (nullable = true)
+# |-- double: int (nullable = true)
+# |-- triple: int (nullable = true)
+# |-- key : int (nullable = true)
+{% endhighlight %}
+
+</div>
+
 </div>
 
 ### Configuration
@@ -1238,6 +1392,40 @@ anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD)
 {% endhighlight %}
 </div>
 
+<div data-lang="r"  markdown="1">
+Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame.
+This conversion can be done using one of two methods in a `SQLContext`:
+
+* `jsonFile` - loads data from a directory of JSON files where each line of the files is a JSON object.
+
+Note that the file that is offered as _jsonFile_ is not a typical JSON file. Each
+line must contain a separate, self-contained valid JSON object. As a consequence,
+a regular multi-line JSON file will most often fail.
+
+{% highlight r %}
+# sc is an existing SparkContext.
+sqlContext <- sparkRSQL.init(sc)
+
+# A JSON dataset is pointed to by path.
+# The path can be either a single text file or a directory storing text files.
+path <- "examples/src/main/resources/people.json"
+# Create a DataFrame from the file(s) pointed to by path
+people <- jsonFile(sqlContex,t path)
+
+# The inferred schema can be visualized using the printSchema() method.
+printSchema(people)
+# root
+#  |-- age: integer (nullable = true)
+#  |-- name: string (nullable = true)
+
+# Register this DataFrame as a table.
+registerTempTable(people, "people")
+
+# SQL statements can be run by using the sql methods provided by `sqlContext`.
+teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")
+{% endhighlight %}
+</div>
+
 <div data-lang="sql"  markdown="1">
 
 {% highlight sql %}
@@ -1314,10 +1502,7 @@ Row[] results = sqlContext.sql("FROM src SELECT key, value").collect();
 <div data-lang="python"  markdown="1">
 
 When working with Hive one must construct a `HiveContext`, which inherits from `SQLContext`, and
-adds support for finding tables in the MetaStore and writing queries using HiveQL. In addition to
-the `sql` method a `HiveContext` also provides an `hql` methods, which allows queries to be
-expressed in HiveQL.
-
+adds support for finding tables in the MetaStore and writing queries using HiveQL. 
 {% highlight python %}
 # sc is an existing SparkContext.
 from pyspark.sql import HiveContext
@@ -1331,6 +1516,24 @@ results = sqlContext.sql("FROM src SELECT key, value").collect()
 
 {% endhighlight %}
 
+</div>
+
+<div data-lang="r"  markdown="1">
+
+When working with Hive one must construct a `HiveContext`, which inherits from `SQLContext`, and
+adds support for finding tables in the MetaStore and writing queries using HiveQL.
+{% highlight r %}
+# sc is an existing SparkContext.
+sqlContext <- sparkRHive.init(sc)
+
+hql(sqlContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
+hql(sqlContext, "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
+
+# Queries can be expressed in HiveQL.
+results = sqlContext.sql("FROM src SELECT key, value").collect()
+
+{% endhighlight %}
+
 </div>
 </div>
 
@@ -1430,6 +1633,16 @@ df = sqlContext.load(source="jdbc", url="jdbc:postgresql:dbserver", dbtable="sch
 
 </div>
 
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+
+df <- loadDF(sqlContext, source="jdbc", url="jdbc:postgresql:dbserver", dbtable="schema.tablename")
+
+{% endhighlight %}
+
+</div>
+
 <div data-lang="sql"  markdown="1">
 
 {% highlight sql %}
@@ -2354,5 +2567,151 @@ from pyspark.sql.types import *
 
 </div>
 
+<div data-lang="r"  markdown="1">
+
+<table class="table">
+<tr>
+  <th style="width:20%">Data type</th>
+  <th style="width:40%">Value type in R</th>
+  <th>API to access or create a data type</th></tr>
+<tr>
+  <td> <b>ByteType</b> </td>
+  <td>
+  integer <br />
+  <b>Note:</b> Numbers will be converted to 1-byte signed integer numbers at runtime.
+  Please make sure that numbers are within the range of -128 to 127.
+  </td>
+  <td>
+  "byte"
+  </td>
+</tr>
+<tr>
+  <td> <b>ShortType</b> </td>
+  <td>
+  integer <br />
+  <b>Note:</b> Numbers will be converted to 2-byte signed integer numbers at runtime.
+  Please make sure that numbers are within the range of -32768 to 32767.
+  </td>
+  <td>
+  "short"
+  </td>
+</tr>
+<tr>
+  <td> <b>IntegerType</b> </td>
+  <td> integer </td>
+  <td>
+  "integer"
+  </td>
+</tr>
+<tr>
+  <td> <b>LongType</b> </td>
+  <td>
+  integer <br />
+  <b>Note:</b> Numbers will be converted to 8-byte signed integer numbers at runtime.
+  Please make sure that numbers are within the range of
+  -9223372036854775808 to 9223372036854775807.
+  Otherwise, please convert data to decimal.Decimal and use DecimalType.
+  </td>
+  <td>
+  "long"
+  </td>
+</tr>
+<tr>
+  <td> <b>FloatType</b> </td>
+  <td>
+  numeric <br />
+  <b>Note:</b> Numbers will be converted to 4-byte single-precision floating
+  point numbers at runtime.
+  </td>
+  <td>
+  "float"
+  </td>
+</tr>
+<tr>
+  <td> <b>DoubleType</b> </td>
+  <td> numeric </td>
+  <td>
+  "double"
+  </td>
+</tr>
+<tr>
+  <td> <b>DecimalType</b> </td>
+  <td> Not supported </td>
+  <td>
+   Not supported
+  </td>
+</tr>
+<tr>
+  <td> <b>StringType</b> </td>
+  <td> character </td>
+  <td>
+  "string"
+  </td>
+</tr>
+<tr>
+  <td> <b>BinaryType</b> </td>
+  <td> raw </td>
+  <td>
+  "binary"
+  </td>
+</tr>
+<tr>
+  <td> <b>BooleanType</b> </td>
+  <td> logical </td>
+  <td>
+  "bool"
+  </td>
+</tr>
+<tr>
+  <td> <b>TimestampType</b> </td>
+  <td> POSIXct </td>
+  <td>
+  "timestamp"
+  </td>
+</tr>
+<tr>
+  <td> <b>DateType</b> </td>
+  <td> Date </td>
+  <td>
+  "date"
+  </td>
+</tr>
+<tr>
+  <td> <b>ArrayType</b> </td>
+  <td> vector or list </td>
+  <td>
+  list(type="array", elementType=<i>elementType</i>, containsNull=[<i>containsNull</i>])<br />
+  <b>Note:</b> The default value of <i>containsNull</i> is <i>True</i>.
+  </td>
+</tr>
+<tr>
+  <td> <b>MapType</b> </td>
+  <td> enviroment </td>
+  <td>
+  list(type="map", keyType=<i>keyType</i>, valueType=<i>valueType</i>, valueContainsNull=[<i>valueContainsNull</i>])<br />
+  <b>Note:</b> The default value of <i>valueContainsNull</i> is <i>True</i>.
+  </td>
+</tr>
+<tr>
+  <td> <b>StructType</b> </td>
+  <td> named list</td>
+  <td>
+  list(type="struct", fields=<i>fields</i>)<br />
+  <b>Note:</b> <i>fields</i> is a Seq of StructFields. Also, two fields with the same
+  name are not allowed.
+  </td>
+</tr>
+<tr>
+  <td> <b>StructField</b> </td>
+  <td> The value type in R of the data type of this field
+  (For example, integer for a StructField with the data type IntegerType) </td>
+  <td>
+  list(name=<i>name</i>, type=<i>dataType</i>, nullable=<i>nullable</i>)
+  </td>
+</tr>
+</table>
+
+</div>
+
 </div>
 

From a40bca0111de45763c3ef4270afb2185c16b8f95 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Sat, 23 May 2015 00:04:01 -0700
Subject: [PATCH 158/525] [SPARK-6811] Copy SparkR lib in make-distribution.sh

This change also remove native libraries from SparkR to make sure our distribution works across platforms

Tested by building on Mac, running on Amazon Linux (CentOS), Windows VM and vice-versa (built on Linux run on Mac)

I will also test this with YARN soon and update this PR.

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6373 from shivaram/sparkr-binary and squashes the following commits:

ae41b5c [Shivaram Venkataraman] Remove native libraries from SparkR Also include the built SparkR package in make-distribution.sh
---
 R/pkg/NAMESPACE                              |  5 ++-
 R/pkg/R/utils.R                              | 38 +++++++++++++++++++-
 R/pkg/{src => src-native}/Makefile           |  0
 R/pkg/{src => src-native}/Makefile.win       |  0
 R/pkg/{src => src-native}/string_hash_code.c |  0
 make-distribution.sh                         |  2 ++
 6 files changed, 43 insertions(+), 2 deletions(-)
 rename R/pkg/{src => src-native}/Makefile (100%)
 rename R/pkg/{src => src-native}/Makefile.win (100%)
 rename R/pkg/{src => src-native}/string_hash_code.c (100%)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 64ffdcffc9caf..411126a377950 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -1,6 +1,9 @@
 # Imports from base R
 importFrom(methods, setGeneric, setMethod, setOldClass)
-useDynLib(SparkR, stringHashCode)
+
+# Disable native libraries till we figure out how to package it
+# See SPARKR-7839
+#useDynLib(SparkR, stringHashCode)
 
 # S3 methods exported
 export("sparkR.init")
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 0e7b7bd5a5b34..69b2700191c9a 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -122,13 +122,49 @@ hashCode <- function(key) {
     intBits <- packBits(rawToBits(rawVec), "integer")
     as.integer(bitwXor(intBits[2], intBits[1]))
   } else if (class(key) == "character") {
-    .Call("stringHashCode", key)
+    # TODO: SPARK-7839 means we might not have the native library available
+    if (is.loaded("stringHashCode")) {
+      .Call("stringHashCode", key)
+    } else {
+      n <- nchar(key)
+      if (n == 0) {
+        0L
+      } else {
+        asciiVals <- sapply(charToRaw(key), function(x) { strtoi(x, 16L) })
+        hashC <- 0
+        for (k in 1:length(asciiVals)) {
+          hashC <- mult31AndAdd(hashC, asciiVals[k])
+        }
+        as.integer(hashC)
+      }
+    }
   } else {
     warning(paste("Could not hash object, returning 0", sep = ""))
     as.integer(0)
   }
 }
 
+# Helper function used to wrap a 'numeric' value to integer bounds.
+# Useful for implementing C-like integer arithmetic
+wrapInt <- function(value) {
+  if (value > .Machine$integer.max) {
+    value <- value - 2 * .Machine$integer.max - 2
+  } else if (value < -1 * .Machine$integer.max) {
+    value <- 2 * .Machine$integer.max + value + 2
+  }
+  value
+}
+
+# Multiply `val` by 31 and add `addVal` to the result. Ensures that
+# integer-overflows are handled at every step.
+mult31AndAdd <- function(val, addVal) {
+  vec <- c(bitwShiftL(val, c(4,3,2,1,0)), addVal)
+  Reduce(function(a, b) {
+          wrapInt(as.numeric(a) + as.numeric(b))
+         },
+         vec)
+}
+
 # Create a new RDD with serializedMode == "byte".
 # Return itself if already in "byte" format.
 serializeToBytes <- function(rdd) {
diff --git a/R/pkg/src/Makefile b/R/pkg/src-native/Makefile
similarity index 100%
rename from R/pkg/src/Makefile
rename to R/pkg/src-native/Makefile
diff --git a/R/pkg/src/Makefile.win b/R/pkg/src-native/Makefile.win
similarity index 100%
rename from R/pkg/src/Makefile.win
rename to R/pkg/src-native/Makefile.win
diff --git a/R/pkg/src/string_hash_code.c b/R/pkg/src-native/string_hash_code.c
similarity index 100%
rename from R/pkg/src/string_hash_code.c
rename to R/pkg/src-native/string_hash_code.c
diff --git a/make-distribution.sh b/make-distribution.sh
index 8d6e91d67593f..78827341b956c 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -229,6 +229,8 @@ cp "$SPARK_HOME"/conf/*.template "$DISTDIR"/conf
 cp "$SPARK_HOME/README.md" "$DISTDIR"
 cp -r "$SPARK_HOME/bin" "$DISTDIR"
 cp -r "$SPARK_HOME/python" "$DISTDIR"
+mkdir -p "$DISTDIR"/R/lib
+cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR"/R/lib
 cp -r "$SPARK_HOME/sbin" "$DISTDIR"
 cp -r "$SPARK_HOME/ec2" "$DISTDIR"
 

From ad0badba1450295982738934da2cc121cde18213 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Sat, 23 May 2015 02:11:17 -0700
Subject: [PATCH 159/525] [SPARK-7777][Streaming] Handle the case when there is
 no block in a batch

In the old implementation, if a batch has no block, `areWALRecordHandlesPresent` will be `true` and it will return `WriteAheadLogBackedBlockRDD`.

This PR handles this case by returning `WriteAheadLogBackedBlockRDD` or `BlockRDD` according to the configuration.

Author: zsxwing <zsxwing@gmail.com>

Closes #6372 from zsxwing/SPARK-7777 and squashes the following commits:

788f895 [zsxwing] Handle the case when there is no block in a batch
---
 .../dstream/ReceiverInputDStream.scala        | 47 ++++++++++++-------
 .../spark/streaming/InputStreamsSuite.scala   | 31 ++++++++++++
 2 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index 5cfe43a1ce726..e4ff05e12f201 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -73,27 +73,38 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont
         val inputInfo = InputInfo(id, blockInfos.map(_.numRecords).sum)
         ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
 
-        // Are WAL record handles present with all the blocks
-        val areWALRecordHandlesPresent = blockInfos.forall { _.walRecordHandleOption.nonEmpty }
+        if (blockInfos.nonEmpty) {
+          // Are WAL record handles present with all the blocks
+          val areWALRecordHandlesPresent = blockInfos.forall { _.walRecordHandleOption.nonEmpty }
 
-        if (areWALRecordHandlesPresent) {
-          // If all the blocks have WAL record handle, then create a WALBackedBlockRDD
-          val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
-          val walRecordHandles = blockInfos.map { _.walRecordHandleOption.get }.toArray
-          new WriteAheadLogBackedBlockRDD[T](
-            ssc.sparkContext, blockIds, walRecordHandles, isBlockIdValid)
-        } else {
-          // Else, create a BlockRDD. However, if there are some blocks with WAL info but not others
-          // then that is unexpected and log a warning accordingly.
-          if (blockInfos.find(_.walRecordHandleOption.nonEmpty).nonEmpty) {
-            if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
-              logError("Some blocks do not have Write Ahead Log information; " +
-                "this is unexpected and data may not be recoverable after driver failures")
-            } else {
-              logWarning("Some blocks have Write Ahead Log information; this is unexpected")
+          if (areWALRecordHandlesPresent) {
+            // If all the blocks have WAL record handle, then create a WALBackedBlockRDD
+            val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
+            val walRecordHandles = blockInfos.map { _.walRecordHandleOption.get }.toArray
+            new WriteAheadLogBackedBlockRDD[T](
+              ssc.sparkContext, blockIds, walRecordHandles, isBlockIdValid)
+          } else {
+            // Else, create a BlockRDD. However, if there are some blocks with WAL info but not
+            // others then that is unexpected and log a warning accordingly.
+            if (blockInfos.find(_.walRecordHandleOption.nonEmpty).nonEmpty) {
+              if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
+                logError("Some blocks do not have Write Ahead Log information; " +
+                  "this is unexpected and data may not be recoverable after driver failures")
+              } else {
+                logWarning("Some blocks have Write Ahead Log information; this is unexpected")
+              }
             }
+            new BlockRDD[T](ssc.sc, blockIds)
+          }
+        } else {
+          // If no block is ready now, creating WriteAheadLogBackedBlockRDD or BlockRDD
+          // according to the configuration
+          if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
+            new WriteAheadLogBackedBlockRDD[T](
+              ssc.sparkContext, Array.empty, Array.empty, Array.empty)
+          } else {
+            new BlockRDD[T](ssc.sc, Array.empty)
           }
-          new BlockRDD[T](ssc.sc, blockIds)
         }
       }
     }
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
index 93e6b0cd7c661..0122514f9374c 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
@@ -39,6 +39,7 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener}
 import org.apache.spark.util.{ManualClock, Utils}
 import org.apache.spark.streaming.dstream.{InputDStream, ReceiverInputDStream}
+import org.apache.spark.streaming.rdd.WriteAheadLogBackedBlockRDD
 import org.apache.spark.streaming.receiver.Receiver
 
 class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
@@ -105,6 +106,36 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
     }
   }
 
+  test("socket input stream - no block in a batch") {
+    withTestServer(new TestServer()) { testServer =>
+      testServer.start()
+
+      withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc =>
+        ssc.addStreamingListener(ssc.progressListener)
+
+        val batchCounter = new BatchCounter(ssc)
+        val networkStream = ssc.socketTextStream(
+          "localhost", testServer.port, StorageLevel.MEMORY_AND_DISK)
+        val outputBuffer = new ArrayBuffer[Seq[String]] with SynchronizedBuffer[Seq[String]]
+        val outputStream = new TestOutputStream(networkStream, outputBuffer)
+        outputStream.register()
+        ssc.start()
+
+        val clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
+        clock.advance(batchDuration.milliseconds)
+
+        // Make sure the first batch is finished
+        if (!batchCounter.waitUntilBatchesCompleted(1, 30000)) {
+          fail("Timeout: cannot finish all batches in 30 seconds")
+        }
+
+        networkStream.generatedRDDs.foreach { case (_, rdd) =>
+          assert(!rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]])
+        }
+      }
+    }
+  }
+
   test("binary records stream") {
     val testDir: File = null
     try {

From efe3bfdf496aa6206ace2697e31dd4c0c3c824fb Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Sat, 23 May 2015 08:30:05 -0700
Subject: [PATCH 160/525] [SPARK-7322, SPARK-7836, SPARK-7822][SQL] DataFrame
 window function related updates

1. ntile should take an integer as parameter.
2. Added Python API (based on #6364)
3. Update documentation of various DataFrame Python functions.

Author: Davies Liu <davies@databricks.com>
Author: Reynold Xin <rxin@databricks.com>

Closes #6374 from rxin/window-final and squashes the following commits:

69004c7 [Reynold Xin] Style fix.
288cea9 [Reynold Xin] Update documentaiton.
7cb8985 [Reynold Xin] Merge pull request #6364 from davies/window
66092b4 [Davies Liu] update docs
ed73cb4 [Reynold Xin] [SPARK-7322][SQL] Improve DataFrame window function documentation.
ef55132 [Davies Liu] Merge branch 'master' of github.com:apache/spark into window4
8936ade [Davies Liu] fix maxint in python 3
2649358 [Davies Liu] update docs
778e2c0 [Davies Liu] SPARK-7836 and SPARK-7822: Python API of window functions
---
 python/pyspark/sql/__init__.py                |  25 ++-
 python/pyspark/sql/column.py                  |  54 +++--
 python/pyspark/sql/context.py                 |   2 -
 python/pyspark/sql/dataframe.py               |   2 +
 python/pyspark/sql/functions.py               | 147 +++++++++++--
 python/pyspark/sql/group.py                   |   2 +
 python/pyspark/sql/tests.py                   |  31 ++-
 python/pyspark/sql/window.py                  | 158 ++++++++++++++
 .../org/apache/spark/sql/functions.scala      | 197 ++++++++----------
 .../sql/hive/HiveDataFrameWindowSuite.scala   |  20 +-
 10 files changed, 464 insertions(+), 174 deletions(-)
 create mode 100644 python/pyspark/sql/window.py

diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 66b0bff2908b7..8fee92ae3aed5 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -18,26 +18,28 @@
 """
 Important classes of Spark SQL and DataFrames:
 
-    - L{SQLContext}
+    - :class:`pyspark.sql.SQLContext`
       Main entry point for :class:`DataFrame` and SQL functionality.
-    - L{DataFrame}
+    - :class:`pyspark.sql.DataFrame`
       A distributed collection of data grouped into named columns.
-    - L{Column}
+    - :class:`pyspark.sql.Column`
       A column expression in a :class:`DataFrame`.
-    - L{Row}
+    - :class:`pyspark.sql.Row`
       A row of data in a :class:`DataFrame`.
-    - L{HiveContext}
+    - :class:`pyspark.sql.HiveContext`
       Main entry point for accessing data stored in Apache Hive.
-    - L{GroupedData}
+    - :class:`pyspark.sql.GroupedData`
       Aggregation methods, returned by :func:`DataFrame.groupBy`.
-    - L{DataFrameNaFunctions}
+    - :class:`pyspark.sql.DataFrameNaFunctions`
       Methods for handling missing data (null values).
-    - L{DataFrameStatFunctions}
+    - :class:`pyspark.sql.DataFrameStatFunctions`
       Methods for statistics functionality.
-    - L{functions}
+    - :class:`pyspark.sql.functions`
       List of built-in functions available for :class:`DataFrame`.
-    - L{types}
+    - :class:`pyspark.sql.types`
       List of data types available.
+    - :class:`pyspark.sql.Window`
+      For working with window functions.
 """
 from __future__ import absolute_import
 
@@ -66,8 +68,9 @@ def deco(f):
 from pyspark.sql.dataframe import DataFrame, SchemaRDD, DataFrameNaFunctions, DataFrameStatFunctions
 from pyspark.sql.group import GroupedData
 from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter
+from pyspark.sql.window import Window, WindowSpec
 
 __all__ = [
     'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
-    'DataFrameNaFunctions', 'DataFrameStatFunctions'
+    'DataFrameNaFunctions', 'DataFrameStatFunctions', 'Window', 'WindowSpec',
 ]
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index baf1ecbd0a2fc..8dc5039f587f0 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -116,6 +116,8 @@ class Column(object):
         df.colName + 1
         1 / df.colName
 
+    .. note:: Experimental
+
     .. versionadded:: 1.3
     """
 
@@ -164,8 +166,9 @@ def __init__(self, jc):
 
     @since(1.3)
     def getItem(self, key):
-        """An expression that gets an item at position `ordinal` out of a list,
-         or gets an item by key out of a dict.
+        """
+        An expression that gets an item at position ``ordinal`` out of a list,
+        or gets an item by key out of a dict.
 
         >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
         >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
@@ -185,7 +188,8 @@ def getItem(self, key):
 
     @since(1.3)
     def getField(self, name):
-        """An expression that gets a field by name in a StructField.
+        """
+        An expression that gets a field by name in a StructField.
 
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
@@ -219,7 +223,7 @@ def __getattr__(self, item):
     @since(1.3)
     def substr(self, startPos, length):
         """
-        Return a :class:`Column` which is a substring of the column
+        Return a :class:`Column` which is a substring of the column.
 
         :param startPos: start position (int or Column)
         :param length:  length of the substring (int or Column)
@@ -242,7 +246,8 @@ def substr(self, startPos, length):
     @ignore_unicode_prefix
     @since(1.3)
     def inSet(self, *cols):
-        """ A boolean expression that is evaluated to true if the value of this
+        """
+        A boolean expression that is evaluated to true if the value of this
         expression is contained by the evaluated values of the arguments.
 
         >>> df[df.name.inSet("Bob", "Mike")].collect()
@@ -268,7 +273,8 @@ def inSet(self, *cols):
 
     @since(1.3)
     def alias(self, *alias):
-        """Returns this column aliased with a new name or names (in the case of expressions that
+        """
+        Returns this column aliased with a new name or names (in the case of expressions that
         return more than one column, such as explode).
 
         >>> df.select(df.age.alias("age2")).collect()
@@ -284,7 +290,7 @@ def alias(self, *alias):
     @ignore_unicode_prefix
     @since(1.3)
     def cast(self, dataType):
-        """ Convert the column into type `dataType`
+        """ Convert the column into type ``dataType``.
 
         >>> df.select(df.age.cast("string").alias('ages')).collect()
         [Row(ages=u'2'), Row(ages=u'5')]
@@ -304,25 +310,24 @@ def cast(self, dataType):
 
     astype = cast
 
-    @ignore_unicode_prefix
     @since(1.3)
     def between(self, lowerBound, upperBound):
-        """ A boolean expression that is evaluated to true if the value of this
+        """
+        A boolean expression that is evaluated to true if the value of this
         expression is between the given columns.
         """
         return (self >= lowerBound) & (self <= upperBound)
 
-    @ignore_unicode_prefix
     @since(1.4)
     def when(self, condition, value):
-        """Evaluates a list of conditions and returns one of multiple possible result expressions.
+        """
+        Evaluates a list of conditions and returns one of multiple possible result expressions.
         If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
 
         See :func:`pyspark.sql.functions.when` for example usage.
 
         :param condition: a boolean :class:`Column` expression.
         :param value: a literal value, or a :class:`Column` expression.
-
         """
         sc = SparkContext._active_spark_context
         if not isinstance(condition, Column):
@@ -331,10 +336,10 @@ def when(self, condition, value):
         jc = sc._jvm.functions.when(condition._jc, v)
         return Column(jc)
 
-    @ignore_unicode_prefix
     @since(1.4)
     def otherwise(self, value):
-        """Evaluates a list of conditions and returns one of multiple possible result expressions.
+        """
+        Evaluates a list of conditions and returns one of multiple possible result expressions.
         If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
 
         See :func:`pyspark.sql.functions.when` for example usage.
@@ -345,6 +350,27 @@ def otherwise(self, value):
         jc = self._jc.otherwise(value)
         return Column(jc)
 
+    @since(1.4)
+    def over(self, window):
+        """
+        Define a windowing column.
+
+        :param window: a :class:`WindowSpec`
+        :return: a Column
+
+        >>> from pyspark.sql import Window
+        >>> window = Window.partitionBy("name").orderBy("age").rowsBetween(-1, 1)
+        >>> from pyspark.sql.functions import rank, min
+        >>> # df.select(rank().over(window), min('age').over(window))
+
+        .. note:: Window functions is only supported with HiveContext in 1.4
+        """
+        from pyspark.sql.window import WindowSpec
+        if not isinstance(window, WindowSpec):
+            raise TypeError("window should be WindowSpec")
+        jc = self._jc.over(window._jspec)
+        return Column(jc)
+
     def __repr__(self):
         return 'Column<%s>' % self._jc.toString().encode('utf8')
 
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 51f12c5bb4198..22f6257dfe02d 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -585,8 +585,6 @@ def read(self):
         Returns a :class:`DataFrameReader` that can be used to read data
         in as a :class:`DataFrame`.
 
-        .. note:: Experimental
-
         >>> sqlContext.read
         <pyspark.sql.readwriter.DataFrameReader object at ...>
         """
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 132db90e69f59..55cad8238ee88 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -62,6 +62,8 @@ class DataFrame(object):
         people.filter(people.age > 30).join(department, people.deptId == department.id)) \
           .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})
 
+    .. note:: Experimental
+
     .. versionadded:: 1.3
     """
 
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 9b0d7f3e6656e..bbf465aca8d4d 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -32,16 +32,21 @@
 
 
 __all__ = [
+    'array',
     'approxCountDistinct',
     'coalesce',
     'countDistinct',
+    'explode',
     'monotonicallyIncreasingId',
     'rand',
     'randn',
     'sparkPartitionId',
+    'struct',
     'udf',
     'when']
 
+__all__ += ['lag', 'lead', 'ntile']
+
 
 def _create_function(name, doc=""):
     """ Create a function for aggregator by name"""
@@ -67,6 +72,17 @@ def _(col1, col2):
     return _
 
 
+def _create_window_function(name, doc=''):
+    """ Create a window function by name """
+    def _():
+        sc = SparkContext._active_spark_context
+        jc = getattr(sc._jvm.functions, name)()
+        return Column(jc)
+    _.__name__ = name
+    _.__doc__ = 'Window function: ' + doc
+    return _
+
+
 _functions = {
     'lit': 'Creates a :class:`Column` of literal value.',
     'col': 'Returns a :class:`Column` based on the given column name.',
@@ -130,15 +146,53 @@ def _(col1, col2):
     'pow': 'Returns the value of the first argument raised to the power of the second argument.'
 }
 
+_window_functions = {
+    'rowNumber':
+        """returns a sequential number starting at 1 within a window partition.
+
+        This is equivalent to the ROW_NUMBER function in SQL.""",
+    'denseRank':
+        """returns the rank of rows within a window partition, without any gaps.
+
+        The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+        sequence when there are ties. That is, if you were ranking a competition using denseRank
+        and had three people tie for second place, you would say that all three were in second
+        place and that the next person came in third.
+
+        This is equivalent to the DENSE_RANK function in SQL.""",
+    'rank':
+        """returns the rank of rows within a window partition.
+
+        The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+        sequence when there are ties. That is, if you were ranking a competition using denseRank
+        and had three people tie for second place, you would say that all three were in second
+        place and that the next person came in third.
+
+        This is equivalent to the RANK function in SQL.""",
+    'cumeDist':
+        """returns the cumulative distribution of values within a window partition,
+        i.e. the fraction of rows that are below the current row.
+
+        This is equivalent to the CUME_DIST function in SQL.""",
+    'percentRank':
+        """returns the relative rank (i.e. percentile) of rows within a window partition.
+
+        This is equivalent to the PERCENT_RANK function in SQL.""",
+}
+
 for _name, _doc in _functions.items():
     globals()[_name] = since(1.3)(_create_function(_name, _doc))
 for _name, _doc in _functions_1_4.items():
     globals()[_name] = since(1.4)(_create_function(_name, _doc))
 for _name, _doc in _binary_mathfunctions.items():
     globals()[_name] = since(1.4)(_create_binary_mathfunction(_name, _doc))
+for _name, _doc in _window_functions.items():
+    globals()[_name] = since(1.4)(_create_window_function(_name, _doc))
 del _name, _doc
 __all__ += _functions.keys()
+__all__ += _functions_1_4.keys()
 __all__ += _binary_mathfunctions.keys()
+__all__ += _window_functions.keys()
 __all__.sort()
 
 
@@ -176,27 +230,6 @@ def approxCountDistinct(col, rsd=None):
     return Column(jc)
 
 
-@since(1.4)
-def explode(col):
-    """Returns a new row for each element in the given array or map.
-
-    >>> from pyspark.sql import Row
-    >>> eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
-    >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect()
-    [Row(anInt=1), Row(anInt=2), Row(anInt=3)]
-
-    >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
-    +---+-----+
-    |key|value|
-    +---+-----+
-    |  a|    b|
-    +---+-----+
-    """
-    sc = SparkContext._active_spark_context
-    jc = sc._jvm.functions.explode(_to_java_column(col))
-    return Column(jc)
-
-
 @since(1.4)
 def coalesce(*cols):
     """Returns the first column that is not null.
@@ -249,6 +282,27 @@ def countDistinct(col, *cols):
     return Column(jc)
 
 
+@since(1.4)
+def explode(col):
+    """Returns a new row for each element in the given array or map.
+
+    >>> from pyspark.sql import Row
+    >>> eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
+    >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect()
+    [Row(anInt=1), Row(anInt=2), Row(anInt=3)]
+
+    >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
+    +---+-----+
+    |key|value|
+    +---+-----+
+    |  a|    b|
+    +---+-----+
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.explode(_to_java_column(col))
+    return Column(jc)
+
+
 @since(1.4)
 def monotonicallyIncreasingId():
     """A column that generates monotonically increasing 64-bit integers.
@@ -258,7 +312,7 @@ def monotonicallyIncreasingId():
     within each partition in the lower 33 bits. The assumption is that the data frame has
     less than 1 billion partitions, and each partition has less than 8 billion records.
 
-    As an example, consider a [[DataFrame]] with two partitions, each with 3 records.
+    As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.
     This expression would return the following IDs:
     0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
 
@@ -349,6 +403,55 @@ def when(condition, value):
     return Column(jc)
 
 
+@since(1.4)
+def lag(col, count=1, default=None):
+    """
+    Window function: returns the value that is `offset` rows before the current row, and
+    `defaultValue` if there is less than `offset` rows before the current row. For example,
+    an `offset` of one will return the previous row at any given point in the window partition.
+
+    This is equivalent to the LAG function in SQL.
+
+    :param col: name of column or expression
+    :param count: number of row to extend
+    :param default: default value
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.lag(_to_java_column(col), count, default))
+
+
+@since(1.4)
+def lead(col, count=1, default=None):
+    """
+    Window function: returns the value that is `offset` rows after the current row, and
+    `defaultValue` if there is less than `offset` rows after the current row. For example,
+    an `offset` of one will return the next row at any given point in the window partition.
+
+    This is equivalent to the LEAD function in SQL.
+
+    :param col: name of column or expression
+    :param count: number of row to extend
+    :param default: default value
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.lead(_to_java_column(col), count, default))
+
+
+@since(1.4)
+def ntile(n):
+    """
+    Window function: returns a group id from 1 to `n` (inclusive) in a round-robin fashion in
+    a window partition. Fow example, if `n` is 3, the first row will get 1, the second row will
+    get 2, the third row will get 3, and the fourth row will get 1...
+
+    This is equivalent to the NTILE function in SQL.
+
+    :param n: an integer
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.ntile(int(n)))
+
+
 class UserDefinedFunction(object):
     """
     User defined function in Python
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 4da472a577eae..5a37a673ee80c 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -49,6 +49,8 @@ class GroupedData(object):
     A set of methods for aggregations on a :class:`DataFrame`,
     created by :func:`DataFrame.groupBy`.
 
+    .. note:: Experimental
+
     .. versionadded:: 1.3
     """
 
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 7e349962416c9..5c53c3a8ed4f1 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -44,6 +44,7 @@
 from pyspark.sql.types import UserDefinedType, _infer_type
 from pyspark.tests import ReusedPySparkTestCase
 from pyspark.sql.functions import UserDefinedFunction
+from pyspark.sql.window import Window
 
 
 class ExamplePointUDT(UserDefinedType):
@@ -743,11 +744,9 @@ def setUpClass(cls):
         try:
             cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
         except py4j.protocol.Py4JError:
-            cls.sqlCtx = None
-            return
+            raise unittest.SkipTest("Hive is not available")
         except TypeError:
-            cls.sqlCtx = None
-            return
+            raise unittest.SkipTest("Hive is not available")
         os.unlink(cls.tempdir.name)
         _scala_HiveContext =\
             cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc())
@@ -761,9 +760,6 @@ def tearDownClass(cls):
         shutil.rmtree(cls.tempdir.name, ignore_errors=True)
 
     def test_save_and_load_table(self):
-        if self.sqlCtx is None:
-            return  # no hive available, skipped
-
         df = self.df
         tmpPath = tempfile.mkdtemp()
         shutil.rmtree(tmpPath)
@@ -805,6 +801,27 @@ def test_save_and_load_table(self):
 
         shutil.rmtree(tmpPath)
 
+    def test_window_functions(self):
+        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
+        w = Window.partitionBy("value").orderBy("key")
+        from pyspark.sql import functions as F
+        sel = df.select(df.value, df.key,
+                        F.max("key").over(w.rowsBetween(0, 1)),
+                        F.min("key").over(w.rowsBetween(0, 1)),
+                        F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))),
+                        F.rowNumber().over(w),
+                        F.rank().over(w),
+                        F.denseRank().over(w),
+                        F.ntile(2).over(w))
+        rs = sorted(sel.collect())
+        expected = [
+            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
+            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
+            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
+            ("2", 2, 2, 2, 3, 3, 3, 2, 2)
+        ]
+        for r, ex in zip(rs, expected):
+            self.assertEqual(tuple(r), ex[:len(r)])
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/pyspark/sql/window.py b/python/pyspark/sql/window.py
new file mode 100644
index 0000000000000..0a0e006bdf83a
--- /dev/null
+++ b/python/pyspark/sql/window.py
@@ -0,0 +1,158 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.sql import since
+from pyspark.sql.column import _to_seq, _to_java_column
+
+__all__ = ["Window", "WindowSpec"]
+
+
+def _to_java_cols(cols):
+    sc = SparkContext._active_spark_context
+    if len(cols) == 1 and isinstance(cols[0], list):
+        cols = cols[0]
+    return _to_seq(sc, cols, _to_java_column)
+
+
+class Window(object):
+
+    """
+    Utility functions for defining window in DataFrames.
+
+    For example:
+
+    >>> # PARTITION BY country ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    >>> window = Window.partitionBy("country").orderBy("date").rowsBetween(-sys.maxsize, 0)
+
+    >>> # PARTITION BY country ORDER BY date RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING
+    >>> window = Window.orderBy("date").partitionBy("country").rangeBetween(-3, 3)
+
+    .. note:: Experimental
+
+    .. versionadded:: 1.4
+    """
+    @staticmethod
+    @since(1.4)
+    def partitionBy(*cols):
+        """
+        Creates a :class:`WindowSpec` with the partitioning defined.
+        """
+        sc = SparkContext._active_spark_context
+        jspec = sc._jvm.org.apache.spark.sql.expressions.Window.partitionBy(_to_java_cols(cols))
+        return WindowSpec(jspec)
+
+    @staticmethod
+    @since(1.4)
+    def orderBy(*cols):
+        """
+        Creates a :class:`WindowSpec` with the partitioning defined.
+        """
+        sc = SparkContext._active_spark_context
+        jspec = sc._jvm.org.apache.spark.sql.expressions.Window.partitionBy(_to_java_cols(cols))
+        return WindowSpec(jspec)
+
+
+class WindowSpec(object):
+    """
+    A window specification that defines the partitioning, ordering,
+    and frame boundaries.
+
+    Use the static methods in :class:`Window` to create a :class:`WindowSpec`.
+
+    .. note:: Experimental
+
+    .. versionadded:: 1.4
+    """
+
+    _JAVA_MAX_LONG = (1 << 63) - 1
+    _JAVA_MIN_LONG = - (1 << 63)
+
+    def __init__(self, jspec):
+        self._jspec = jspec
+
+    @since(1.4)
+    def partitionBy(self, *cols):
+        """
+        Defines the partitioning columns in a :class:`WindowSpec`.
+
+        :param cols: names of columns or expressions
+        """
+        return WindowSpec(self._jspec.partitionBy(_to_java_cols(cols)))
+
+    @since(1.4)
+    def orderBy(self, *cols):
+        """
+        Defines the ordering columns in a :class:`WindowSpec`.
+
+        :param cols: names of columns or expressions
+        """
+        return WindowSpec(self._jspec.orderBy(_to_java_cols(cols)))
+
+    @since(1.4)
+    def rowsBetween(self, start, end):
+        """
+        Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive).
+
+        Both `start` and `end` are relative positions from the current row.
+        For example, "0" means "current row", while "-1" means the row before
+        the current row, and "5" means the fifth row after the current row.
+
+        :param start: boundary start, inclusive.
+                      The frame is unbounded if this is ``-sys.maxsize`` (or lower).
+        :param end: boundary end, inclusive.
+                    The frame is unbounded if this is ``sys.maxsize`` (or higher).
+        """
+        if start <= -sys.maxsize:
+            start = self._JAVA_MIN_LONG
+        if end >= sys.maxsize:
+            end = self._JAVA_MAX_LONG
+        return WindowSpec(self._jspec.rowsBetween(start, end))
+
+    @since(1.4)
+    def rangeBetween(self, start, end):
+        """
+        Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive).
+
+        Both `start` and `end` are relative from the current row. For example,
+        "0" means "current row", while "-1" means one off before the current row,
+        and "5" means the five off after the current row.
+
+        :param start: boundary start, inclusive.
+                      The frame is unbounded if this is ``-sys.maxsize`` (or lower).
+        :param end: boundary end, inclusive.
+                    The frame is unbounded if this is ``sys.maxsize`` (or higher).
+        """
+        if start <= -sys.maxsize:
+            start = self._JAVA_MIN_LONG
+        if end >= sys.maxsize:
+            end = self._JAVA_MAX_LONG
+        return WindowSpec(self._jspec.rangeBetween(start, end))
+
+
+def _test():
+    import doctest
+    SparkContext('local[4]', 'PythonTest')
+    (failure_count, test_count) = doctest.testmod()
+    if failure_count:
+        exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 8775be724e0f9..9a23cfb89ca12 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -326,168 +326,135 @@ object functions {
   //////////////////////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Window function: returns the lag value of current row of the expression,
-   * null when the current row extends before the beginning of the window.
+   * Window function: returns the value that is `offset` rows before the current row, and
+   * `null` if there is less than `offset` rows before the current row. For example,
+   * an `offset` of one will return the previous row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lag(columnName: String): Column = {
-    lag(columnName, 1)
-  }
-
-  /**
-   * Window function: returns the lag value of current row of the column,
-   * null when the current row extends before the beginning of the window.
+   * This is equivalent to the LAG function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lag(e: Column): Column = {
-    lag(e, 1)
+  def lag(e: Column, offset: Int): Column = {
+    lag(e, offset, null)
   }
 
   /**
-   * Window function: returns the lag values of current row of the expression,
-   * null when the current row extends before the beginning of the window.
+   * Window function: returns the value that is `offset` rows before the current row, and
+   * `null` if there is less than `offset` rows before the current row. For example,
+   * an `offset` of one will return the previous row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lag(e: Column, count: Int): Column = {
-    lag(e, count, null)
-  }
-
-  /**
-   * Window function: returns the lag values of current row of the column,
-   * null when the current row extends before the beginning of the window.
+   * This is equivalent to the LAG function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lag(columnName: String, count: Int): Column = {
-    lag(columnName, count, null)
+  def lag(columnName: String, offset: Int): Column = {
+    lag(columnName, offset, null)
   }
 
   /**
-   * Window function: returns the lag values of current row of the column,
-   * given default value when the current row extends before the beginning
-   * of the window.
+   * Window function: returns the value that is `offset` rows before the current row, and
+   * `defaultValue` if there is less than `offset` rows before the current row. For example,
+   * an `offset` of one will return the previous row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lag(columnName: String, count: Int, defaultValue: Any): Column = {
-    lag(Column(columnName), count, defaultValue)
-  }
-
-  /**
-   * Window function: returns the lag values of current row of the expression,
-   * given default value when the current row extends before the beginning
-   * of the window.
+   * This is equivalent to the LAG function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lag(e: Column, count: Int, defaultValue: Any): Column = {
-    UnresolvedWindowFunction("lag", e.expr :: Literal(count) :: Literal(defaultValue) :: Nil)
+  def lag(columnName: String, offset: Int, defaultValue: Any): Column = {
+    lag(Column(columnName), offset, defaultValue)
   }
 
   /**
-   * Window function: returns the lead value of current row of the column,
-   * null when the current row extends before the end of the window.
+   * Window function: returns the value that is `offset` rows before the current row, and
+   * `defaultValue` if there is less than `offset` rows before the current row. For example,
+   * an `offset` of one will return the previous row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lead(columnName: String): Column = {
-    lead(columnName, 1)
-  }
-
-  /**
-   * Window function: returns the lead value of current row of the expression,
-   * null when the current row extends before the end of the window.
+   * This is equivalent to the LAG function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lead(e: Column): Column = {
-    lead(e, 1)
+  def lag(e: Column, offset: Int, defaultValue: Any): Column = {
+    UnresolvedWindowFunction("lag", e.expr :: Literal(offset) :: Literal(defaultValue) :: Nil)
   }
 
   /**
-   * Window function: returns the lead values of current row of the column,
-   * null when the current row extends before the end of the window.
+   * Window function: returns the value that is `offset` rows after the current row, and
+   * `null` if there is less than `offset` rows after the current row. For example,
+   * an `offset` of one will return the next row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lead(columnName: String, count: Int): Column = {
-    lead(columnName, count, null)
-  }
-
-  /**
-   * Window function: returns the lead values of current row of the expression,
-   * null when the current row extends before the end of the window.
+   * This is equivalent to the LEAD function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lead(e: Column, count: Int): Column = {
-    lead(e, count, null)
+  def lead(columnName: String, offset: Int): Column = {
+    lead(columnName, offset, null)
   }
 
   /**
-   * Window function: returns the lead values of current row of the column,
-   * given default value when the current row extends before the end of the window.
+   * Window function: returns the value that is `offset` rows after the current row, and
+   * `null` if there is less than `offset` rows after the current row. For example,
+   * an `offset` of one will return the next row at any given point in the window partition.
+   *
+   * This is equivalent to the LEAD function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lead(columnName: String, count: Int, defaultValue: Any): Column = {
-    lead(Column(columnName), count, defaultValue)
+  def lead(e: Column, offset: Int): Column = {
+    lead(e, offset, null)
   }
 
   /**
-   * Window function: returns the lead values of current row of the expression,
-   * given default value when the current row extends before the end of the window.
+   * Window function: returns the value that is `offset` rows after the current row, and
+   * `defaultValue` if there is less than `offset` rows after the current row. For example,
+   * an `offset` of one will return the next row at any given point in the window partition.
+   *
+   * This is equivalent to the LEAD function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lead(e: Column, count: Int, defaultValue: Any): Column = {
-    UnresolvedWindowFunction("lead", e.expr :: Literal(count) :: Literal(defaultValue) :: Nil)
+  def lead(columnName: String, offset: Int, defaultValue: Any): Column = {
+    lead(Column(columnName), offset, defaultValue)
   }
 
   /**
-   * NTILE for specified expression.
-   * NTILE allows easy calculation of tertiles, quartiles, deciles and other
-   * common summary statistics. This function divides an ordered partition into a specified
-   * number of groups called buckets and assigns a bucket number to each row in the partition.
+   * Window function: returns the value that is `offset` rows after the current row, and
+   * `defaultValue` if there is less than `offset` rows after the current row. For example,
+   * an `offset` of one will return the next row at any given point in the window partition.
+   *
+   * This is equivalent to the LEAD function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def ntile(e: Column): Column = {
-    UnresolvedWindowFunction("ntile", e.expr :: Nil)
+  def lead(e: Column, offset: Int, defaultValue: Any): Column = {
+    UnresolvedWindowFunction("lead", e.expr :: Literal(offset) :: Literal(defaultValue) :: Nil)
   }
 
   /**
-   * NTILE for specified column.
-   * NTILE allows easy calculation of tertiles, quartiles, deciles and other
-   * common summary statistics. This function divides an ordered partition into a specified
-   * number of groups called buckets and assigns a bucket number to each row in the partition.
+   * Window function: returns the ntile group id (from 1 to `n` inclusive) in an ordered window
+   * partition. Fow example, if `n` is 4, the first quarter of the rows will get value 1, the second
+   * quarter will get 2, the third quarter will get 3, and the last quarter will get 4.
+   *
+   * This is equivalent to the NTILE function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def ntile(columnName: String): Column = {
-    ntile(Column(columnName))
+  def ntile(n: Int): Column = {
+    UnresolvedWindowFunction("ntile", lit(n).expr :: Nil)
   }
 
   /**
-   * Assigns a unique number (sequentially, starting from 1, as defined by ORDER BY) to each
-   * row within the partition.
+   * Window function: returns a sequential number starting at 1 within a window partition.
+   *
+   * This is equivalent to the ROW_NUMBER function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
@@ -497,11 +464,15 @@ object functions {
   }
 
   /**
-   * The difference between RANK and DENSE_RANK is that DENSE_RANK leaves no gaps in ranking
-   * sequence when there are ties. That is, if you were ranking a competition using DENSE_RANK
+   * Window function: returns the rank of rows within a window partition, without any gaps.
+   *
+   * The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+   * sequence when there are ties. That is, if you were ranking a competition using denseRank
    * and had three people tie for second place, you would say that all three were in second
    * place and that the next person came in third.
    *
+   * This is equivalent to the DENSE_RANK function in SQL.
+   *
    * @group window_funcs
    * @since 1.4.0
    */
@@ -510,11 +481,15 @@ object functions {
   }
 
   /**
-   * The difference between RANK and DENSE_RANK is that DENSE_RANK leaves no gaps in ranking
-   * sequence when there are ties. That is, if you were ranking a competition using DENSE_RANK
+   * Window function: returns the rank of rows within a window partition.
+   *
+   * The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+   * sequence when there are ties. That is, if you were ranking a competition using denseRank
    * and had three people tie for second place, you would say that all three were in second
    * place and that the next person came in third.
    *
+   * This is equivalent to the RANK function in SQL.
+   *
    * @group window_funcs
    * @since 1.4.0
    */
@@ -523,10 +498,16 @@ object functions {
   }
 
   /**
-   * CUME_DIST (defined as the inverse of percentile in some statistical books) computes
-   * the position of a specified value relative to a set of values.
-   * To compute the CUME_DIST of a value x in a set S of size N, you use the formula:
-   * CUME_DIST(x) = number of values in S coming before and including x in the specified order / N
+   * Window function: returns the cumulative distribution of values within a window partition,
+   * i.e. the fraction of rows that are below the current row.
+   *
+   * {{{
+   *   N = total number of rows in the partition
+   *   cumeDist(x) = number of values before (and including) x / N
+   * }}}
+   *
+   *
+   * This is equivalent to the CUME_DIST function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
@@ -536,10 +517,14 @@ object functions {
   }
 
   /**
-   * PERCENT_RANK is similar to CUME_DIST, but it uses rank values rather than row counts
-   * in its numerator.
-   * The formula:
-   * (rank of row in its partition - 1) / (number of rows in the partition - 1)
+   * Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
+   *
+   * This is computed by:
+   * {{{
+   *   (rank of row in its partition - 1) / (number of rows in the partition - 1)
+   * }}}
+   *
+   * This is equivalent to the PERCENT_RANK function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
index 6cea6776c8ca6..efb3f2545db84 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
@@ -31,8 +31,8 @@ class HiveDataFrameWindowSuite extends QueryTest {
 
     checkAnswer(
       df.select(
-        lead("key").over(w),
-        lead("value").over(w)),
+        lead("key", 1).over(w),
+        lead("value", 1).over(w)),
       Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil)
   }
 
@@ -42,8 +42,8 @@ class HiveDataFrameWindowSuite extends QueryTest {
 
     checkAnswer(
       df.select(
-        lead("key").over(w),
-        lead("value").over(w)),
+        lead("key", 1).over(w),
+        lead("value", 1).over(w)),
       Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil)
   }
 
@@ -53,7 +53,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
 
     checkAnswer(
       df.select(
-        lead("value").over(Window.partitionBy($"key").orderBy($"value"))),
+        lead("value", 1).over(Window.partitionBy($"key").orderBy($"value"))),
       sql(
         """SELECT
           | lead(value) OVER (PARTITION BY key ORDER BY value)
@@ -66,9 +66,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
 
     checkAnswer(
       df.select(
-        lag("value").over(
-          Window.partitionBy($"key")
-          .orderBy($"value"))),
+        lag("value", 1).over(Window.partitionBy($"key").orderBy($"value"))),
       sql(
         """SELECT
           | lag(value) OVER (PARTITION BY key ORDER BY value)
@@ -112,8 +110,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
         mean("key").over(Window.partitionBy("value").orderBy("key")),
         count("key").over(Window.partitionBy("value").orderBy("key")),
         sum("key").over(Window.partitionBy("value").orderBy("key")),
-        ntile("key").over(Window.partitionBy("value").orderBy("key")),
-        ntile($"key").over(Window.partitionBy("value").orderBy("key")),
+        ntile(2).over(Window.partitionBy("value").orderBy("key")),
         rowNumber().over(Window.partitionBy("value").orderBy("key")),
         denseRank().over(Window.partitionBy("value").orderBy("key")),
         rank().over(Window.partitionBy("value").orderBy("key")),
@@ -127,8 +124,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
            |avg(key) over (partition by value order by key),
            |count(key) over (partition by value order by key),
            |sum(key) over (partition by value order by key),
-           |ntile(key) over (partition by value order by key),
-           |ntile(key) over (partition by value order by key),
+           |ntile(2) over (partition by value order by key),
            |row_number() over (partition by value order by key),
            |dense_rank() over (partition by value order by key),
            |rank() over (partition by value order by key),

From be47af1bdba469f84775c2b5936f8cb956c7c02b Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Sat, 23 May 2015 09:07:14 -0700
Subject: [PATCH 161/525] [SPARK-7840] add insertInto() to Writer

Add tests later.

Author: Davies Liu <davies@databricks.com>

Closes #6375 from davies/insertInto and squashes the following commits:

826423e [Davies Liu] add insertInto() to Writer
---
 python/pyspark/sql/dataframe.py  |  2 +-
 python/pyspark/sql/readwriter.py | 22 +++++++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 55cad8238ee88..936487519a645 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -163,7 +163,7 @@ def insertInto(self, tableName, overwrite=False):
 
         Optionally overwriting any existing data.
         """
-        self._jdf.insertInto(tableName, overwrite)
+        self.write.insertInto(tableName, overwrite)
 
     @since(1.3)
     def saveAsTable(self, tableName, source=None, mode="error", **options):
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 02b3aab2b12e4..b6fd413bec7db 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -226,17 +226,25 @@ def save(self, path=None, format=None, mode="error", **options):
         else:
             jwrite.save(path)
 
+    def insertInto(self, tableName, overwrite=False):
+        """
+        Inserts the content of the :class:`DataFrame` to the specified table.
+        It requires that the schema of the class:`DataFrame` is the same as the
+        schema of the table.
+
+        Optionally overwriting any existing data.
+        """
+        self._jwrite.mode("overwrite" if overwrite else "append").insertInto(tableName)
+
     @since(1.4)
     def saveAsTable(self, name, format=None, mode="error", **options):
         """
-        Saves the contents of this :class:`DataFrame` to a data source as a table.
-
-        The data source is specified by the ``source`` and a set of ``options``.
-        If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
+        Saves the content of the :class:`DataFrame` as the specified table.
 
-        Additionally, mode is used to specify the behavior of the saveAsTable operation when
-        table already exists in the data source. There are four modes:
+        In the case the table already exists, behavior of this function depends on the
+        save mode, specified by the `mode` function (default to throwing an exception).
+        When `mode` is `Overwrite`, the schema of the [[DataFrame]] does not need to be
+        the same as that of the existing table.
 
         * `append`: Append contents of this :class:`DataFrame` to existing data.
         * `overwrite`: Overwrite existing data.

From a4df0f2d84ff24318b139db534521141d9d4d593 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Sat, 23 May 2015 09:14:07 -0700
Subject: [PATCH 162/525] Fix install jira-python

jira-pytyhon package should be installed by

  sudo pip install jira

cc pwendell

Author: Davies Liu <davies@databricks.com>

Closes #6367 from davies/fix_jira_python2 and squashes the following commits:

fbb3c8e [Davies Liu] Fix install jira-python
---
 dev/create-release/releaseutils.py | 2 +-
 dev/github_jira_sync.py            | 2 +-
 dev/merge_spark_pr.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
index 26221b270394e..51ab25a6a5bd8 100755
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@@ -27,7 +27,7 @@
     from jira.exceptions import JIRAError
 except ImportError:
     print "This tool requires the jira-python library"
-    print "Install using 'sudo pip install jira-python'"
+    print "Install using 'sudo pip install jira'"
     sys.exit(-1)
 
 try:
diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py
index ff1e39664ee04..287f0ca24a7df 100755
--- a/dev/github_jira_sync.py
+++ b/dev/github_jira_sync.py
@@ -28,7 +28,7 @@
     import jira.client
 except ImportError:
     print "This tool requires the jira-python library"
-    print "Install using 'sudo pip install jira-python'"
+    print "Install using 'sudo pip install jira'"
     sys.exit(-1)
 
 # User facing configs
diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index 1c126f50bf095..787c5cc8e892d 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -426,7 +426,7 @@ def main():
             print "JIRA_USERNAME and JIRA_PASSWORD not set"
             print "Exiting without trying to close the associated JIRA."
     else:
-        print "Could not find jira-python library. Run 'sudo pip install jira-python' to install."
+        print "Could not find jira-python library. Run 'sudo pip install jira' to install."
         print "Exiting without trying to close the associated JIRA."
 
 if __name__ == "__main__":

From 2b7e63585d61be2dab78b70af3867cda3983d5b1 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Sat, 23 May 2015 09:48:20 -0700
Subject: [PATCH 163/525] [SPARK-7654] [SQL] Move insertInto into reader/writer
 interface.

This one continues the work of https://github.com/apache/spark/pull/6216.

Author: Yin Huai <yhuai@databricks.com>
Author: Reynold Xin <rxin@databricks.com>

Closes #6366 from yhuai/insert and squashes the following commits:

3d717fb [Yin Huai] Use insertInto to handle the casue when table exists and Append is used for saveAsTable.
56d2540 [Yin Huai] Add PreWriteCheck to HiveContext's analyzer.
c636e35 [Yin Huai] Remove unnecessary empty lines.
cf83837 [Yin Huai] Move insertInto to write. Also, remove the partition columns from InsertIntoHadoopFsRelation.
0841a54 [Reynold Xin] Removed experimental tag for deprecated methods.
33ed8ef [Reynold Xin] [SPARK-7654][SQL] Move insertInto into reader/writer interface.
---
 .../org/apache/spark/sql/DataFrame.scala      | 52 +++++++--------
 .../apache/spark/sql/DataFrameReader.scala    | 18 +----
 .../apache/spark/sql/DataFrameWriter.scala    | 66 ++++++++++++++++---
 .../sql/parquet/ParquetTableSupport.scala     |  2 +-
 .../sql/sources/DataSourceStrategy.scala      |  5 +-
 .../apache/spark/sql/sources/commands.scala   |  2 +-
 .../org/apache/spark/sql/sources/ddl.scala    |  1 -
 .../org/apache/spark/sql/sources/rules.scala  | 19 +++++-
 .../apache/spark/sql/hive/HiveContext.scala   |  4 ++
 .../sql/hive/InsertIntoHiveTableSuite.scala   |  6 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala  |  8 +--
 .../sql/hive/execution/SQLQuerySuite.scala    |  8 +--
 .../apache/spark/sql/hive/parquetSuites.scala |  4 +-
 .../sql/sources/hadoopFsRelationSuites.scala  | 10 ---
 14 files changed, 116 insertions(+), 89 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 3ec1c4a2f1027..f968577bc5848 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1394,28 +1394,6 @@ class DataFrame private[sql](
   @Experimental
   def write: DataFrameWriter = new DataFrameWriter(this)
 
-  /**
-   * :: Experimental ::
-   * Adds the rows from this RDD to the specified table, optionally overwriting the existing data.
-   * @group output
-   * @since 1.3.0
-   */
-  @Experimental
-  def insertInto(tableName: String, overwrite: Boolean): Unit = {
-    sqlContext.executePlan(InsertIntoTable(UnresolvedRelation(Seq(tableName)),
-      Map.empty, logicalPlan, overwrite, ifNotExists = false)).toRdd
-  }
-
-  /**
-   * :: Experimental ::
-   * Adds the rows from this RDD to the specified table.
-   * Throws an exception if the table already exists.
-   * @group output
-   * @since 1.3.0
-   */
-  @Experimental
-  def insertInto(tableName: String): Unit = insertInto(tableName, overwrite = false)
-
   /**
    * Returns the content of the [[DataFrame]] as a RDD of JSON strings.
    * @group rdd
@@ -1551,13 +1529,7 @@ class DataFrame private[sql](
    */
   @deprecated("Use write.mode(mode).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, mode: SaveMode): Unit = {
-    if (sqlContext.catalog.tableExists(Seq(tableName)) && mode == SaveMode.Append) {
-      // If table already exists and the save mode is Append,
-      // we will just call insertInto to append the contents of this DataFrame.
-      insertInto(tableName, overwrite = false)
-    } else {
-      write.mode(mode).saveAsTable(tableName)
-    }
+    write.mode(mode).saveAsTable(tableName)
   }
 
   /**
@@ -1713,9 +1685,29 @@ class DataFrame private[sql](
     write.format(source).mode(mode).options(options).save()
   }
 
+
+  /**
+   * Adds the rows from this RDD to the specified table, optionally overwriting the existing data.
+   * @group output
+   */
+  @deprecated("Use write.mode(SaveMode.Append|SaveMode.Overwrite).saveAsTable(tableName)", "1.4.0")
+  def insertInto(tableName: String, overwrite: Boolean): Unit = {
+    write.mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append).insertInto(tableName)
+  }
+
+  /**
+   * Adds the rows from this RDD to the specified table.
+   * Throws an exception if the table already exists.
+   * @group output
+   */
+  @deprecated("Use write.mode(SaveMode.Append).saveAsTable(tableName)", "1.4.0")
+  def insertInto(tableName: String): Unit = {
+    write.mode(SaveMode.Append).insertInto(tableName)
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   ////////////////////////////////////////////////////////////////////////////
-  // End of eeprecated methods
+  // End of deprecated methods
   ////////////////////////////////////////////////////////////////////////////
   ////////////////////////////////////////////////////////////////////////////
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 381c10f48f3c3..b44d4c86ac5d3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -94,20 +94,6 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
     this
   }
 
-  /**
-   * Specifies the input partitioning. If specified, the underlying data source does not need to
-   * discover the data partitioning scheme, and thus can speed up very large inputs.
-   *
-   * This is only applicable for Parquet at the moment.
-   *
-   * @since 1.4.0
-   */
-  @scala.annotation.varargs
-  def partitionBy(colNames: String*): DataFrameReader = {
-    this.partitioningColumns = Option(colNames)
-    this
-  }
-
   /**
    * Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by
    * a local or distributed file system).
@@ -128,7 +114,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
     val resolved = ResolvedDataSource(
       sqlContext,
       userSpecifiedSchema = userSpecifiedSchema,
-      partitionColumns = partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
+      partitionColumns = Array.empty[String],
       provider = source,
       options = extraOptions.toMap)
     DataFrame(sqlContext, LogicalRelation(resolved.relation))
@@ -300,6 +286,4 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
 
   private var extraOptions = new scala.collection.mutable.HashMap[String, String]
 
-  private var partitioningColumns: Option[Seq[String]] = None
-
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index f2e721d4db271..5548b26cb8f80 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -20,6 +20,8 @@ package org.apache.spark.sql
 import java.util.Properties
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
+import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
 import org.apache.spark.sql.jdbc.{JDBCWriteDetails, JdbcUtils}
 import org.apache.spark.sql.sources.{ResolvedDataSource, CreateTableUsingAsSelect}
 
@@ -148,22 +150,66 @@ final class DataFrameWriter private[sql](df: DataFrame) {
       df)
   }
 
+  /**
+   * Inserts the content of the [[DataFrame]] to the specified table. It requires that
+   * the schema of the [[DataFrame]] is the same as the schema of the table.
+   *
+   * Because it inserts data to an existing table, format or options will be ignored.
+   *
+   * @since 1.4.0
+   */
+  def insertInto(tableName: String): Unit = {
+    val partitions =
+      partitioningColumns.map(_.map(col => col -> (None: Option[String])).toMap)
+    val overwrite = (mode == SaveMode.Overwrite)
+    df.sqlContext.executePlan(InsertIntoTable(
+      UnresolvedRelation(Seq(tableName)),
+      partitions.getOrElse(Map.empty[String, Option[String]]),
+      df.logicalPlan,
+      overwrite,
+      ifNotExists = false)).toRdd
+  }
+
   /**
    * Saves the content of the [[DataFrame]] as the specified table.
    *
+   * In the case the table already exists, behavior of this function depends on the
+   * save mode, specified by the `mode` function (default to throwing an exception).
+   * When `mode` is `Overwrite`, the schema of the [[DataFrame]] does not need to be
+   * the same as that of the existing table.
+   * When `mode` is `Append`, the schema of the [[DataFrame]] need to be
+   * the same as that of the existing table, and format or options will be ignored.
+   *
    * @since 1.4.0
    */
   def saveAsTable(tableName: String): Unit = {
-    val cmd =
-      CreateTableUsingAsSelect(
-        tableName,
-        source,
-        temporary = false,
-        partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
-        mode,
-        extraOptions.toMap,
-        df.logicalPlan)
-    df.sqlContext.executePlan(cmd).toRdd
+    if (df.sqlContext.catalog.tableExists(tableName :: Nil) && mode != SaveMode.Overwrite) {
+      mode match {
+        case SaveMode.Ignore =>
+          // Do nothing
+
+        case SaveMode.ErrorIfExists =>
+          throw new AnalysisException(s"Table $tableName already exists.")
+
+        case SaveMode.Append =>
+          // If it is Append, we just ask insertInto to handle it. We will not use insertInto
+          // to handle saveAsTable with Overwrite because saveAsTable can change the schema of
+          // the table. But, insertInto with Overwrite requires the schema of data be the same
+          // the schema of the table.
+          insertInto(tableName)
+      }
+    } else {
+      val cmd =
+        CreateTableUsingAsSelect(
+          tableName,
+          source,
+          temporary = false,
+          partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
+          mode,
+          extraOptions.toMap,
+          df.logicalPlan)
+      df.sqlContext.executePlan(cmd).toRdd
+    }
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index c45c431438efc..70a220cc43ab9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -129,7 +129,7 @@ private[parquet] object RowReadSupport {
 }
 
 /**
- * A `parquet.hadoop.api.WriteSupport` for Row ojects.
+ * A `parquet.hadoop.api.WriteSupport` for Row objects.
  */
 private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index c03649d00bbae..dacd967cff856 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -105,10 +105,9 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       execution.ExecutedCommand(InsertIntoDataSource(l, query, overwrite)) :: Nil
 
     case i @ logical.InsertIntoTable(
-      l @ LogicalRelation(t: HadoopFsRelation), part, query, overwrite, false) if part.isEmpty =>
+      l @ LogicalRelation(t: HadoopFsRelation), part, query, overwrite, false) =>
       val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append
-      execution.ExecutedCommand(
-        InsertIntoHadoopFsRelation(t, query, Array.empty[String], mode)) :: Nil
+      execution.ExecutedCommand(InsertIntoHadoopFsRelation(t, query, mode)) :: Nil
 
     case _ => Nil
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index 498f7538d4f55..c3674a8c76be8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -61,7 +61,6 @@ private[sql] case class InsertIntoDataSource(
 private[sql] case class InsertIntoHadoopFsRelation(
     @transient relation: HadoopFsRelation,
     @transient query: LogicalPlan,
-    partitionColumns: Array[String],
     mode: SaveMode)
   extends RunnableCommand {
 
@@ -100,6 +99,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
         relation.schema,
         needsConversion = false)
 
+      val partitionColumns = relation.partitionColumns.fieldNames
       if (partitionColumns.isEmpty) {
         insert(new DefaultWriterContainer(relation, job), df)
       } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index 5e723122eeab1..ca30b8e74626f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -335,7 +335,6 @@ private[sql] object ResolvedDataSource {
           InsertIntoHadoopFsRelation(
             r,
             project,
-            partitionColumns.toArray,
             mode)).toRdd
         r
       case _ =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
index ab33125b74c17..a3fd7f13b3db7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
@@ -35,9 +35,9 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] {
       // Wait until children are resolved.
       case p: LogicalPlan if !p.childrenResolved => p
 
-      // We are inserting into an InsertableRelation.
+      // We are inserting into an InsertableRelation or HadoopFsRelation.
       case i @ InsertIntoTable(
-      l @ LogicalRelation(r: InsertableRelation), partition, child, overwrite, ifNotExists) => {
+      l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation), _, child, _, _) => {
         // First, make sure the data to be inserted have the same number of fields with the
         // schema of the relation.
         if (l.output.size != child.output.size) {
@@ -101,7 +101,20 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan =>
           }
         }
 
-      case logical.InsertIntoTable(LogicalRelation(_: HadoopFsRelation), _, _, _, _) => // OK
+      case logical.InsertIntoTable(LogicalRelation(r: HadoopFsRelation), part, _, _, _) =>
+        // We need to make sure the partition columns specified by users do match partition
+        // columns of the relation.
+        val existingPartitionColumns = r.partitionColumns.fieldNames.toSet
+        val specifiedPartitionColumns = part.keySet
+        if (existingPartitionColumns != specifiedPartitionColumns) {
+          failAnalysis(s"Specified partition columns " +
+            s"(${specifiedPartitionColumns.mkString(", ")}) " +
+            s"do not match the partition columns of the table. Please use " +
+            s"(${existingPartitionColumns.mkString(", ")}) as the partition columns.")
+        } else {
+          // OK
+        }
+
       case logical.InsertIntoTable(l: LogicalRelation, _, _, _, _) =>
         // The relation in l is not an InsertableRelation.
         failAnalysis(s"$l does not allow insertion.")
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index a8e8e70db0430..0d807f428aafc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -373,6 +373,10 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
         ResolveHiveWindowFunction ::
         sources.PreInsertCastAndRename ::
         Nil
+
+      override val extendedCheckRules = Seq(
+        sources.PreWriteCheck(catalog)
+      )
     }
 
   override protected[sql] def createSession(): SQLSession = {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
index ecb990e8aac91..acf2f7da30188 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
@@ -53,7 +53,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter {
     sql("CREATE TABLE createAndInsertTest (key int, value string)")
 
     // Add some data.
-    testData.insertInto("createAndInsertTest")
+    testData.write.mode(SaveMode.Append).insertInto("createAndInsertTest")
 
     // Make sure the table has also been updated.
     checkAnswer(
@@ -62,7 +62,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter {
     )
 
     // Add more data.
-    testData.insertInto("createAndInsertTest")
+    testData.write.mode(SaveMode.Append).insertInto("createAndInsertTest")
 
     // Make sure the table has been updated.
     checkAnswer(
@@ -71,7 +71,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter {
     )
 
     // Now overwrite.
-    testData.insertInto("createAndInsertTest", overwrite = true)
+    testData.write.mode(SaveMode.Overwrite).insertInto("createAndInsertTest")
 
     // Make sure the registered table has also been updated.
     checkAnswer(
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index c4c7b634964ed..9623ef06aa9b0 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -608,7 +608,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
       StructType(
         StructField("a", ArrayType(IntegerType, containsNull = false), nullable = true) :: Nil)
     assert(df2.schema === expectedSchema2)
-    df2.insertInto("arrayInParquet", overwrite = false)
+    df2.write.mode(SaveMode.Append).insertInto("arrayInParquet")
     createDataFrame(Tuple1(Seq(4, 5)) :: Nil).toDF("a").write.mode(SaveMode.Append)
       .saveAsTable("arrayInParquet") // This one internally calls df2.insertInto.
     createDataFrame(Tuple1(Seq(Int.box(6), null.asInstanceOf[Integer])) :: Nil).toDF("a").write
@@ -642,7 +642,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
       StructType(
         StructField("a", mapType2, nullable = true) :: Nil)
     assert(df2.schema === expectedSchema2)
-    df2.insertInto("mapInParquet", overwrite = false)
+    df2.write.mode(SaveMode.Append).insertInto("mapInParquet")
     createDataFrame(Tuple1(Map(4 -> 5)) :: Nil).toDF("a").write.mode(SaveMode.Append)
       .saveAsTable("mapInParquet") // This one internally calls df2.insertInto.
     createDataFrame(Tuple1(Map(6 -> null.asInstanceOf[Integer])) :: Nil).toDF("a").write
@@ -768,7 +768,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
       sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 35"),
       (6 to 34).map(i => Row(i, s"str$i")))
 
-    createDF(40, 49).insertInto("insertParquet")
+    createDF(40, 49).write.mode(SaveMode.Append).insertInto("insertParquet")
     checkAnswer(
       sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 45"),
       (6 to 44).map(i => Row(i, s"str$i")))
@@ -782,7 +782,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
       sql("SELECT p.c1, c2 FROM insertParquet p"),
       (50 to 59).map(i => Row(i, s"str$i")))
 
-    createDF(70, 79).insertInto("insertParquet", overwrite = true)
+    createDF(70, 79).write.mode(SaveMode.Overwrite).insertInto("insertParquet")
     checkAnswer(
       sql("SELECT p.c1, c2 FROM insertParquet p"),
       (70 to 79).map(i => Row(i, s"str$i")))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index ba53ed99beb03..b707f5e68489b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.hive.execution
 import org.apache.spark.sql.catalyst.DefaultParserDialect
 import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
 import org.apache.spark.sql.catalyst.errors.DialectException
-import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SQLConf}
+import org.apache.spark.sql._
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
@@ -425,10 +425,10 @@ class SQLQuerySuite extends QueryTest {
   test("SPARK-4825 save join to table") {
     val testData = sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)).toDF()
     sql("CREATE TABLE test1 (key INT, value STRING)")
-    testData.insertInto("test1")
+    testData.write.mode(SaveMode.Append).insertInto("test1")
     sql("CREATE TABLE test2 (key INT, value STRING)")
-    testData.insertInto("test2")
-    testData.insertInto("test2")
+    testData.write.mode(SaveMode.Append).insertInto("test2")
+    testData.write.mode(SaveMode.Append).insertInto("test2")
     sql("CREATE TABLE test AS SELECT COUNT(a.value) FROM test1 a JOIN test2 b ON a.key = b.key")
     checkAnswer(
       table("test"),
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 223ba65f47b90..7851f38fd4056 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -316,7 +316,7 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
 
     val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt")
     df.queryExecution.executedPlan match {
-      case ExecutedCommand(InsertIntoHadoopFsRelation(_: ParquetRelation2, _, _, _)) => // OK
+      case ExecutedCommand(InsertIntoHadoopFsRelation(_: ParquetRelation2, _, _)) => // OK
       case o => fail("test_insert_parquet should be converted to a " +
         s"${classOf[ParquetRelation2].getCanonicalName} and " +
         s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan. " +
@@ -346,7 +346,7 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
 
     val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt_array")
     df.queryExecution.executedPlan match {
-      case ExecutedCommand(InsertIntoHadoopFsRelation(r: ParquetRelation2, _, _, _)) => // OK
+      case ExecutedCommand(InsertIntoHadoopFsRelation(r: ParquetRelation2, _, _)) => // OK
       case o => fail("test_insert_parquet should be converted to a " +
         s"${classOf[ParquetRelation2].getCanonicalName} and " +
         s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan." +
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index c7c8bcd27fbde..32226905bca9d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -362,16 +362,6 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         .partitionBy("p1")
         .saveAsTable("t")
     }
-
-    // Using different order of partition columns
-    intercept[Throwable] {
-      partitionedTestDF2.write
-        .format(dataSourceName)
-        .mode(SaveMode.Append)
-        .option("dataSchema", dataSchema.json)
-        .partitionBy("p2", "p1")
-        .saveAsTable("t")
-    }
   }
 
   test("saveAsTable()/load() - partitioned table - ErrorIfExists") {

From b231baa24857ea83c8062dd4e033db4e35bf457d Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Sat, 23 May 2015 12:28:16 -0700
Subject: [PATCH 164/525] [HOTFIX] Copy SparkR lib if it exists in
 make-distribution

This is to fix an issue reported in #6373 where the `cp` would fail if `-Psparkr` was not used in the build

cc dragos pwendell

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6379 from shivaram/make-distribution-hotfix and squashes the following commits:

08eb7e4 [Shivaram Venkataraman] Copy SparkR lib if it exists in make-distribution
---
 make-distribution.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/make-distribution.sh b/make-distribution.sh
index 78827341b956c..a2b0c431fb4d0 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -229,10 +229,13 @@ cp "$SPARK_HOME"/conf/*.template "$DISTDIR"/conf
 cp "$SPARK_HOME/README.md" "$DISTDIR"
 cp -r "$SPARK_HOME/bin" "$DISTDIR"
 cp -r "$SPARK_HOME/python" "$DISTDIR"
-mkdir -p "$DISTDIR"/R/lib
-cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR"/R/lib
 cp -r "$SPARK_HOME/sbin" "$DISTDIR"
 cp -r "$SPARK_HOME/ec2" "$DISTDIR"
+# Copy SparkR if it exists
+if [ -d "$SPARK_HOME"/R/lib/SparkR ]; then
+  mkdir -p "$DISTDIR"/R/lib
+  cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR"/R/lib
+fi
 
 # Download and copy in tachyon, if requested
 if [ "$SPARK_TACHYON" == "true" ]; then

From 3c1a2d049cd4bf35fd48a032f5008b7bab60833e Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Sat, 23 May 2015 19:44:03 -0700
Subject: [PATCH 165/525] [SPARK-7287] [HOTFIX] Disable
 o.a.s.deploy.SparkSubmitSuite --packages

---
 .../test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 8f64ab5e42108..ea9227a7e9af5 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -335,7 +335,8 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties
     runSparkSubmit(args)
   }
 
-  test("includes jars passed in through --packages") {
+  // SPARK-7287
+  ignore("includes jars passed in through --packages") {
     val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
     val main = MavenCoordinate("my.great.lib", "mylib", "0.1")
     val dep = MavenCoordinate("my.great.dep", "mylib", "0.1")

From bfbc0df72944fe0a900ab920d8c4329a11fccca6 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Sun, 24 May 2015 09:49:57 -0700
Subject: [PATCH 166/525] [SPARK-7845] [BUILD] Bump "Hadoop 1" tests to version
 1.2.1

https://issues.apache.org/jira/browse/SPARK-7845

Author: Yin Huai <yhuai@databricks.com>

Closes #6384 from yhuai/hadoop1Test and squashes the following commits:

82fcea8 [Yin Huai] Use hadoop 1.2.1 (a stable version) for hadoop 1 test.
---
 dev/run-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/run-tests b/dev/run-tests
index 44d802782c4a4..57296d0c6028e 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -40,7 +40,7 @@ function handle_error () {
 {
   if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then
     if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then
-      export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=1.0.4"
+      export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=1.2.1"
     elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then
       export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=2.0.0-mr1-cdh4.1.1"
     elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then

From ed21476bc0c760616e7e6bb99f6541745fb09595 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Sun, 24 May 2015 09:51:37 -0700
Subject: [PATCH 167/525] [SPARK-7805] [SQL] Move SQLTestUtils.scala and
 ParquetTest.scala to src/test

https://issues.apache.org/jira/browse/SPARK-7805

Because `sql/hive`'s tests depend on the test jar of `sql/core`, we do not need to store `SQLTestUtils` and `ParquetTest` in `src/main`. We should only add stuff that will be needed by `sql/console` or Python tests (for Python, we need it in `src/main`, right? davies).

Author: Yin Huai <yhuai@databricks.com>

Closes #6334 from yhuai/SPARK-7805 and squashes the following commits:

af6d0c9 [Yin Huai] mima
b86746a [Yin Huai] Move SQLTestUtils.scala and ParquetTest.scala to src/test.
---
 project/MimaExcludes.scala                                 | 5 ++++-
 .../src/main/scala/org/apache/spark/sql/test/README.md     | 7 +++++++
 .../scala/org/apache/spark/sql/parquet/ParquetTest.scala   | 0
 .../scala/org/apache/spark/sql/test/SQLTestUtils.scala     | 0
 4 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/test/README.md
 rename sql/core/src/{main => test}/scala/org/apache/spark/sql/parquet/ParquetTest.scala (100%)
 rename sql/core/src/{main => test}/scala/org/apache/spark/sql/test/SQLTestUtils.scala (100%)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 03e93a2f98f9b..11b439e7875fc 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -133,7 +133,10 @@ object MimaExcludes {
               "org.apache.spark.sql.parquet.TestGroupWriteSupport"),
             ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.CachedData"),
             ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.CachedData$"),
-            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.CacheManager")
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.CacheManager"),
+            // TODO: Remove the following rule once ParquetTest has been moved to src/test.
+            ProblemFilters.exclude[MissingClassProblem](
+              "org.apache.spark.sql.parquet.ParquetTest")
           ) ++ Seq(
             // SPARK-7530 Added StreamingContext.getState()
             ProblemFilters.exclude[MissingMethodProblem](
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/README.md b/sql/core/src/main/scala/org/apache/spark/sql/test/README.md
new file mode 100644
index 0000000000000..d867f181b9728
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/test/README.md
@@ -0,0 +1,7 @@
+README
+======
+
+Please do not add any class in this place unless it is used by `sql/console` or Python tests.
+If you need to create any classes or traits that will be used by tests from both `sql/core` and
+`sql/hive`, you can add them in the `src/test` of `sql/core` (tests of `sql/hive`
+depend on the test jar of `sql/core`).
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala
similarity index 100%
rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala
rename to sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
similarity index 100%
rename from sql/core/src/main/scala/org/apache/spark/sql/test/SQLTestUtils.scala
rename to sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala

From 65c696ecc0a913bbe1c8b8399d811da87e4c4343 Mon Sep 17 00:00:00 2001
From: Ram Sriharsha <rsriharsha@hw11853.local>
Date: Sun, 24 May 2015 10:36:02 -0700
Subject: [PATCH 168/525] [SPARK-7833] [ML] Add python wrapper for
 RegressionEvaluator

Author: Ram Sriharsha <rsriharsha@hw11853.local>

Closes #6365 from harsha2010/SPARK-7833 and squashes the following commits:

923f288 [Ram Sriharsha] cleanup
7623b7d [Ram Sriharsha] python style fix
9743f83 [Ram Sriharsha] [SPARK-7833][ml] Add python wrapper for RegressionEvaluator
---
 .../ml/evaluation/RegressionEvaluator.scala   |  4 +-
 .../evaluation/RegressionEvaluatorSuite.scala |  1 +
 python/pyspark/ml/evaluation.py               | 68 ++++++++++++++++++-
 3 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
index ec493f8f1b504..80458928c5439 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -31,14 +31,14 @@ import org.apache.spark.sql.types.DoubleType
  * Evaluator for regression, which expects two input columns: prediction and label.
  */
 @AlphaComponent
-class RegressionEvaluator(override val uid: String)
+final class RegressionEvaluator(override val uid: String)
   extends Evaluator with HasPredictionCol with HasLabelCol {
 
   def this() = this(Identifiable.randomUID("regEval"))
 
   /**
    * param for metric name in evaluation
-   * @group param
+   * @group param supports mse, rmse, r2, mae as valid metric names.
    */
   val metricName: Param[String] = {
     val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae"))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index 983f8b460b9c0..3ea7aad5274f2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -39,6 +39,7 @@ class RegressionEvaluatorSuite extends FunSuite with MLlibTestSparkContext {
     val dataset = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
         6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
+    
     /**
      * Using the following R code to load the data, train the model and evaluate metrics.
      *
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 34e1353def467..23c37167b3711 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -19,11 +19,11 @@
 
 from pyspark.ml.wrapper import JavaWrapper
 from pyspark.ml.param import Param, Params
-from pyspark.ml.param.shared import HasLabelCol, HasRawPredictionCol
+from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasRawPredictionCol
 from pyspark.ml.util import keyword_only
 from pyspark.mllib.common import inherit_doc
 
-__all__ = ['Evaluator', 'BinaryClassificationEvaluator']
+__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator']
 
 
 @inherit_doc
@@ -148,6 +148,70 @@ def setParams(self, rawPredictionCol="rawPrediction", labelCol="label",
         return self._set(**kwargs)
 
 
+@inherit_doc
+class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol):
+    """
+    Evaluator for Regression, which expects two input
+    columns: prediction and label.
+
+    >>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5),
+    ...   (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)]
+    >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"])
+    ...
+    >>> evaluator = RegressionEvaluator(predictionCol="raw")
+    >>> evaluator.evaluate(dataset)
+    2.842...
+    >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
+    0.993...
+    >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
+    2.649...
+    """
+    # a placeholder to make it appear in the generated doc
+    metricName = Param(Params._dummy(), "metricName",
+                       "metric name in evaluation (mse|rmse|r2|mae)")
+
+    @keyword_only
+    def __init__(self, predictionCol="prediction", labelCol="label",
+                 metricName="rmse"):
+        """
+        __init__(self, predictionCol="prediction", labelCol="label", \
+                 metricName="rmse")
+        """
+        super(RegressionEvaluator, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
+        #: param for metric name in evaluation (mse|rmse|r2|mae)
+        self.metricName = Param(self, "metricName",
+                                "metric name in evaluation (mse|rmse|r2|mae)")
+        self._setDefault(predictionCol="prediction", labelCol="label",
+                         metricName="rmse")
+        kwargs = self.__init__._input_kwargs
+        self._set(**kwargs)
+
+    def setMetricName(self, value):
+        """
+        Sets the value of :py:attr:`metricName`.
+        """
+        self._paramMap[self.metricName] = value
+        return self
+
+    def getMetricName(self):
+        """
+        Gets the value of metricName or its default value.
+        """
+        return self.getOrDefault(self.metricName)
+
+    @keyword_only
+    def setParams(self, predictionCol="prediction", labelCol="label",
+                  metricName="rmse"):
+        """
+        setParams(self, predictionCol="prediction", labelCol="label",
+                  metricName="rmse")
+        Sets params for regression evaluator.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
 if __name__ == "__main__":
     import doctest
     from pyspark.context import SparkContext

From 4f4ba8fda8285f5f23143eafcd15be7f4a050122 Mon Sep 17 00:00:00 2001
From: Judy Nash <judynash@microsoft.com>
Date: Sun, 24 May 2015 21:48:27 +0100
Subject: [PATCH 169/525] =?UTF-8?q?[SPARK-7811]=20Fix=20typo=20on=20slf4j?=
 =?UTF-8?q?=20configuration=20on=20metrics.properties.tem=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix minor typo on metrics.properties.template where slf4j is incorrectly spelled as sl4j.

Author: Judy Nash <judynash@microsoft.com>

Closes #6362 from judynash/master and squashes the following commits:

c644875 [Judy Nash] SPARK-7811: Fix typo on slf4j configuration on metrics.properties.template
---
 conf/metrics.properties.template | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/metrics.properties.template b/conf/metrics.properties.template
index 2e0cb5db170ac..7de0011a48ca8 100644
--- a/conf/metrics.properties.template
+++ b/conf/metrics.properties.template
@@ -126,9 +126,9 @@
 #*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink
 
 # Polling period for Slf4JSink
-#*.sink.sl4j.period=1
+#*.sink.slf4j.period=1
 
-#*.sink.sl4j.unit=minutes
+#*.sink.slf4j.unit=minutes
 
 
 # Enable jvm source for instance master, worker, driver and executor

From 23bea97d922fbd131d9eb9531906a68459346720 Mon Sep 17 00:00:00 2001
From: tedyu <yuzhihong@gmail.com>
Date: Mon, 25 May 2015 08:19:42 +0100
Subject: [PATCH 170/525] Close HBaseAdmin at the end of HBaseTest

Author: tedyu <yuzhihong@gmail.com>

Closes #6381 from ted-yu/master and squashes the following commits:

e2f0ea1 [tedyu] Close HBaseAdmin at the end of HBaseTest
---
 .../src/main/scala/org/apache/spark/examples/HBaseTest.scala     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala b/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala
index 849887d23c9cf..95c96111c9b1f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala
@@ -59,5 +59,6 @@ object HBaseTest {
     hBaseRDD.count()
 
     sc.stop()
+    admin.close()
   }
 }

From fd31fd49763f7c60b47078c5c2d4b515c123d883 Mon Sep 17 00:00:00 2001
From: tedyu <yuzhihong@gmail.com>
Date: Mon, 25 May 2015 08:20:31 +0100
Subject: [PATCH 171/525] Add test which shows Kryo buffer size configured in
 mb is properly supported

This PR adds test which shows that Kryo buffer size configured in mb is supported properly

Author: tedyu <yuzhihong@gmail.com>

Closes #6390 from tedyu/master and squashes the following commits:

c51ea64 [tedyu] Fix KryoSerializer creation
f12ee04 [tedyu] Correct conf variable name in test
642de51 [tedyu] Drop change in KryoSerializer so that the new test runs
d2fdbc4 [tedyu] Give bufferSizeKb initial value
9a17277 [tedyu] Rewrite bufferSize checking
4739998 [tedyu] Rewrite bufferSize checking
830d0d0 [tedyu] Kryo buffer size configured in mb should be properly supported
---
 .../org/apache/spark/serializer/KryoSerializerSuite.scala     | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
index 0bd91a8dba2ab..5faf108b394a1 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
@@ -62,6 +62,10 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext {
     val thrown3 = intercept[IllegalArgumentException](new KryoSerializer(conf4).newInstance())
     assert(thrown3.getMessage.contains(kryoBufferProperty))
     assert(!thrown3.getMessage.contains(kryoBufferMaxProperty))
+    val conf5 = conf.clone()
+    conf5.set(kryoBufferProperty, "8m")
+    conf5.set(kryoBufferMaxProperty, "9m")
+    new KryoSerializer(conf5).newInstance()
   }
   
   test("basic types") {

From bfeedc69a29a1dfbfc520545e3fc95389ea1b82d Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Tue, 26 May 2015 00:16:06 +0800
Subject: [PATCH 172/525] [SPARK-7684] [SQL] Invoking
 HiveContext.newTemporaryConfiguration() shouldn't create new metastore
 directory

The "Database does not exist" error reported in SPARK-7684 was caused by `HiveContext.newTemporaryConfiguration()`, which always creates a new temporary metastore directory and returns a metastore configuration pointing that directory. This makes `TestHive.reset()` always replaces old temporary metastore with an empty new one.

Author: Cheng Lian <lian@databricks.com>

Closes #6359 from liancheng/spark-7684 and squashes the following commits:

95d2eb8 [Cheng Lian] Addresses @marmbrust's comment
042769d [Cheng Lian] Don't create new temp directory in HiveContext.newTemporaryConfiguration()
---
 .../main/scala/org/apache/spark/sql/hive/HiveContext.scala    | 2 +-
 .../main/scala/org/apache/spark/sql/hive/test/TestHive.scala  | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 0d807f428aafc..b64768ababef9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -158,7 +158,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    */
   @transient
   protected[hive] lazy val executionHive: ClientWrapper = {
-    logInfo(s"Initilizing execution hive, version $hiveExecutionVersion")
+    logInfo(s"Initializing execution hive, version $hiveExecutionVersion")
     new ClientWrapper(
       version = IsolatedClientLoader.hiveVersion(hiveExecutionVersion),
       config = newTemporaryConfiguration())
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 964828407481e..2e06cabfa80c9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -82,9 +82,11 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
 
   lazy val warehousePath = Utils.createTempDir()
 
+  private lazy val temporaryConfig = newTemporaryConfiguration()
+
   /** Sets up the system initially or after a RESET command */
   protected override def configure(): Map[String, String] =
-   newTemporaryConfiguration() ++ Map("hive.metastore.warehouse.dir" -> warehousePath.toString)
+    temporaryConfig ++ Map("hive.metastore.warehouse.dir" -> warehousePath.toString)
 
   val testTempDir = Utils.createTempDir()
 

From 8af1bf10b70b9b67f18f618174e84365d69caa48 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Tue, 26 May 2015 00:28:47 +0800
Subject: [PATCH 173/525] [SPARK-7842] [SQL] Makes task committing/aborting in
 InsertIntoHadoopFsRelation more robust

When committing/aborting a write task issued in `InsertIntoHadoopFsRelation`, if an exception is thrown from `OutputWriter.close()`, the committing/aborting process will be interrupted, and leaves messy stuff behind (e.g., the `_temporary` directory created by `FileOutputCommitter`).

This PR makes these two process more robust by catching potential exceptions and falling back to normal task committment/abort.

Author: Cheng Lian <lian@databricks.com>

Closes #6378 from liancheng/spark-7838 and squashes the following commits:

f18253a [Cheng Lian] Makes task committing/aborting in InsertIntoHadoopFsRelation more robust
---
 .../apache/spark/sql/sources/commands.scala   | 33 +++++++++++----
 .../sql/sources/SimpleTextRelation.scala      | 42 ++++++++++++++++++-
 .../sql/sources/hadoopFsRelationSuites.scala  | 22 ++++++++++
 3 files changed, 87 insertions(+), 10 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index c3674a8c76be8..fbd98ef0380e1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -377,13 +377,22 @@ private[sql] class DefaultWriterContainer(
   override def outputWriterForRow(row: Row): OutputWriter = writer
 
   override def commitTask(): Unit = {
-    writer.close()
-    super.commitTask()
+    try {
+      writer.close()
+      super.commitTask()
+    } catch {
+      case cause: Throwable =>
+        super.abortTask()
+        throw new RuntimeException("Failed to commit task", cause)
+    }
   }
 
   override def abortTask(): Unit = {
-    writer.close()
-    super.abortTask()
+    try {
+      writer.close()
+    } finally {
+      super.abortTask()
+    }
   }
 }
 
@@ -422,13 +431,21 @@ private[sql] class DynamicPartitionWriterContainer(
   }
 
   override def commitTask(): Unit = {
-    outputWriters.values.foreach(_.close())
-    super.commitTask()
+    try {
+      outputWriters.values.foreach(_.close())
+      super.commitTask()
+    } catch { case cause: Throwable =>
+      super.abortTask()
+      throw new RuntimeException("Failed to commit task", cause)
+    }
   }
 
   override def abortTask(): Unit = {
-    outputWriters.values.foreach(_.close())
-    super.abortTask()
+    try {
+      outputWriters.values.foreach(_.close())
+    } finally {
+      super.abortTask()
+    }
   }
 }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
index 2d69b89fd9a9c..de907846b9180 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
@@ -28,7 +28,7 @@ import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
-import org.apache.spark.sql.types.{DataType, StructField, StructType}
+import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.sql.{Row, SQLContext}
 
 /**
@@ -67,7 +67,9 @@ class SimpleTextOutputWriter(path: String, context: TaskAttemptContext) extends
     recordWriter.write(null, new Text(serialized))
   }
 
-  override def close(): Unit = recordWriter.close(context)
+  override def close(): Unit = {
+    recordWriter.close(context)
+  }
 }
 
 /**
@@ -120,3 +122,39 @@ class SimpleTextRelation(
     }
   }
 }
+
+/**
+ * A simple example [[HadoopFsRelationProvider]].
+ */
+class CommitFailureTestSource extends HadoopFsRelationProvider {
+  override def createRelation(
+      sqlContext: SQLContext,
+      paths: Array[String],
+      schema: Option[StructType],
+      partitionColumns: Option[StructType],
+      parameters: Map[String, String]): HadoopFsRelation = {
+    new CommitFailureTestRelation(paths, schema, partitionColumns, parameters)(sqlContext)
+  }
+}
+
+class CommitFailureTestRelation(
+    override val paths: Array[String],
+    maybeDataSchema: Option[StructType],
+    override val userDefinedPartitionColumns: Option[StructType],
+    parameters: Map[String, String])(
+    @transient sqlContext: SQLContext)
+  extends SimpleTextRelation(
+    paths, maybeDataSchema, userDefinedPartitionColumns, parameters)(sqlContext) {
+  override def prepareJobForWrite(job: Job): OutputWriterFactory = new OutputWriterFactory {
+    override def newInstance(
+        path: String,
+        dataSchema: StructType,
+        context: TaskAttemptContext): OutputWriter = {
+      new SimpleTextOutputWriter(path, context) {
+        override def close(): Unit = {
+          sys.error("Intentional task commitment failure for testing purpose.")
+        }
+      }
+    }
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index 32226905bca9d..70328e1ef810d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -18,7 +18,9 @@
 package org.apache.spark.sql.sources
 
 import org.apache.hadoop.fs.Path
+import org.scalatest.FunSuite
 
+import org.apache.spark.SparkException
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
 import org.apache.spark.sql.hive.test.TestHive
@@ -477,6 +479,26 @@ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
   }
 }
 
+class CommitFailureTestRelationSuite extends FunSuite with SQLTestUtils {
+  import TestHive.implicits._
+
+  override val sqlContext = TestHive
+
+  val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName
+
+  test("SPARK-7684: commitTask() failure should fallback to abortTask()") {
+    withTempPath { file =>
+      val df = (1 to 3).map(i => i -> s"val_$i").toDF("a", "b")
+      intercept[SparkException] {
+        df.write.format(dataSourceName).save(file.getCanonicalPath)
+      }
+
+      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
+      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
+    }
+  }
+}
+
 class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
   override val dataSourceName: String = classOf[parquet.DefaultSource].getCanonicalName
 

From ce0051d6f7e4ed54076676644c78b52b527ba190 Mon Sep 17 00:00:00 2001
From: Calvin Jia <jia.calvin@gmail.com>
Date: Mon, 25 May 2015 16:50:43 -0700
Subject: [PATCH 174/525] [SPARK-6391][DOCS] Document Tachyon compatibility.

Adds a section in the RDD persistence section of the programming-guide docs detailing Spark-Tachyon version compatibility as discussed in [[SPARK-6391]](https://issues.apache.org/jira/browse/SPARK-6391).

Author: Calvin Jia <jia.calvin@gmail.com>

Closes #6382 from calvinjia/spark-6391 and squashes the following commits:

113e863 [Calvin Jia] Move compatibility info to the offheap storage level section.
7942dc5 [Calvin Jia] Add a section in the programming-guide docs for Tachyon compatibility.
---
 docs/programming-guide.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/programming-guide.md b/docs/programming-guide.md
index 5d9df282efed8..10f474f237bfa 100644
--- a/docs/programming-guide.md
+++ b/docs/programming-guide.md
@@ -1214,9 +1214,11 @@ storage levels is:
     Compared to MEMORY_ONLY_SER, OFF_HEAP reduces garbage collection overhead and allows executors
     to be smaller and to share a pool of memory, making it attractive in environments with
     large heaps or multiple concurrent applications. Furthermore, as the RDDs reside in Tachyon,
-    the crash of an executor does not lead to losing the in-memory cache. In this mode, the memory 
+    the crash of an executor does not lead to losing the in-memory cache. In this mode, the memory
     in Tachyon is discardable. Thus, Tachyon does not attempt to reconstruct a block that it evicts
-    from memory.
+    from memory. If you plan to use Tachyon as the off heap store, Spark is compatible with Tachyon
+    out-of-the-box. Please refer to this <a href="http://tachyon-project.org/master/Running-Spark-on-Tachyon.html">page</a>
+    for the suggested version pairings.
   </td>
 </tr>
 </table>

From f38e619c41d242143c916373f2a44ec674679f19 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Mon, 25 May 2015 18:23:58 -0700
Subject: [PATCH 175/525] [SPARK-7832] [Build] Always run SQL tests in master
 build.

https://issues.apache.org/jira/browse/SPARK-7832

Author: Yin Huai <yhuai@databricks.com>

Closes #6385 from yhuai/runSQLTests and squashes the following commits:

3d399bc [Yin Huai] Always run SQL tests in master build.
---
 dev/run-tests         | 41 ++++++++++++++++++++++++-----------------
 dev/run-tests-jenkins |  2 ++
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/dev/run-tests b/dev/run-tests
index 57296d0c6028e..7dd8d31fd44e3 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -82,24 +82,31 @@ export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl"
 if [ -n "$AMPLAB_JENKINS" ]; then
   git fetch origin master:master
 
-  sql_diffs=$(
-    git diff --name-only master \
-    | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
-  )
-
-  non_sql_diffs=$(
-    git diff --name-only master \
-    | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
-  )
-
-  if [ -n "$sql_diffs" ]; then
-    echo "[info] Detected changes in SQL. Will run Hive test suite."
-    _RUN_SQL_TESTS=true
-
-    if [ -z "$non_sql_diffs" ]; then
-      echo "[info] Detected no changes except in SQL. Will only run SQL tests."
-      _SQL_TESTS_ONLY=true
+  # AMP_JENKINS_PRB indicates if the current build is a pull request build.
+  if [ -n "$AMP_JENKINS_PRB" ]; then
+    # It is a pull request build.
+    sql_diffs=$(
+      git diff --name-only master \
+      | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
+    )
+
+    non_sql_diffs=$(
+      git diff --name-only master \
+      | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
+    )
+
+    if [ -n "$sql_diffs" ]; then
+      echo "[info] Detected changes in SQL. Will run Hive test suite."
+      _RUN_SQL_TESTS=true
+
+      if [ -z "$non_sql_diffs" ]; then
+        echo "[info] Detected no changes except in SQL. Will only run SQL tests."
+        _SQL_TESTS_ONLY=true
+      fi
     fi
+  else
+    # It is a regular build. We should run SQL tests.
+    _RUN_SQL_TESTS=true
   fi
 fi
 
diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins
index f452ab66efcd8..8b2a44fd72ba5 100755
--- a/dev/run-tests-jenkins
+++ b/dev/run-tests-jenkins
@@ -185,6 +185,8 @@ done
 
 # run tests
 {
+  # Marks this build is a pull request build.
+  export AMP_JENKINS_PRB=true
   timeout "${TESTS_TIMEOUT}" ./dev/run-tests
   test_result="$?"
 

From c9adcad81a493595d075715efb2ae395d88a9ec2 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 25 May 2015 23:09:22 -0700
Subject: [PATCH 176/525] [SQL][minor] Removed unused Catalyst logical plan
 DSL.

The Catalyst DSL is no longer used as a public facing API. This pull request removes the UDF and writeToFile feature from it since they are not used in unit tests.

Author: Reynold Xin <rxin@databricks.com>

Closes #6350 from rxin/unused-logical-dsl and squashes the following commits:

90b3de6 [Reynold Xin] [SQL][minor] Removed unused Catalyst logical plan DSL.
---
 .../spark/sql/catalyst/dsl/package.scala      | 129 ++++--------------
 1 file changed, 27 insertions(+), 102 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
index 307a9ca9b0070..60ab9fba4885a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
@@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst
 import java.sql.{Date, Timestamp}
 
 import scala.language.implicitConversions
-import scala.reflect.runtime.universe.{TypeTag, typeTag}
 
 import org.apache.spark.sql.catalyst.analysis.{EliminateSubQueries, UnresolvedExtractValue, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.expressions._
@@ -61,7 +60,7 @@ package object dsl {
   trait ImplicitOperators {
     def expr: Expression
 
-    def unary_- : Expression= UnaryMinus(expr)
+    def unary_- : Expression = UnaryMinus(expr)
     def unary_! : Predicate = Not(expr)
     def unary_~ : Expression = BitwiseNot(expr)
 
@@ -234,133 +233,59 @@ package object dsl {
     implicit class DslAttribute(a: AttributeReference) {
       def notNull: AttributeReference = a.withNullability(false)
       def nullable: AttributeReference = a.withNullability(true)
-
-      // Protobuf terminology
-      def required: AttributeReference = a.withNullability(false)
-
       def at(ordinal: Int): BoundReference = BoundReference(ordinal, a.dataType, a.nullable)
     }
   }
 
-
   object expressions extends ExpressionConversions  // scalastyle:ignore
 
-  abstract class LogicalPlanFunctions {
-    def logicalPlan: LogicalPlan
-
-    def select(exprs: NamedExpression*): LogicalPlan = Project(exprs, logicalPlan)
+  object plans {  // scalastyle:ignore
+    implicit class DslLogicalPlan(val logicalPlan: LogicalPlan) {
+      def select(exprs: NamedExpression*): LogicalPlan = Project(exprs, logicalPlan)
 
-    def where(condition: Expression): LogicalPlan = Filter(condition, logicalPlan)
+      def where(condition: Expression): LogicalPlan = Filter(condition, logicalPlan)
 
-    def limit(limitExpr: Expression): LogicalPlan = Limit(limitExpr, logicalPlan)
+      def limit(limitExpr: Expression): LogicalPlan = Limit(limitExpr, logicalPlan)
 
-    def join(
+      def join(
         otherPlan: LogicalPlan,
         joinType: JoinType = Inner,
         condition: Option[Expression] = None): LogicalPlan =
-      Join(logicalPlan, otherPlan, joinType, condition)
+        Join(logicalPlan, otherPlan, joinType, condition)
 
-    def orderBy(sortExprs: SortOrder*): LogicalPlan = Sort(sortExprs, true, logicalPlan)
+      def orderBy(sortExprs: SortOrder*): LogicalPlan = Sort(sortExprs, true, logicalPlan)
 
-    def sortBy(sortExprs: SortOrder*): LogicalPlan = Sort(sortExprs, false, logicalPlan)
+      def sortBy(sortExprs: SortOrder*): LogicalPlan = Sort(sortExprs, false, logicalPlan)
 
-    def groupBy(groupingExprs: Expression*)(aggregateExprs: Expression*): LogicalPlan = {
-      val aliasedExprs = aggregateExprs.map {
-        case ne: NamedExpression => ne
-        case e => Alias(e, e.toString)()
+      def groupBy(groupingExprs: Expression*)(aggregateExprs: Expression*): LogicalPlan = {
+        val aliasedExprs = aggregateExprs.map {
+          case ne: NamedExpression => ne
+          case e => Alias(e, e.toString)()
+        }
+        Aggregate(groupingExprs, aliasedExprs, logicalPlan)
       }
-      Aggregate(groupingExprs, aliasedExprs, logicalPlan)
-    }
-
-    def subquery(alias: Symbol): LogicalPlan = Subquery(alias.name, logicalPlan)
 
-    def unionAll(otherPlan: LogicalPlan): LogicalPlan = Union(logicalPlan, otherPlan)
+      def subquery(alias: Symbol): LogicalPlan = Subquery(alias.name, logicalPlan)
 
-    def except(otherPlan: LogicalPlan): LogicalPlan = Except(logicalPlan, otherPlan)
+      def except(otherPlan: LogicalPlan): LogicalPlan = Except(logicalPlan, otherPlan)
 
-    def intersect(otherPlan: LogicalPlan): LogicalPlan = Intersect(logicalPlan, otherPlan)
+      def intersect(otherPlan: LogicalPlan): LogicalPlan = Intersect(logicalPlan, otherPlan)
 
-    def sfilter[T1](arg1: Symbol)(udf: (T1) => Boolean): LogicalPlan =
-      Filter(ScalaUdf(udf, BooleanType, Seq(UnresolvedAttribute(arg1.name))), logicalPlan)
+      def unionAll(otherPlan: LogicalPlan): LogicalPlan = Union(logicalPlan, otherPlan)
 
-    // TODO specify the output column names
-    def generate(
+      // TODO specify the output column names
+      def generate(
         generator: Generator,
         join: Boolean = false,
         outer: Boolean = false,
         alias: Option[String] = None): LogicalPlan =
-      Generate(generator, join = join, outer = outer, alias, Nil, logicalPlan)
+        Generate(generator, join = join, outer = outer, alias, Nil, logicalPlan)
 
-    def insertInto(tableName: String, overwrite: Boolean = false): LogicalPlan =
-      InsertIntoTable(
-        analysis.UnresolvedRelation(Seq(tableName)), Map.empty, logicalPlan, overwrite, false)
+      def insertInto(tableName: String, overwrite: Boolean = false): LogicalPlan =
+        InsertIntoTable(
+          analysis.UnresolvedRelation(Seq(tableName)), Map.empty, logicalPlan, overwrite, false)
 
-    def analyze: LogicalPlan = EliminateSubQueries(analysis.SimpleAnalyzer.execute(logicalPlan))
-  }
-
-  object plans {  // scalastyle:ignore
-    implicit class DslLogicalPlan(val logicalPlan: LogicalPlan) extends LogicalPlanFunctions {
-      def writeToFile(path: String): LogicalPlan = WriteToFile(path, logicalPlan)
+      def analyze: LogicalPlan = EliminateSubQueries(analysis.SimpleAnalyzer.execute(logicalPlan))
     }
   }
-
-  case class ScalaUdfBuilder[T: TypeTag](f: AnyRef) {
-    def call(args: Expression*): ScalaUdf = {
-      ScalaUdf(f, ScalaReflection.schemaFor(typeTag[T]).dataType, args)
-    }
-  }
-
-  // scalastyle:off
-  /** functionToUdfBuilder 1-22 were generated by this script
-
-    (1 to 22).map { x =>
-      val argTypes = Seq.fill(x)("_").mkString(", ")
-      s"implicit def functionToUdfBuilder[T: TypeTag](func: Function$x[$argTypes, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)"
-    }
-  */
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function1[_, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function2[_, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function3[_, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function4[_, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function5[_, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function6[_, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function7[_, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function8[_, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function9[_, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function10[_, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function11[_, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function12[_, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function13[_, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-
-  implicit def functionToUdfBuilder[T: TypeTag](func: Function22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)
-  // scalastyle:on
 }

From 43aa819c041f6e8301ad1b8f82eb68e14254f636 Mon Sep 17 00:00:00 2001
From: Konstantin Shaposhnikov <Konstantin.Shaposhnikov@sc.com>
Date: Tue, 26 May 2015 07:49:32 +0100
Subject: [PATCH 177/525] [SPARK-7042] [BUILD] use the standard akka artifacts
 with hadoop-2.x

Both akka 2.3.x and hadoop-2.x use protobuf 2.5 so only hadoop-1 build needs
custom 2.3.4-spark akka version that shades protobuf-2.5

This partially fixes SPARK-7042 (for hadoop-2.x builds)

Author: Konstantin Shaposhnikov <Konstantin.Shaposhnikov@sc.com>

Closes #6341 from kostya-sh/SPARK-7042 and squashes the following commits:

7eb8c60 [Konstantin Shaposhnikov] [SPARK-7042][BUILD] use the standard akka artifacts with hadoop-2.x
---
 pom.xml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pom.xml b/pom.xml
index c72d7cbf843ef..8e936ab5ab9de 100644
--- a/pom.xml
+++ b/pom.xml
@@ -114,8 +114,8 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-    <akka.group>org.spark-project.akka</akka.group>
-    <akka.version>2.3.4-spark</akka.version>
+    <akka.group>com.typesafe.akka</akka.group>
+    <akka.version>2.3.4</akka.version>
     <java.version>1.6</java.version>
     <sbt.project.name>spark</sbt.project.name>
     <scala.macros.version>2.0.1</scala.macros.version>
@@ -1664,6 +1664,8 @@
         <hbase.version>0.98.7-hadoop1</hbase.version>
         <avro.mapred.classifier>hadoop1</avro.mapred.classifier>
         <codehaus.jackson.version>1.8.8</codehaus.jackson.version>
+        <akka.group>org.spark-project.akka</akka.group>
+        <akka.version>2.3.4-spark</akka.version>
       </properties>
     </profile>
 

From bf49c22130af9a729dcc510743e4c1ea4c5d2439 Mon Sep 17 00:00:00 2001
From: scwf <wangfei1@huawei.com>
Date: Tue, 26 May 2015 08:42:52 -0500
Subject: [PATCH 178/525] [CORE] [TEST] Fix SimpleDateParamTest

```
sbt.ForkMain$ForkError: 1424424077190 was not equal to 1424474477190
	at org.scalatest.MatchersHelper$.newTestFailedException(MatchersHelper.scala:160)
	at org.scalatest.Matchers$ShouldMethodHelper$.shouldMatcher(Matchers.scala:6231)
	at org.scalatest.Matchers$AnyShouldWrapper.should(Matchers.scala:6265)
	at org.apache.spark.status.api.v1.SimpleDateParamTest$$anonfun$1.apply$mcV$sp(SimpleDateParamTest.scala:25)
	at org.apache.spark.status.api.v1.SimpleDateParamTest$$anonfun$1.apply(SimpleDateParamTest.scala:23)
	at org.apache.spark.status.api.v1.SimpleDateParamTest$$anonfun$1.apply(SimpleDateParamTest.scala:23)
	at org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22)
	at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85)
	at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
	at org.scalatest.Transformer.apply(Transformer.scala:22)
	at org.scalatest.Transformer.apply(Transformer.scala:20)
	at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:166)
	at org.scalatest.Suite$class.withFixture(Suite.scala:
```

Set timezone to fix SimpleDateParamTest

Author: scwf <wangfei1@huawei.com>
Author: Fei Wang <wangfei1@huawei.com>

Closes #6377 from scwf/fix-SimpleDateParamTest and squashes the following commits:

b8df1e5 [Fei Wang] Update SimpleDateParamSuite.scala
8bb74f0 [scwf] fix SimpleDateParamSuite
---
 ...SimpleDateParamTest.scala => SimpleDateParamSuite.scala} | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename core/src/test/scala/org/apache/spark/status/api/v1/{SimpleDateParamTest.scala => SimpleDateParamSuite.scala} (86%)

diff --git a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamTest.scala b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
similarity index 86%
rename from core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamTest.scala
rename to core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
index 5274df904d395..731d1f557ed33 100644
--- a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamTest.scala
+++ b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
@@ -18,12 +18,12 @@ package org.apache.spark.status.api.v1
 
 import org.scalatest.{Matchers, FunSuite}
 
-class SimpleDateParamTest extends FunSuite with Matchers {
+class SimpleDateParamSuite extends FunSuite with Matchers {
 
   test("date parsing") {
     new SimpleDateParam("2015-02-20T23:21:17.190GMT").timestamp should be (1424474477190L)
-    new SimpleDateParam("2015-02-20T17:21:17.190CST").timestamp should be (1424474477190L)
-    new SimpleDateParam("2015-02-20").timestamp should be (1424390400000L)  // GMT
+    new SimpleDateParam("2015-02-20T17:21:17.190EST").timestamp should be (1424470877190L)
+    new SimpleDateParam("2015-02-20").timestamp should be (1424390400000L) // GMT
   }
 
 }

From 8948ad3fb5d5d095d3942855960d735f27d97dd5 Mon Sep 17 00:00:00 2001
From: linweizhong <linweizhong@huawei.com>
Date: Tue, 26 May 2015 08:35:39 -0700
Subject: [PATCH 179/525] [SPARK-7339] [PYSPARK] PySpark shuffle spill memory
 sometimes are not correct

In PySpark we get memory used before and after spill, then use the difference of these two value as memorySpilled, but if the before value is small than after value, then we will get a negative value, but this scenario 0 value may be more reasonable.

Below is the result in HistoryServer we have tested:
Index	ID	Attempt	Status	Locality Level	Executor ID / Host	Launch Time	Duration	GC Time	Input Size / Records	Write Time	Shuffle Write Size / Records	Shuffle Spill (Memory)	Shuffle Spill (Disk)	Errors
0	0	0	SUCCESS	NODE_LOCAL	3 / vm119	2015/05/04 17:31:06	21 s	0.1 s	128.1 MB (hadoop) / 3237	70 ms	10.1 MB / 2529	0.0 B	5.7 MB
2	2	0	SUCCESS	NODE_LOCAL	1 / vm118	2015/05/04 17:31:06	22 s	89 ms	128.1 MB (hadoop) / 3205	0.1 s	10.1 MB / 2529	-1048576.0 B	5.9 MB
1	1	0	SUCCESS	NODE_LOCAL	2 / vm117	2015/05/04 17:31:06	22 s	0.1 s	128.1 MB (hadoop) / 3271	68 ms	10.1 MB / 2529	-1048576.0 B	5.6 MB
4	4	0	SUCCESS	NODE_LOCAL	2 / vm117	2015/05/04 17:31:06	22 s	0.1 s	128.1 MB (hadoop) / 3192	51 ms	10.1 MB / 2529	-1048576.0 B	5.9 MB
3	3	0	SUCCESS	NODE_LOCAL	3 / vm119	2015/05/04 17:31:06	22 s	0.1 s	128.1 MB (hadoop) / 3262	51 ms	10.1 MB / 2529	1024.0 KB	5.8 MB
5	5	0	SUCCESS	NODE_LOCAL	1 / vm118	2015/05/04 17:31:06	22 s	89 ms	128.1 MB (hadoop) / 3256	93 ms	10.1 MB / 2529	-1048576.0 B	5.7 MB

/cc davies

Author: linweizhong <linweizhong@huawei.com>

Closes #5887 from Sephiroth-Lin/spark-7339 and squashes the following commits:

9186c81 [linweizhong] Use max function to get a nonnegative value
d41672b [linweizhong] Update MemoryBytesSpilled when memorySpilled > 0
---
 python/pyspark/shuffle.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py
index 1d0b16cade8bb..81c420ce16541 100644
--- a/python/pyspark/shuffle.py
+++ b/python/pyspark/shuffle.py
@@ -362,7 +362,7 @@ def _spill(self):
 
         self.spills += 1
         gc.collect()  # release the memory as much as possible
-        MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
+        MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20
 
     def items(self):
         """ Return all merged items as iterator """
@@ -515,7 +515,7 @@ def load(f):
                 gc.collect()
                 batch //= 2
                 limit = self._next_limit()
-                MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
+                MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20
                 DiskBytesSpilled += os.path.getsize(path)
                 os.unlink(path)  # data will be deleted after close
 
@@ -630,7 +630,7 @@ def _spill(self):
         self.values = []
         gc.collect()
         DiskBytesSpilled += self._file.tell() - pos
-        MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
+        MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20
 
 
 class ExternalListOfList(ExternalList):
@@ -794,7 +794,7 @@ def _spill(self):
 
         self.spills += 1
         gc.collect()  # release the memory as much as possible
-        MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
+        MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20
 
     def _merged_items(self, index):
         size = sum(os.path.getsize(os.path.join(self._get_spill_dir(j), str(index)))

From 8dbe777703e0aaf47cbdfe98f66d22f723352fb5 Mon Sep 17 00:00:00 2001
From: meawoppl <meawoppl@gmail.com>
Date: Tue, 26 May 2015 09:02:25 -0700
Subject: [PATCH 180/525] [SPARK-7806][EC2] Fixes that allow the spark_ec2.py
 tool to run with Python3

I have used this script to launch, destroy, start, and stop clusters successfully.

Author: meawoppl <meawoppl@gmail.com>

Closes #6336 from meawoppl/py3ec2spark and squashes the following commits:

2e87046 [meawoppl] Py3 compat fixes.
---
 ec2/spark_ec2.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index c6d5a1f0d0a81..724811eaa1bdd 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -19,8 +19,9 @@
 # limitations under the License.
 #
 
-from __future__ import with_statement, print_function
+from __future__ import division, print_function, with_statement
 
+import codecs
 import hashlib
 import itertools
 import logging
@@ -47,6 +48,8 @@
 else:
     from urllib.request import urlopen, Request
     from urllib.error import HTTPError
+    raw_input = input
+    xrange = range
 
 SPARK_EC2_VERSION = "1.3.1"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
@@ -423,13 +426,14 @@ def get_spark_ami(opts):
         b=opts.spark_ec2_git_branch)
 
     ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type)
+    reader = codecs.getreader("ascii")
     try:
-        ami = urlopen(ami_path).read().strip()
-        print("Spark AMI: " + ami)
+        ami = reader(urlopen(ami_path)).read().strip()
     except:
         print("Could not resolve AMI at: " + ami_path, file=stderr)
         sys.exit(1)
 
+    print("Spark AMI: " + ami)
     return ami
 
 
@@ -750,7 +754,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
                'mapreduce', 'spark-standalone', 'tachyon']
 
     if opts.hadoop_major_version == "1":
-        modules = filter(lambda x: x != "mapreduce", modules)
+        modules = list(filter(lambda x: x != "mapreduce", modules))
 
     if opts.ganglia:
         modules.append('ganglia')
@@ -1160,7 +1164,7 @@ def get_zones(conn, opts):
 
 # Gets the number of items in a partition
 def get_partition(total, num_partitions, current_partitions):
-    num_slaves_this_zone = total / num_partitions
+    num_slaves_this_zone = total // num_partitions
     if (total % num_partitions) - current_partitions > 0:
         num_slaves_this_zone += 1
     return num_slaves_this_zone

From e5a63a0e39e35ba5f6b1732e09128c30fff17a2c Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Tue, 26 May 2015 17:05:58 +0100
Subject: [PATCH 181/525] [DOCS] [MLLIB] Fixing misformatted links in v1.4
 MLlib Naive Bayes documentation by removing space and newline characters.

A couple of links in the MLlib Naive Bayes documentation for v1.4 were broken due to the addition of either space or newline characters between the link title and link URL in the markdown doc.  (Interestingly enough, they are rendered correctly in the GitHub viewer, but not when compiled to HTML by Jekyll.)

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6412 from dusenberrymw/Fix_Broken_Links_In_MLlib_Naive_Bayes_Docs and squashes the following commits:

91a4028 [Mike Dusenberry] Fixing misformatted links by removing space and newline characters.
---
 docs/mllib-naive-bayes.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index 56a2e9ca86bb1..acdcc371487f8 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -14,9 +14,8 @@ and use it for prediction.
 
 MLlib supports [multinomial naive
 Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes)
-and [Bernoulli naive Bayes] (http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html).
-These models are typically used for [document classification]
-(http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html).
+and [Bernoulli naive Bayes](http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html).
+These models are typically used for [document classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html).
 Within that context, each observation is a document and each
 feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or
 a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes).

From 63099122deb93553f5a03bb3ff74e7c2f1ee2164 Mon Sep 17 00:00:00 2001
From: "Zhang, Liye" <liye.zhang@intel.com>
Date: Tue, 26 May 2015 17:08:16 +0100
Subject: [PATCH 182/525] [SPARK-7854] [TEST] refine Kryo test suite

this modification is according to JoshRosen 's comments, for details, please refer to [#5934](https://github.com/apache/spark/pull/5934/files#r30949751).

Author: Zhang, Liye <liye.zhang@intel.com>

Closes #6395 from liyezhang556520/kryoTest and squashes the following commits:

da214c8 [Zhang, Liye] refine Kryo test suite accroding to Josh's comments
---
 .../serializer/KryoSerializerSuite.scala      | 51 ++++++++++---------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
index 5faf108b394a1..8c384bd358ebc 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
@@ -34,38 +34,41 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext {
   conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
   conf.set("spark.kryo.registrator", classOf[MyRegistrator].getName)
 
-  test("configuration limits") {
-    val conf1 = conf.clone()
+  test("SPARK-7392 configuration limits") {
     val kryoBufferProperty = "spark.kryoserializer.buffer"
     val kryoBufferMaxProperty = "spark.kryoserializer.buffer.max"
-    conf1.set(kryoBufferProperty, "64k")
-    conf1.set(kryoBufferMaxProperty, "64m")
-    new KryoSerializer(conf1).newInstance()
+    
+    def newKryoInstance(
+        conf: SparkConf,
+        bufferSize: String = "64k",
+        maxBufferSize: String = "64m"): SerializerInstance = {
+      val kryoConf = conf.clone()
+      kryoConf.set(kryoBufferProperty, bufferSize)
+      kryoConf.set(kryoBufferMaxProperty, maxBufferSize)
+      new KryoSerializer(kryoConf).newInstance()
+    }
+    
+    // test default values
+    newKryoInstance(conf, "64k", "64m")
     // 2048m = 2097152k
-    conf1.set(kryoBufferProperty, "2097151k")
-    conf1.set(kryoBufferMaxProperty, "64m")
     // should not throw exception when kryoBufferMaxProperty < kryoBufferProperty
-    new KryoSerializer(conf1).newInstance()
-    conf1.set(kryoBufferMaxProperty, "2097151k")
-    new KryoSerializer(conf1).newInstance()
-    val conf2 = conf.clone()
-    conf2.set(kryoBufferProperty, "2048m")
-    val thrown1 = intercept[IllegalArgumentException](new KryoSerializer(conf2).newInstance())
+    newKryoInstance(conf, "2097151k", "64m")
+    // test maximum size with unit of KiB
+    newKryoInstance(conf, "2097151k", "2097151k")
+    // should throw exception with bufferSize out of bound
+    val thrown1 = intercept[IllegalArgumentException](newKryoInstance(conf, "2048m"))
     assert(thrown1.getMessage.contains(kryoBufferProperty))
-    val conf3 = conf.clone()
-    conf3.set(kryoBufferMaxProperty, "2048m")
-    val thrown2 = intercept[IllegalArgumentException](new KryoSerializer(conf3).newInstance())
+    // should throw exception with maxBufferSize out of bound
+    val thrown2 = intercept[IllegalArgumentException](
+        newKryoInstance(conf, maxBufferSize = "2048m"))
     assert(thrown2.getMessage.contains(kryoBufferMaxProperty))
-    val conf4 = conf.clone()
-    conf4.set(kryoBufferProperty, "2g")
-    conf4.set(kryoBufferMaxProperty, "3g")
-    val thrown3 = intercept[IllegalArgumentException](new KryoSerializer(conf4).newInstance())
+    // should throw exception when both bufferSize and maxBufferSize out of bound
+    // exception should only contain "spark.kryoserializer.buffer"
+    val thrown3 = intercept[IllegalArgumentException](newKryoInstance(conf, "2g", "3g"))
     assert(thrown3.getMessage.contains(kryoBufferProperty))
     assert(!thrown3.getMessage.contains(kryoBufferMaxProperty))
-    val conf5 = conf.clone()
-    conf5.set(kryoBufferProperty, "8m")
-    conf5.set(kryoBufferMaxProperty, "9m")
-    new KryoSerializer(conf5).newInstance()
+    // test configuration with mb is supported properly
+    newKryoInstance(conf, "8m", "9m")
   }
   
   test("basic types") {

From b7d8085942c564d6c5b81a14d31789e1b215e62b Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Tue, 26 May 2015 10:05:13 -0700
Subject: [PATCH 183/525] Revert "[SPARK-7042] [BUILD] use the standard akka
 artifacts with hadoop-2.x"

This reverts commit 43aa819c041f6e8301ad1b8f82eb68e14254f636.
---
 pom.xml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pom.xml b/pom.xml
index 8e936ab5ab9de..c72d7cbf843ef 100644
--- a/pom.xml
+++ b/pom.xml
@@ -114,8 +114,8 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-    <akka.group>com.typesafe.akka</akka.group>
-    <akka.version>2.3.4</akka.version>
+    <akka.group>org.spark-project.akka</akka.group>
+    <akka.version>2.3.4-spark</akka.version>
     <java.version>1.6</java.version>
     <sbt.project.name>spark</sbt.project.name>
     <scala.macros.version>2.0.1</scala.macros.version>
@@ -1664,8 +1664,6 @@
         <hbase.version>0.98.7-hadoop1</hbase.version>
         <avro.mapred.classifier>hadoop1</avro.mapred.classifier>
         <codehaus.jackson.version>1.8.8</codehaus.jackson.version>
-        <akka.group>org.spark-project.akka</akka.group>
-        <akka.version>2.3.4-spark</akka.version>
       </properties>
     </profile>
 

From 61664732b25b35f94be35a42cde651cbfd0e02b7 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 26 May 2015 13:21:00 -0700
Subject: [PATCH 184/525] [SPARK-7844] [MLLIB] Fix broken tests in
 KernelDensity

The densities in KernelDensity are scaled down by
(number of parallel processes X number of points). It should be just no.of samples. This results in broken tests in KernelDensitySuite which haven't been tested properly.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6383 from MechCoder/spark-7844 and squashes the following commits:

ab81302 [MechCoder] Math->math
9b8ed50 [MechCoder] Make one pass to update count
a92fe50 [MechCoder] [SPARK-7844] Fix broken tests in KernelDensity
---
 .../org/apache/spark/mllib/stat/KernelDensity.scala    |  2 +-
 .../apache/spark/mllib/stat/KernelDensitySuite.scala   | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
index a6bfe26e1e4f5..58a50f9c19f14 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
@@ -93,7 +93,7 @@ class KernelDensity extends Serializable {
           x._1(i) += normPdf(y, bandwidth, logStandardDeviationPlusHalfLog2Pi, points(i))
           i += 1
         }
-        (x._1, n)
+        (x._1, x._2 + 1)
       },
       (x, y) => {
         blas.daxpy(n, 1.0, y._1, 1, x._1, 1)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
index 14bb1cebf0b8f..a309c942cf8ff 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
@@ -29,8 +29,8 @@ class KernelDensitySuite extends FunSuite with MLlibTestSparkContext {
     val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
     val normal = new NormalDistribution(5.0, 3.0)
     val acceptableErr = 1e-6
-    assert(densities(0) - normal.density(5.0) < acceptableErr)
-    assert(densities(0) - normal.density(6.0) < acceptableErr)
+    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
+    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
   }
 
   test("kernel density multiple samples") {
@@ -40,7 +40,9 @@ class KernelDensitySuite extends FunSuite with MLlibTestSparkContext {
     val normal1 = new NormalDistribution(5.0, 3.0)
     val normal2 = new NormalDistribution(10.0, 3.0)
     val acceptableErr = 1e-6
-    assert(densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2 < acceptableErr)
-    assert(densities(0) - (normal1.density(6.0) + normal2.density(6.0)) / 2 < acceptableErr)
+    assert(math.abs(
+      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
+    assert(math.abs(
+      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
   }
 }

From 2e9a5f229e1a2ccffa74fa59fa6a55b2704d9c1a Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 26 May 2015 15:01:27 -0700
Subject: [PATCH 185/525] [SPARK-3674] YARN support in Spark EC2

This corresponds to https://github.com/mesos/spark-ec2/pull/116 in the spark-ec2 repo. The only changes required on the spark_ec2.py script is to open the RM port.

cc andrewor14

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6376 from shivaram/spark-ec2-yarn and squashes the following commits:

961504a [Shivaram Venkataraman] Merge branch 'master' of https://github.com/apache/spark into spark-ec2-yarn
152c94c [Shivaram Venkataraman] Open 8088 for YARN in EC2
---
 ec2/spark_ec2.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 724811eaa1bdd..ee0904c9e5d54 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -491,6 +491,8 @@ def launch_cluster(conn, opts, cluster_name):
         master_group.authorize('udp', 2049, 2049, authorized_address)
         master_group.authorize('tcp', 4242, 4242, authorized_address)
         master_group.authorize('udp', 4242, 4242, authorized_address)
+        # RM in YARN mode uses 8088
+        master_group.authorize('tcp', 8088, 8088, authorized_address)
         if opts.ganglia:
             master_group.authorize('tcp', 5080, 5080, authorized_address)
     if slave_group.rules == []:  # Group was just now created

From 9f742241cbf07e5e2dadfee8dcc9b382bb2dbea1 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Tue, 26 May 2015 15:28:49 -0700
Subject: [PATCH 186/525] [SPARK-6602] [CORE] Remove some places in core that
 calling SparkEnv.actorSystem

Author: zsxwing <zsxwing@gmail.com>

Closes #6333 from zsxwing/remove-actor-system-usage and squashes the following commits:

f125aa6 [zsxwing] Fix YarnAllocatorSuite
ceadcf6 [zsxwing] Change the "port" parameter type of "AkkaUtils.address" to "int"; update ApplicationMaster and YarnAllocator to get the driverUrl from RpcEnv
3239380 [zsxwing] Remove some places in core that calling SparkEnv.actorSystem
---
 .../spark/scheduler/TaskSchedulerImpl.scala    | 17 +++++++++++------
 .../mesos/CoarseMesosSchedulerBackend.scala    |  9 ++++-----
 .../org/apache/spark/util/AkkaUtils.scala      |  2 +-
 .../spark/deploy/yarn/ApplicationMaster.scala  | 18 ++++++++++++------
 .../spark/deploy/yarn/YarnAllocator.scala      | 12 ++----------
 .../spark/deploy/yarn/YarnRMClient.scala       |  3 ++-
 .../spark/deploy/yarn/YarnAllocatorSuite.scala |  1 +
 7 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
index b4b8a630694bb..ed3dde0fc3055 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
@@ -19,9 +19,9 @@ package org.apache.spark.scheduler
 
 import java.nio.ByteBuffer
 import java.util.{TimerTask, Timer}
+import java.util.concurrent.TimeUnit
 import java.util.concurrent.atomic.AtomicLong
 
-import scala.concurrent.duration._
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable.HashMap
 import scala.collection.mutable.HashSet
@@ -32,7 +32,7 @@ import org.apache.spark._
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
 import org.apache.spark.scheduler.TaskLocality.TaskLocality
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{ThreadUtils, Utils}
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.storage.BlockManagerId
 
@@ -64,6 +64,9 @@ private[spark] class TaskSchedulerImpl(
   // How often to check for speculative tasks
   val SPECULATION_INTERVAL_MS = conf.getTimeAsMs("spark.speculation.interval", "100ms")
 
+  private val speculationScheduler =
+    ThreadUtils.newDaemonSingleThreadScheduledExecutor("task-scheduler-speculation")
+
   // Threshold above which we warn user initial TaskSet may be starved
   val STARVATION_TIMEOUT_MS = conf.getTimeAsMs("spark.starvation.timeout", "15s")
 
@@ -142,10 +145,11 @@ private[spark] class TaskSchedulerImpl(
 
     if (!isLocal && conf.getBoolean("spark.speculation", false)) {
       logInfo("Starting speculative execution thread")
-      sc.env.actorSystem.scheduler.schedule(SPECULATION_INTERVAL_MS milliseconds,
-            SPECULATION_INTERVAL_MS milliseconds) {
-        Utils.tryOrStopSparkContext(sc) { checkSpeculatableTasks() }
-      }(sc.env.actorSystem.dispatcher)
+      speculationScheduler.scheduleAtFixedRate(new Runnable {
+        override def run(): Unit = Utils.tryOrStopSparkContext(sc) {
+          checkSpeculatableTasks()
+        }
+      }, SPECULATION_INTERVAL_MS, SPECULATION_INTERVAL_MS, TimeUnit.MILLISECONDS)
     }
   }
 
@@ -412,6 +416,7 @@ private[spark] class TaskSchedulerImpl(
   }
 
   override def stop() {
+    speculationScheduler.shutdown()
     if (backend != null) {
       backend.stop()
     }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
index dc59545b43314..aff086594c73f 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
@@ -25,9 +25,10 @@ import scala.collection.mutable.{HashMap, HashSet}
 
 import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
 import org.apache.mesos.{Scheduler => MScheduler, _}
+import org.apache.spark.rpc.RpcAddress
 import org.apache.spark.scheduler.TaskSchedulerImpl
 import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
-import org.apache.spark.util.{AkkaUtils, Utils}
+import org.apache.spark.util.Utils
 import org.apache.spark.{SparkContext, SparkEnv, SparkException, TaskState}
 
 /**
@@ -115,11 +116,9 @@ private[spark] class CoarseMesosSchedulerBackend(
     }
     val command = CommandInfo.newBuilder()
       .setEnvironment(environment)
-    val driverUrl = AkkaUtils.address(
-      AkkaUtils.protocol(sc.env.actorSystem),
+    val driverUrl = sc.env.rpcEnv.uriOf(
       SparkEnv.driverActorSystemName,
-      conf.get("spark.driver.host"),
-      conf.get("spark.driver.port"),
+      RpcAddress(conf.get("spark.driver.host"), conf.get("spark.driver.port").toInt),
       CoarseGrainedSchedulerBackend.ENDPOINT_NAME)
 
     val uri = conf.getOption("spark.executor.uri")
diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
index de3316d083a22..7513b1b795dea 100644
--- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
@@ -235,7 +235,7 @@ private[spark] object AkkaUtils extends Logging {
       protocol: String,
       systemName: String,
       host: String,
-      port: Any,
+      port: Int,
       actorName: String): String = {
     s"$protocol://$systemName@$host:$port/user/$actorName"
   }
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index af4927b0e4bf7..760e458972d98 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -34,7 +34,7 @@ import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext, Spar
 import org.apache.spark.SparkException
 import org.apache.spark.deploy.{PythonRunner, SparkHadoopUtil}
 import org.apache.spark.deploy.history.HistoryServer
-import org.apache.spark.scheduler.cluster.YarnSchedulerBackend
+import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, YarnSchedulerBackend}
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
 import org.apache.spark.util._
 
@@ -220,7 +220,7 @@ private[spark] class ApplicationMaster(
     sparkContextRef.compareAndSet(sc, null)
   }
 
-  private def registerAM(uiAddress: String, securityMgr: SecurityManager) = {
+  private def registerAM(_rpcEnv: RpcEnv, uiAddress: String, securityMgr: SecurityManager) = {
     val sc = sparkContextRef.get()
 
     val appId = client.getAttemptId().getApplicationId().toString()
@@ -231,8 +231,14 @@ private[spark] class ApplicationMaster(
         .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" }
         .getOrElse("")
 
-    allocator = client.register(yarnConf,
-      if (sc != null) sc.getConf else sparkConf,
+    val _sparkConf = if (sc != null) sc.getConf else sparkConf
+    val driverUrl = _rpcEnv.uriOf(
+        SparkEnv.driverActorSystemName,
+        RpcAddress(_sparkConf.get("spark.driver.host"), _sparkConf.get("spark.driver.port").toInt),
+        CoarseGrainedSchedulerBackend.ENDPOINT_NAME)
+    allocator = client.register(driverUrl,
+      yarnConf,
+      _sparkConf,
       if (sc != null) sc.preferredNodeLocationData else Map(),
       uiAddress,
       historyAddress,
@@ -279,7 +285,7 @@ private[spark] class ApplicationMaster(
         sc.getConf.get("spark.driver.host"),
         sc.getConf.get("spark.driver.port"),
         isClusterMode = true)
-      registerAM(sc.ui.map(_.appUIAddress).getOrElse(""), securityMgr)
+      registerAM(rpcEnv, sc.ui.map(_.appUIAddress).getOrElse(""), securityMgr)
       userClassThread.join()
     }
   }
@@ -289,7 +295,7 @@ private[spark] class ApplicationMaster(
     rpcEnv = RpcEnv.create("sparkYarnAM", Utils.localHostName, port, sparkConf, securityMgr)
     waitForSparkDriver()
     addAmIpFilter()
-    registerAM(sparkConf.get("spark.driver.appUIAddress", ""), securityMgr)
+    registerAM(rpcEnv, sparkConf.get("spark.driver.appUIAddress", ""), securityMgr)
 
     // In client mode the actor will stop the reporter thread.
     reporterThread.join()
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
index 8a08f561a2df2..21193e7c625e3 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
@@ -34,10 +34,8 @@ import org.apache.hadoop.yarn.util.RackResolver
 
 import org.apache.log4j.{Level, Logger}
 
-import org.apache.spark.{SparkEnv, Logging, SecurityManager, SparkConf}
+import org.apache.spark.{Logging, SecurityManager, SparkConf}
 import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
-import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
-import org.apache.spark.util.AkkaUtils
 
 /**
  * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding
@@ -53,6 +51,7 @@ import org.apache.spark.util.AkkaUtils
  * synchronized.
  */
 private[yarn] class YarnAllocator(
+    driverUrl: String,
     conf: Configuration,
     sparkConf: SparkConf,
     amClient: AMRMClient[ContainerRequest],
@@ -107,13 +106,6 @@ private[yarn] class YarnAllocator(
     new ThreadFactoryBuilder().setNameFormat("ContainerLauncher #%d").setDaemon(true).build())
   launcherPool.allowCoreThreadTimeOut(true)
 
-  private val driverUrl = AkkaUtils.address(
-    AkkaUtils.protocol(securityMgr.akkaSSLOptions.enabled),
-    SparkEnv.driverActorSystemName,
-    sparkConf.get("spark.driver.host"),
-    sparkConf.get("spark.driver.port"),
-    CoarseGrainedSchedulerBackend.ENDPOINT_NAME)
-
   // For testing
   private val launchContainers = sparkConf.getBoolean("spark.yarn.launchContainers", true)
 
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
index ffe71dfd7d257..7f533ee55e8bb 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
@@ -55,6 +55,7 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg
    * @param uiHistoryAddress Address of the application on the History Server.
    */
   def register(
+      driverUrl: String,
       conf: YarnConfiguration,
       sparkConf: SparkConf,
       preferredNodeLocations: Map[String, Set[SplitInfo]],
@@ -72,7 +73,7 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg
       amClient.registerApplicationMaster(Utils.localHostName(), 0, uiAddress)
       registered = true
     }
-    new YarnAllocator(conf, sparkConf, amClient, getAttemptId(), args, securityMgr)
+    new YarnAllocator(driverUrl, conf, sparkConf, amClient, getAttemptId(), args, securityMgr)
   }
 
   /**
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
index 455f1019d86dd..b343cbb0c7569 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
@@ -90,6 +90,7 @@ class YarnAllocatorSuite extends FunSuite with Matchers with BeforeAndAfterEach
       "--jar", "somejar.jar",
       "--class", "SomeClass")
     new YarnAllocator(
+      "not used",
       conf,
       sparkConf,
       rmClient,

From 836a75898fdc4b10d4d00676ef29e24cc96f09fd Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Tue, 26 May 2015 15:51:31 -0700
Subject: [PATCH 187/525] [SPARK-7748] [MLLIB] Graduate spark.ml from alpha

With descent coverage of feature transformers, algorithms, and model tuning support, it is time to graduate `spark.ml` from alpha. This PR changes all `AlphaComponent` annotations to either `DeveloperApi` or `Experimental`, depending on whether we expect a class/method to be used by end users (who use the pipeline API to assemble/tune their ML pipelines but not to create new pipeline components.) `UnaryTransformer` becomes a `DeveloperApi` in this PR.

jkbradley harsha2010

Author: Xiangrui Meng <meng@databricks.com>

Closes #6417 from mengxr/SPARK-7748 and squashes the following commits:

effbccd [Xiangrui Meng] organize imports
c15028e [Xiangrui Meng] added missing docs
1b2e5f8 [Xiangrui Meng] update package doc
73ca791 [Xiangrui Meng] alpha -> ex/dev for the rest
93819db [Xiangrui Meng] alpha -> ex/dev in ml.param
55ca073 [Xiangrui Meng] alpha -> ex/dev in ml.feature
83572f1 [Xiangrui Meng] add Experimental and DeveloperApi tags (wip)
---
 .../scala/org/apache/spark/ml/Estimator.scala |  8 +--
 .../scala/org/apache/spark/ml/Model.scala     |  6 +-
 .../scala/org/apache/spark/ml/Pipeline.scala  | 14 ++---
 .../scala/org/apache/spark/ml/Predictor.scala |  3 -
 .../org/apache/spark/ml/Transformer.scala     | 10 ++--
 .../spark/ml/attribute/AttributeGroup.scala   |  9 ++-
 .../spark/ml/attribute/AttributeType.scala    |  5 ++
 .../spark/ml/attribute/attributes.scala       | 29 ++++++++-
 .../DecisionTreeClassifier.scala              | 15 +++--
 .../ml/classification/GBTClassifier.scala     | 15 +++--
 .../classification/LogisticRegression.scala   | 19 +++---
 .../spark/ml/classification/OneVsRest.scala   |  7 +--
 .../RandomForestClassifier.scala              | 15 +++--
 .../BinaryClassificationEvaluator.scala       |  8 +--
 .../spark/ml/evaluation/Evaluator.scala       |  6 +-
 .../ml/evaluation/RegressionEvaluator.scala   |  7 +--
 .../apache/spark/ml/feature/Binarizer.scala   |  6 +-
 .../apache/spark/ml/feature/Bucketizer.scala  |  6 +-
 .../spark/ml/feature/ElementwiseProduct.scala |  6 +-
 .../apache/spark/ml/feature/HashingTF.scala   | 10 ++--
 .../org/apache/spark/ml/feature/IDF.scala     | 10 ++--
 .../apache/spark/ml/feature/Normalizer.scala  |  6 +-
 .../spark/ml/feature/OneHotEncoder.scala      |  7 ++-
 .../ml/feature/PolynomialExpansion.scala      |  6 +-
 .../spark/ml/feature/StandardScaler.scala     | 10 ++--
 .../spark/ml/feature/StringIndexer.scala      | 10 ++--
 .../apache/spark/ml/feature/Tokenizer.scala   | 10 ++--
 .../spark/ml/feature/VectorAssembler.scala    |  6 +-
 .../spark/ml/feature/VectorIndexer.scala      | 12 ++--
 .../apache/spark/ml/feature/Word2Vec.scala    | 10 ++--
 .../org/apache/spark/ml/package-info.java     |  6 +-
 .../scala/org/apache/spark/ml/package.scala   |  2 +-
 .../org/apache/spark/ml/param/params.scala    | 59 ++++++++++++++-----
 .../apache/spark/ml/recommendation/ALS.scala  | 14 ++++-
 .../ml/regression/DecisionTreeRegressor.scala | 15 +++--
 .../spark/ml/regression/GBTRegressor.scala    | 14 ++---
 .../ml/regression/LinearRegression.scala      | 12 ++--
 .../ml/regression/RandomForestRegressor.scala | 15 +++--
 .../scala/org/apache/spark/ml/tree/Node.scala |  8 ++-
 .../org/apache/spark/ml/tree/Split.scala      |  7 +++
 .../org/apache/spark/ml/tree/treeParams.scala |  9 ---
 .../spark/ml/tuning/CrossValidator.scala      | 10 ++--
 .../spark/ml/tuning/ParamGridBuilder.scala    |  6 +-
 43 files changed, 267 insertions(+), 201 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
index 9e16e60270141..e9a5d7c0e7988 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
@@ -19,15 +19,15 @@ package org.apache.spark.ml
 
 import scala.annotation.varargs
 
-import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.param.{ParamMap, ParamPair, Params}
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.ml.param.{ParamMap, ParamPair}
 import org.apache.spark.sql.DataFrame
 
 /**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
  * Abstract class for estimators that fit models to data.
  */
-@AlphaComponent
+@DeveloperApi
 abstract class Estimator[M <: Model[M]] extends PipelineStage {
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Model.scala b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
index 70e7495ac616c..186bf7ae7a2f6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Model.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
@@ -17,16 +17,16 @@
 
 package org.apache.spark.ml
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.ml.param.ParamMap
 
 /**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
  * A fitted model, i.e., a [[Transformer]] produced by an [[Estimator]].
  *
  * @tparam M model type
  */
-@AlphaComponent
+@DeveloperApi
 abstract class Model[M <: Model[M]] extends Transformer {
   /**
    * The parent estimator that produced this model.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index 43bee1b770e67..9da3ff65c744e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -20,17 +20,17 @@ package org.apache.spark.ml
 import scala.collection.mutable.ListBuffer
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
+import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.ml.param.{Param, ParamMap, Params}
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.types.StructType
 
 /**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
  * A stage in a pipeline, either an [[Estimator]] or a [[Transformer]].
  */
-@AlphaComponent
+@DeveloperApi
 abstract class PipelineStage extends Params with Logging {
 
   /**
@@ -69,7 +69,7 @@ abstract class PipelineStage extends Params with Logging {
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * A simple pipeline, which acts as an estimator. A Pipeline consists of a sequence of stages, each
  * of which is either an [[Estimator]] or a [[Transformer]]. When [[Pipeline#fit]] is called, the
  * stages are executed in order. If a stage is an [[Estimator]], its [[Estimator#fit]] method will
@@ -80,7 +80,7 @@ abstract class PipelineStage extends Params with Logging {
  * transformers, corresponding to the pipeline stages. If there are no stages, the pipeline acts as
  * an identity transformer.
  */
-@AlphaComponent
+@Experimental
 class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
 
   def this() = this(Identifiable.randomUID("pipeline"))
@@ -169,10 +169,10 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Represents a fitted pipeline.
  */
-@AlphaComponent
+@Experimental
 class PipelineModel private[ml] (
     override val uid: String,
     val stages: Array[Transformer])
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
index ec0f76aa668bd..e752b81a14282 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
@@ -58,7 +58,6 @@ private[ml] trait PredictorParams extends Params
 
 /**
  * :: DeveloperApi ::
- *
  * Abstraction for prediction problems (regression and classification).
  *
  * @tparam FeaturesType  Type of features.
@@ -113,7 +112,6 @@ abstract class Predictor[
    *
    * The default value is VectorUDT, but it may be overridden if FeaturesType is not Vector.
    */
-  @DeveloperApi
   private[ml] def featuresDataType: DataType = new VectorUDT
 
   override def transformSchema(schema: StructType): StructType = {
@@ -134,7 +132,6 @@ abstract class Predictor[
 
 /**
  * :: DeveloperApi ::
- *
  * Abstraction for a model for prediction tasks (regression and classification).
  *
  * @tparam FeaturesType  Type of features.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
index 38bb6a5a5391e..f07f733a5ddb5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml
 import scala.annotation.varargs
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.sql.DataFrame
@@ -28,10 +28,10 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 
 /**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
  * Abstract class for transformers that transform one dataset into another.
  */
-@AlphaComponent
+@DeveloperApi
 abstract class Transformer extends PipelineStage {
 
   /**
@@ -73,10 +73,12 @@ abstract class Transformer extends PipelineStage {
 }
 
 /**
+ * :: DeveloperApi ::
  * Abstract class for transformers that take one input column, apply transformation, and output the
  * result as a new column.
  */
-private[ml] abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]]
+@DeveloperApi
+abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]]
   extends Transformer with HasInputCol with HasOutputCol with Logging {
 
   /** @group setParam */
diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala
index f5f37aa77929c..457c15830fd38 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala
@@ -19,10 +19,12 @@ package org.apache.spark.ml.attribute
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.mllib.linalg.VectorUDT
 import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}
 
 /**
+ * :: DeveloperApi ::
  * Attributes that describe a vector ML column.
  *
  * @param name name of the attribute group (the ML column name)
@@ -31,6 +33,7 @@ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}
  * @param attrs optional array of attributes. Attribute will be copied with their corresponding
  *              indices in the array.
  */
+@DeveloperApi
 class AttributeGroup private (
     val name: String,
     val numAttributes: Option[Int],
@@ -182,7 +185,11 @@ class AttributeGroup private (
   }
 }
 
-/** Factory methods to create attribute groups. */
+/**
+ * :: DeveloperApi ::
+ * Factory methods to create attribute groups.
+ */
+@DeveloperApi
 object AttributeGroup {
 
   import AttributeKeys._
diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala
index a83febd7de2cc..5c7089b491677 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala
@@ -17,12 +17,17 @@
 
 package org.apache.spark.ml.attribute
 
+import org.apache.spark.annotation.DeveloperApi
+
 /**
+ * :: DeveloperApi ::
  * An enum-like type for attribute types: [[AttributeType$#Numeric]], [[AttributeType$#Nominal]],
  * and [[AttributeType$#Binary]].
  */
+@DeveloperApi
 sealed abstract class AttributeType(val name: String)
 
+@DeveloperApi
 object AttributeType {
 
   /** Numeric type. */
diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala
index e8f7f152784a1..ce43a450daad0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala
@@ -19,11 +19,14 @@ package org.apache.spark.ml.attribute
 
 import scala.annotation.varargs
 
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.types.{DoubleType, Metadata, MetadataBuilder, StructField}
 
 /**
+ * :: DeveloperApi ::
  * Abstract class for ML attributes.
  */
+@DeveloperApi
 sealed abstract class Attribute extends Serializable {
 
   name.foreach { n =>
@@ -135,6 +138,10 @@ private[attribute] trait AttributeFactory {
   }
 }
 
+/**
+ * :: DeveloperApi ::
+ */
+@DeveloperApi
 object Attribute extends AttributeFactory {
 
   private[attribute] override def fromMetadata(metadata: Metadata): Attribute = {
@@ -163,6 +170,7 @@ object Attribute extends AttributeFactory {
 
 
 /**
+ * :: DeveloperApi ::
  * A numeric attribute with optional summary statistics.
  * @param name optional name
  * @param index optional index
@@ -171,6 +179,7 @@ object Attribute extends AttributeFactory {
  * @param std optional standard deviation
  * @param sparsity optional sparsity (ratio of zeros)
  */
+@DeveloperApi
 class NumericAttribute private[ml] (
     override val name: Option[String] = None,
     override val index: Option[Int] = None,
@@ -278,8 +287,10 @@ class NumericAttribute private[ml] (
 }
 
 /**
+ * :: DeveloperApi ::
  * Factory methods for numeric attributes.
  */
+@DeveloperApi
 object NumericAttribute extends AttributeFactory {
 
   /** The default numeric attribute. */
@@ -298,6 +309,7 @@ object NumericAttribute extends AttributeFactory {
 }
 
 /**
+ * :: DeveloperApi ::
  * A nominal attribute.
  * @param name optional name
  * @param index optional index
@@ -306,6 +318,7 @@ object NumericAttribute extends AttributeFactory {
  *                  defined.
  * @param values optional values. At most one of `numValues` and `values` can be defined.
  */
+@DeveloperApi
 class NominalAttribute private[ml] (
     override val name: Option[String] = None,
     override val index: Option[Int] = None,
@@ -430,7 +443,11 @@ class NominalAttribute private[ml] (
   }
 }
 
-/** Factory methods for nominal attributes. */
+/**
+ * :: DeveloperApi ::
+ * Factory methods for nominal attributes.
+ */
+@DeveloperApi
 object NominalAttribute extends AttributeFactory {
 
   /** The default nominal attribute. */
@@ -450,11 +467,13 @@ object NominalAttribute extends AttributeFactory {
 }
 
 /**
+ * :: DeveloperApi ::
  * A binary attribute.
  * @param name optional name
  * @param index optional index
  * @param values optionla values. If set, its size must be 2.
  */
+@DeveloperApi
 class BinaryAttribute private[ml] (
     override val name: Option[String] = None,
     override val index: Option[Int] = None,
@@ -526,7 +545,11 @@ class BinaryAttribute private[ml] (
   }
 }
 
-/** Factory methods for binary attributes. */
+/**
+ * :: DeveloperApi ::
+ * Factory methods for binary attributes.
+ */
+@DeveloperApi
 object BinaryAttribute extends AttributeFactory {
 
   /** The default binary attribute. */
@@ -543,8 +566,10 @@ object BinaryAttribute extends AttributeFactory {
 }
 
 /**
+ * :: DeveloperApi ::
  * An unresolved attribute.
  */
+@DeveloperApi
 object UnresolvedAttribute extends Attribute {
 
   override def attrType: AttributeType = AttributeType.Unresolved
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 7c961332bf5b6..8030e0728a56c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.ml.classification
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.tree.{TreeClassifierParams, DecisionTreeParams, DecisionTreeModel, Node}
+import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeClassifierParams}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -31,14 +31,13 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] learning algorithm
  * for classification.
  * It supports both binary and multiclass labels, as well as both continuous and categorical
  * features.
  */
-@AlphaComponent
+@Experimental
 final class DecisionTreeClassifier(override val uid: String)
   extends Predictor[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel]
   with DecisionTreeParams with TreeClassifierParams {
@@ -89,19 +88,19 @@ final class DecisionTreeClassifier(override val uid: String)
   }
 }
 
+@Experimental
 object DecisionTreeClassifier {
   /** Accessor for supported impurities: entropy, gini */
   final val supportedImpurities: Array[String] = TreeClassifierParams.supportedImpurities
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] model for classification.
  * It supports both binary and multiclass labels, as well as both continuous and categorical
  * features.
  */
-@AlphaComponent
+@Experimental
 final class DecisionTreeClassificationModel private[ml] (
     override val uid: String,
     override val rootNode: Node)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index d504d84beb91e..d8592eb2d947d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -20,11 +20,11 @@ package org.apache.spark.ml.classification
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.{Param, ParamMap}
 import org.apache.spark.ml.regression.DecisionTreeRegressionModel
-import org.apache.spark.ml.tree.{GBTParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel}
+import org.apache.spark.ml.tree.{DecisionTreeModel, GBTParams, TreeClassifierParams, TreeEnsembleModel}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -36,14 +36,13 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
  * learning algorithm for classification.
  * It supports binary labels, as well as both continuous and categorical features.
  * Note: Multiclass labels are not currently supported.
  */
-@AlphaComponent
+@Experimental
 final class GBTClassifier(override val uid: String)
   extends Predictor[Vector, GBTClassifier, GBTClassificationModel]
   with GBTParams with TreeClassifierParams with Logging {
@@ -144,6 +143,7 @@ final class GBTClassifier(override val uid: String)
   }
 }
 
+@Experimental
 object GBTClassifier {
   // The losses below should be lowercase.
   /** Accessor for supported loss settings: logistic */
@@ -151,8 +151,7 @@ object GBTClassifier {
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
  * model for classification.
  * It supports binary labels, as well as both continuous and categorical features.
@@ -160,7 +159,7 @@ object GBTClassifier {
  * @param _trees  Decision trees in the ensemble.
  * @param _treeWeights  Weights for the decision trees in the ensemble.
  */
-@AlphaComponent
+@Experimental
 final class GBTClassificationModel(
     override val uid: String,
     private val _trees: Array[DecisionTreeRegressionModel],
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 8694c96e4c5b6..d13109d9da4c0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -19,11 +19,11 @@ package org.apache.spark.ml.classification
 
 import scala.collection.mutable
 
-import breeze.linalg.{norm => brzNorm, DenseVector => BDV}
-import breeze.optimize.{LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
-import breeze.optimize.{CachedDiffFunction, DiffFunction}
+import breeze.linalg.{DenseVector => BDV, norm => brzNorm}
+import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.{Logging, SparkException}
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.Identifiable
@@ -35,7 +35,6 @@ import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.{SparkException, Logging}
 
 /**
  * Params for logistic regression.
@@ -45,12 +44,11 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
   with HasThreshold
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Logistic regression.
  * Currently, this class only supports binary classification.
  */
-@AlphaComponent
+@Experimental
 class LogisticRegression(override val uid: String)
   extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel]
   with LogisticRegressionParams with Logging {
@@ -221,11 +219,10 @@ class LogisticRegression(override val uid: String)
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Model produced by [[LogisticRegression]].
  */
-@AlphaComponent
+@Experimental
 class LogisticRegressionModel private[ml] (
     override val uid: String,
     val weights: Vector,
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index 1543f051ccd17..36735cd834cd4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -21,7 +21,7 @@ import java.util.UUID
 
 import scala.language.existentials
 
-import org.apache.spark.annotation.{AlphaComponent, Experimental}
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml._
 import org.apache.spark.ml.attribute._
 import org.apache.spark.ml.param.Param
@@ -54,8 +54,7 @@ private[ml] trait OneVsRestParams extends PredictorParams {
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Model produced by [[OneVsRest]].
  * This stores the models resulting from training k binary classifiers: one for each class.
  * Each example is scored against all k models, and the model with the highest score
@@ -67,7 +66,7 @@ private[ml] trait OneVsRestParams extends PredictorParams {
  *               The i-th model is produced by testing the i-th class (taking label 1) vs the rest
  *               (taking label 0).
  */
-@AlphaComponent
+@Experimental
 final class OneVsRestModel private[ml] (
     override val uid: String,
     labelMetadata: Metadata,
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index a1de7919859eb..67600ebd7b38e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -19,10 +19,10 @@ package org.apache.spark.ml.classification
 
 import scala.collection.mutable
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.tree.{RandomForestParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel}
+import org.apache.spark.ml.tree.{DecisionTreeModel, RandomForestParams, TreeClassifierParams, TreeEnsembleModel}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -33,14 +33,13 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Random_forest  Random Forest]] learning algorithm for
  * classification.
  * It supports both binary and multiclass labels, as well as both continuous and categorical
  * features.
  */
-@AlphaComponent
+@Experimental
 final class RandomForestClassifier(override val uid: String)
   extends Predictor[Vector, RandomForestClassifier, RandomForestClassificationModel]
   with RandomForestParams with TreeClassifierParams {
@@ -100,6 +99,7 @@ final class RandomForestClassifier(override val uid: String)
   }
 }
 
+@Experimental
 object RandomForestClassifier {
   /** Accessor for supported impurity settings: entropy, gini */
   final val supportedImpurities: Array[String] = TreeClassifierParams.supportedImpurities
@@ -110,15 +110,14 @@ object RandomForestClassifier {
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Random_forest  Random Forest]] model for classification.
  * It supports both binary and multiclass labels, as well as both continuous and categorical
  * features.
  * @param _trees  Decision trees in the ensemble.
  *               Warning: These have null parents.
  */
-@AlphaComponent
+@Experimental
 final class RandomForestClassificationModel private[ml] (
     override val uid: String,
     private val _trees: Array[DecisionTreeClassificationModel])
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index ddbdd00ceb159..f695ddaeefc72 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.evaluation
 
-import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.evaluation.Evaluator
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
@@ -28,11 +27,10 @@ import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.sql.types.DoubleType
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Evaluator for binary classification, which expects two input columns: score and label.
  */
-@AlphaComponent
+@Experimental
 class BinaryClassificationEvaluator(override val uid: String)
   extends Evaluator with HasRawPredictionCol with HasLabelCol {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
index cabd1c97c085c..61e937e693699 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
@@ -17,15 +17,15 @@
 
 package org.apache.spark.ml.evaluation
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.ml.param.{ParamMap, Params}
 import org.apache.spark.sql.DataFrame
 
 /**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
  * Abstract class for evaluators that compute metrics from predictions.
  */
-@AlphaComponent
+@DeveloperApi
 abstract class Evaluator extends Params {
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
index 80458928c5439..1771177e1ea91 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.evaluation
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.param.{Param, ParamValidators}
 import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
@@ -26,11 +26,10 @@ import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.sql.types.DoubleType
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Evaluator for regression, which expects two input columns: prediction and label.
  */
-@AlphaComponent
+@Experimental
 final class RegressionEvaluator(override val uid: String)
   extends Evaluator with HasPredictionCol with HasLabelCol {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
index 62f4a6343423e..b06122d733853 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.attribute.BinaryAttribute
 import org.apache.spark.ml.param._
@@ -28,10 +28,10 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{DoubleType, StructType}
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Binarize a column of continuous features given a threshold.
  */
-@AlphaComponent
+@Experimental
 final class Binarizer(override val uid: String)
   extends Transformer with HasInputCol with HasOutputCol {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index ac8dfb5632a7b..a3d1f6f65ccaf 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.feature
 import java.{util => ju}
 
 import org.apache.spark.SparkException
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.Model
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.param._
@@ -31,10 +31,10 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * `Bucketizer` maps a column of continuous features to a column of feature buckets.
  */
-@AlphaComponent
+@Experimental
 final class Bucketizer(override val uid: String)
   extends Model[Bucketizer] with HasInputCol with HasOutputCol {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
index 8b32eee0e490a..3ae1833390152 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.UnaryTransformer
 import org.apache.spark.ml.param.Param
 import org.apache.spark.ml.util.Identifiable
@@ -26,12 +26,12 @@ import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
 import org.apache.spark.sql.types.DataType
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a
  * provided "weight" vector.  In other words, it scales each column of the dataset by a scalar
  * multiplier.
  */
-@AlphaComponent
+@Experimental
 class ElementwiseProduct(override val uid: String)
   extends UnaryTransformer[Vector, Vector, ElementwiseProduct] {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 8942d45219177..f936aef80f8af 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -17,22 +17,22 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.attribute.AttributeGroup
-import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.param.{IntParam, ParamValidators}
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
 import org.apache.spark.mllib.feature
 import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions.{udf, col}
+import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types.{ArrayType, StructType}
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Maps a sequence of terms to their term frequencies using the hashing trick.
  */
-@AlphaComponent
+@Experimental
 class HashingTF(override val uid: String) extends Transformer with HasInputCol with HasOutputCol {
 
   def this() = this(Identifiable.randomUID("hashingTF"))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
index 788c392050c2d..376b84530cd57 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml._
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
@@ -58,10 +58,10 @@ private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Compute the Inverse Document Frequency (IDF) given a collection of documents.
  */
-@AlphaComponent
+@Experimental
 final class IDF(override val uid: String) extends Estimator[IDFModel] with IDFBase {
 
   def this() = this(Identifiable.randomUID("idf"))
@@ -85,10 +85,10 @@ final class IDF(override val uid: String) extends Estimator[IDFModel] with IDFBa
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Model fitted by [[IDF]].
  */
-@AlphaComponent
+@Experimental
 class IDFModel private[ml] (
     override val uid: String,
     idfModel: feature.IDFModel)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
index 3f689d1585cd6..8282e5ffa17f7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.UnaryTransformer
 import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
 import org.apache.spark.ml.util.Identifiable
@@ -26,10 +26,10 @@ import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
 import org.apache.spark.sql.types.DataType
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Normalize a vector to have unit norm using the given p-norm.
  */
-@AlphaComponent
+@Experimental
 class Normalizer(override val uid: String) extends UnaryTransformer[Vector, Vector, Normalizer] {
 
   def this() = this(Identifiable.randomUID("normalizer"))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
index 1fb9b9ae75091..eb6ec49f854be 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
@@ -18,16 +18,17 @@
 package org.apache.spark.ml.feature
 
 import org.apache.spark.SparkException
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.UnaryTransformer
 import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribute}
-import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
 import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
 
 /**
+ * :: Experimental ::
  * A one-hot encoder that maps a column of label indices to a column of binary vectors, with
  * at most a single one-value. By default, the binary vector has an element for each category, so
  * with 5 categories, an input value of 2.0 would map to an output vector of
@@ -36,7 +37,7 @@ import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
  * of 0.0 would map to a vector of all zeros. Including the first category makes the vector columns
  * linearly dependent because they sum up to one.
  */
-@AlphaComponent
+@Experimental
 class OneHotEncoder(override val uid: String)
   extends UnaryTransformer[Double, Vector, OneHotEncoder] with HasInputCol with HasOutputCol {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 8ddf9d6a1e138..442e95820217a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ml.feature
 
 import scala.collection.mutable
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.UnaryTransformer
 import org.apache.spark.ml.param.{IntParam, ParamValidators}
 import org.apache.spark.ml.util.Identifiable
@@ -27,14 +27,14 @@ import org.apache.spark.mllib.linalg._
 import org.apache.spark.sql.types.DataType
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion,
  * which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an
  * expansion of a product of sums expresses it as a sum of products by using the fact that
  * multiplication distributes over addition". Take a 2-variable feature vector as an example:
  * `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
  */
-@AlphaComponent
+@Experimental
 class PolynomialExpansion(override val uid: String)
   extends UnaryTransformer[Vector, Vector, PolynomialExpansion] {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 5ccda15d872ed..fdd2494fc87a6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml._
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
@@ -51,11 +51,11 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Standardizes features by removing the mean and scaling to unit variance using column summary
  * statistics on the samples in the training set.
  */
-@AlphaComponent
+@Experimental
 class StandardScaler(override val uid: String) extends Estimator[StandardScalerModel]
   with StandardScalerParams {
 
@@ -95,10 +95,10 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Model fitted by [[StandardScaler]].
  */
-@AlphaComponent
+@Experimental
 class StandardScalerModel private[ml] (
     override val uid: String,
     scaler: feature.StandardScalerModel)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index 3f79b67309f07..a2dc8a8b960c5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.feature
 
 import org.apache.spark.SparkException
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.param._
@@ -52,13 +52,13 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * A label indexer that maps a string column of labels to an ML column of label indices.
  * If the input column is numeric, we cast it to string and index the string values.
  * The indices are in [0, numLabels), ordered by label frequencies.
  * So the most frequent label gets index 0.
  */
-@AlphaComponent
+@Experimental
 class StringIndexer(override val uid: String) extends Estimator[StringIndexerModel]
   with StringIndexerBase {
 
@@ -86,10 +86,10 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Model fitted by [[StringIndexer]].
  */
-@AlphaComponent
+@Experimental
 class StringIndexerModel private[ml] (
     override val uid: String,
     labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 31f3a1aa4c76b..21c15b6c33f6c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -17,19 +17,19 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.UnaryTransformer
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * A tokenizer that converts the input string to lowercase and then splits it by white spaces.
  *
  * @see [[RegexTokenizer]]
  */
-@AlphaComponent
+@Experimental
 class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[String], Tokenizer] {
 
   def this() = this(Identifiable.randomUID("tok"))
@@ -46,13 +46,13 @@ class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[S
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * A regex based tokenizer that extracts tokens either by using the provided regex pattern to split
  * the text (default) or repeatedly matching the regex (if `gaps` is true).
  * Optional parameters also allow filtering tokens using a minimal length.
  * It returns an array of strings that can be empty.
  */
-@AlphaComponent
+@Experimental
 class RegexTokenizer(override val uid: String)
   extends UnaryTransformer[String, Seq[String], RegexTokenizer] {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 181b62f46fce8..514ffb03c0509 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.feature
 import scala.collection.mutable.ArrayBuilder
 
 import org.apache.spark.SparkException
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.Identifiable
@@ -30,10 +30,10 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * A feature transformer that merges multiple columns into a vector column.
  */
-@AlphaComponent
+@Experimental
 class VectorAssembler(override val uid: String)
   extends Transformer with HasInputCols with HasOutputCol {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index e238fb310ed37..1d0f23b4fb3db 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -22,7 +22,7 @@ import java.util.{Map => JMap}
 
 import scala.collection.JavaConverters._
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.attribute._
 import org.apache.spark.ml.param.{IntParam, ParamValidators, Params}
@@ -56,8 +56,7 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Class for indexing categorical feature columns in a dataset of [[Vector]].
  *
  * This has 2 usage modes:
@@ -91,7 +90,7 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
  *  - Add warning if a categorical feature has only 1 category.
  *  - Add option for allowing unknown categories.
  */
-@AlphaComponent
+@Experimental
 class VectorIndexer(override val uid: String) extends Estimator[VectorIndexerModel]
   with VectorIndexerParams {
 
@@ -230,8 +229,7 @@ private object VectorIndexer {
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Transform categorical features to use 0-based indices instead of their original values.
  *  - Categorical features are mapped to indices.
  *  - Continuous features (columns) are left unchanged.
@@ -246,7 +244,7 @@ private object VectorIndexer {
  *                      Values are maps from original features values to 0-based category indices.
  *                      If a feature is not in this map, it is treated as continuous.
  */
-@AlphaComponent
+@Experimental
 class VectorIndexerModel private[ml] (
     override val uid: String,
     val numFeatures: Int,
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index ed032669229ce..36f19509f0cfb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
@@ -82,11 +82,11 @@ private[feature] trait Word2VecBase extends Params
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further
  * natural language processing or machine learning process.
  */
-@AlphaComponent
+@Experimental
 final class Word2Vec(override val uid: String) extends Estimator[Word2VecModel] with Word2VecBase {
 
   def this() = this(Identifiable.randomUID("w2v"))
@@ -135,10 +135,10 @@ final class Word2Vec(override val uid: String) extends Estimator[Word2VecModel]
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Model fitted by [[Word2Vec]].
  */
-@AlphaComponent
+@Experimental
 class Word2VecModel private[ml] (
     override val uid: String,
     wordVectors: feature.Word2VecModel)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/package-info.java
index 00d9c802e930d..87f4223964ada 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/package-info.java
+++ b/mllib/src/main/scala/org/apache/spark/ml/package-info.java
@@ -16,10 +16,10 @@
  */
 
 /**
- * Spark ML is an ALPHA component that adds a new set of machine learning APIs to let users quickly
+ * Spark ML is a BETA component that adds a new set of machine learning APIs to let users quickly
  * assemble and configure practical machine learning pipelines.
  */
-@AlphaComponent
+@Experimental
 package org.apache.spark.ml;
 
-import org.apache.spark.annotation.AlphaComponent;
+import org.apache.spark.annotation.Experimental;
diff --git a/mllib/src/main/scala/org/apache/spark/ml/package.scala b/mllib/src/main/scala/org/apache/spark/ml/package.scala
index ac75e9de1a8f2..c589d06d9f7e4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/package.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/package.scala
@@ -18,7 +18,7 @@
 package org.apache.spark
 
 /**
- * Spark ML is an ALPHA component that adds a new set of machine learning APIs to let users quickly
+ * Spark ML is a BETA component that adds a new set of machine learning APIs to let users quickly
  * assemble and configure practical machine learning pipelines.
  *
  * @groupname param Parameters
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 12fc5b561f76e..1afa59c994c2b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -24,11 +24,11 @@ import scala.annotation.varargs
 import scala.collection.mutable
 import scala.collection.JavaConverters._
 
-import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
+import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.ml.util.Identifiable
 
 /**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
  * A param with self-contained documentation and optionally default value. Primitive-typed param
  * should use the specialized versions, which are more friendly to Java users.
  *
@@ -39,7 +39,7 @@ import org.apache.spark.ml.util.Identifiable
  *                See [[ParamValidators]] for factory methods for common validation functions.
  * @tparam T param value type
  */
-@AlphaComponent
+@DeveloperApi
 class Param[T](val parent: String, val name: String, val doc: String, val isValid: T => Boolean)
   extends Serializable {
 
@@ -174,7 +174,11 @@ object ParamValidators {
 
 // specialize primitive-typed params because Java doesn't recognize scala.Double, scala.Int, ...
 
-/** Specialized version of [[Param[Double]]] for Java. */
+/**
+ * :: DeveloperApi ::
+ * Specialized version of [[Param[Double]]] for Java.
+ */
+@DeveloperApi
 class DoubleParam(parent: String, name: String, doc: String, isValid: Double => Boolean)
   extends Param[Double](parent, name, doc, isValid) {
 
@@ -189,7 +193,11 @@ class DoubleParam(parent: String, name: String, doc: String, isValid: Double =>
   override def w(value: Double): ParamPair[Double] = super.w(value)
 }
 
-/** Specialized version of [[Param[Int]]] for Java. */
+/**
+ * :: DeveloperApi ::
+ * Specialized version of [[Param[Int]]] for Java.
+ */
+@DeveloperApi
 class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolean)
   extends Param[Int](parent, name, doc, isValid) {
 
@@ -204,7 +212,11 @@ class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolea
   override def w(value: Int): ParamPair[Int] = super.w(value)
 }
 
-/** Specialized version of [[Param[Float]]] for Java. */
+/**
+ * :: DeveloperApi ::
+ * Specialized version of [[Param[Float]]] for Java.
+ */
+@DeveloperApi
 class FloatParam(parent: String, name: String, doc: String, isValid: Float => Boolean)
   extends Param[Float](parent, name, doc, isValid) {
 
@@ -219,7 +231,11 @@ class FloatParam(parent: String, name: String, doc: String, isValid: Float => Bo
   override def w(value: Float): ParamPair[Float] = super.w(value)
 }
 
-/** Specialized version of [[Param[Long]]] for Java. */
+/**
+ * :: DeveloperApi ::
+ * Specialized version of [[Param[Long]]] for Java.
+ */
+@DeveloperApi
 class LongParam(parent: String, name: String, doc: String, isValid: Long => Boolean)
   extends Param[Long](parent, name, doc, isValid) {
 
@@ -234,7 +250,11 @@ class LongParam(parent: String, name: String, doc: String, isValid: Long => Bool
   override def w(value: Long): ParamPair[Long] = super.w(value)
 }
 
-/** Specialized version of [[Param[Boolean]]] for Java. */
+/**
+ * :: DeveloperApi ::
+ * Specialized version of [[Param[Boolean]]] for Java.
+ */
+@DeveloperApi
 class BooleanParam(parent: String, name: String, doc: String) // No need for isValid
   extends Param[Boolean](parent, name, doc) {
 
@@ -243,7 +263,11 @@ class BooleanParam(parent: String, name: String, doc: String) // No need for isV
   override def w(value: Boolean): ParamPair[Boolean] = super.w(value)
 }
 
-/** Specialized version of [[Param[Array[String]]]] for Java. */
+/**
+ * :: DeveloperApi ::
+ * Specialized version of [[Param[Array[String]]]] for Java.
+ */
+@DeveloperApi
 class StringArrayParam(parent: Params, name: String, doc: String, isValid: Array[String] => Boolean)
   extends Param[Array[String]](parent, name, doc, isValid) {
 
@@ -256,7 +280,11 @@ class StringArrayParam(parent: Params, name: String, doc: String, isValid: Array
   def w(value: java.util.List[String]): ParamPair[Array[String]] = w(value.asScala.toArray)
 }
 
-/** Specialized version of [[Param[Array[Double]]]] for Java. */
+/**
+ * :: DeveloperApi ::
+ * Specialized version of [[Param[Array[Double]]]] for Java.
+ */
+@DeveloperApi
 class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array[Double] => Boolean)
   extends Param[Array[Double]](parent, name, doc, isValid) {
 
@@ -270,8 +298,10 @@ class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array
 }
 
 /**
+ * :: Experimental ::
  * A param amd its value.
  */
+@Experimental
 case class ParamPair[T](param: Param[T], value: T) {
   // This is *the* place Param.validate is called.  Whenever a parameter is specified, we should
   // always construct a ParamPair so that validate is called.
@@ -279,11 +309,11 @@ case class ParamPair[T](param: Param[T], value: T) {
 }
 
 /**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
  * Trait for components that take parameters. This also provides an internal param map to store
  * parameter values attached to the instance.
  */
-@AlphaComponent
+@DeveloperApi
 trait Params extends Identifiable with Serializable {
 
   /**
@@ -541,10 +571,10 @@ trait Params extends Identifiable with Serializable {
 abstract class JavaParams extends Params
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * A param to value map.
  */
-@AlphaComponent
+@Experimental
 final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any])
   extends Serializable {
 
@@ -665,6 +695,7 @@ final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any])
   def size: Int = map.size
 }
 
+@Experimental
 object ParamMap {
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index 2a5ddbfae5cdf..900b637ff8ad4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -31,7 +31,7 @@ import org.apache.hadoop.fs.{FileSystem, Path}
 import org.netlib.util.intW
 
 import org.apache.spark.{Logging, Partitioner}
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
@@ -169,8 +169,10 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR
 }
 
 /**
+ * :: Experimental ::
  * Model fitted by ALS.
  */
+@Experimental
 class ALSModel private[ml] (
     override val uid: String,
     k: Int,
@@ -208,6 +210,7 @@ class ALSModel private[ml] (
 
 
 /**
+ * :: Experimental ::
  * Alternating Least Squares (ALS) matrix factorization.
  *
  * ALS attempts to estimate the ratings matrix `R` as the product of two lower-rank matrices,
@@ -236,6 +239,7 @@ class ALSModel private[ml] (
  * indicated user
  * preferences rather than explicit ratings given to items.
  */
+@Experimental
 class ALS(override val uid: String) extends Estimator[ALSModel] with ALSParams {
 
   import org.apache.spark.ml.recommendation.ALS.Rating
@@ -326,7 +330,11 @@ class ALS(override val uid: String) extends Estimator[ALSModel] with ALSParams {
 @DeveloperApi
 object ALS extends Logging {
 
-  /** Rating class for better code readability. */
+  /**
+   * :: DeveloperApi ::
+   * Rating class for better code readability.
+   */
+  @DeveloperApi
   case class Rating[@specialized(Int, Long) ID](user: ID, item: ID, rating: Float)
 
   /** Trait for least squares solvers applied to the normal equation. */
@@ -487,8 +495,10 @@ object ALS extends Logging {
   }
 
   /**
+   * :: DeveloperApi ::
    * Implementation of the ALS algorithm.
    */
+  @DeveloperApi
   def train[ID: ClassTag]( // scalastyle:ignore
       ratings: RDD[Rating[ID]],
       rank: Int = 10,
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index e67df21b2e4ae..43b68e7bb20fa 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.ml.regression
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.tree.{TreeRegressorParams, DecisionTreeParams, DecisionTreeModel, Node}
+import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeRegressorParams}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -31,13 +31,12 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] learning algorithm
  * for regression.
  * It supports both continuous and categorical features.
  */
-@AlphaComponent
+@Experimental
 final class DecisionTreeRegressor(override val uid: String)
   extends Predictor[Vector, DecisionTreeRegressor, DecisionTreeRegressionModel]
   with DecisionTreeParams with TreeRegressorParams {
@@ -79,19 +78,19 @@ final class DecisionTreeRegressor(override val uid: String)
   }
 }
 
+@Experimental
 object DecisionTreeRegressor {
   /** Accessor for supported impurities: variance */
   final val supportedImpurities: Array[String] = TreeRegressorParams.supportedImpurities
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] model for regression.
  * It supports both continuous and categorical features.
  * @param rootNode  Root of the decision tree
  */
-@AlphaComponent
+@Experimental
 final class DecisionTreeRegressionModel private[ml] (
     override val uid: String,
     override val rootNode: Node)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index 4249ff5c1ebc7..69f4f5414c8c6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -20,10 +20,10 @@ package org.apache.spark.ml.regression
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.{Param, ParamMap}
-import org.apache.spark.ml.tree.{GBTParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel}
+import org.apache.spark.ml.tree.{DecisionTreeModel, GBTParams, TreeEnsembleModel, TreeRegressorParams}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -35,13 +35,12 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
  * learning algorithm for regression.
  * It supports both continuous and categorical features.
  */
-@AlphaComponent
+@Experimental
 final class GBTRegressor(override val uid: String)
   extends Predictor[Vector, GBTRegressor, GBTRegressionModel]
   with GBTParams with TreeRegressorParams with Logging {
@@ -134,6 +133,7 @@ final class GBTRegressor(override val uid: String)
   }
 }
 
+@Experimental
 object GBTRegressor {
   // The losses below should be lowercase.
   /** Accessor for supported loss settings: squared (L2), absolute (L1) */
@@ -141,7 +141,7 @@ object GBTRegressor {
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  *
  * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
  * model for regression.
@@ -149,7 +149,7 @@ object GBTRegressor {
  * @param _trees  Decision trees in the ensemble.
  * @param _treeWeights  Weights for the decision trees in the ensemble.
  */
-@AlphaComponent
+@Experimental
 final class GBTRegressionModel(
     override val uid: String,
     private val _trees: Array[DecisionTreeRegressionModel],
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 3ebb78f79201a..7c40db1a40040 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -23,7 +23,7 @@ import breeze.linalg.{DenseVector => BDV, norm => brzNorm}
 import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol}
@@ -44,8 +44,7 @@ private[regression] trait LinearRegressionParams extends PredictorParams
   with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Linear regression.
  *
  * The learning objective is to minimize the squared error, with regularization.
@@ -58,7 +57,7 @@ private[regression] trait LinearRegressionParams extends PredictorParams
  *  - L1 (Lasso)
  *  - L2 + L1 (elastic net)
  */
-@AlphaComponent
+@Experimental
 class LinearRegression(override val uid: String)
   extends Regressor[Vector, LinearRegression, LinearRegressionModel]
   with LinearRegressionParams with Logging {
@@ -190,11 +189,10 @@ class LinearRegression(override val uid: String)
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * Model produced by [[LinearRegression]].
  */
-@AlphaComponent
+@Experimental
 class LinearRegressionModel private[ml] (
     override val uid: String,
     val weights: Vector,
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 82437aa8de294..ae767a17329d2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.ml.regression
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.tree.{RandomForestParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel}
+import org.apache.spark.ml.tree.{DecisionTreeModel, RandomForestParams, TreeEnsembleModel, TreeRegressorParams}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -31,12 +31,11 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Random_forest  Random Forest]] learning algorithm for regression.
  * It supports both continuous and categorical features.
  */
-@AlphaComponent
+@Experimental
 final class RandomForestRegressor(override val uid: String)
   extends Predictor[Vector, RandomForestRegressor, RandomForestRegressionModel]
   with RandomForestParams with TreeRegressorParams {
@@ -89,6 +88,7 @@ final class RandomForestRegressor(override val uid: String)
   }
 }
 
+@Experimental
 object RandomForestRegressor {
   /** Accessor for supported impurity settings: variance */
   final val supportedImpurities: Array[String] = TreeRegressorParams.supportedImpurities
@@ -99,13 +99,12 @@ object RandomForestRegressor {
 }
 
 /**
- * :: AlphaComponent ::
- *
+ * :: Experimental ::
  * [[http://en.wikipedia.org/wiki/Random_forest  Random Forest]] model for regression.
  * It supports both continuous and categorical features.
  * @param _trees  Decision trees in the ensemble.
  */
-@AlphaComponent
+@Experimental
 final class RandomForestRegressionModel private[ml] (
     override val uid: String,
     private val _trees: Array[DecisionTreeRegressionModel])
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
index d2dec0c76cb12..6a84176efb086 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
@@ -17,14 +17,16 @@
 
 package org.apache.spark.ml.tree
 
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.tree.model.{InformationGainStats => OldInformationGainStats,
   Node => OldNode, Predict => OldPredict}
 
-
 /**
+ * :: DeveloperApi ::
  * Decision tree node interface.
  */
+@DeveloperApi
 sealed abstract class Node extends Serializable {
 
   // TODO: Add aggregate stats (once available).  This will happen after we move the DecisionTree
@@ -89,10 +91,12 @@ private[ml] object Node {
 }
 
 /**
+ * :: DeveloperApi ::
  * Decision tree leaf node.
  * @param prediction  Prediction this node makes
  * @param impurity  Impurity measure at this node (for training data)
  */
+@DeveloperApi
 final class LeafNode private[ml] (
     override val prediction: Double,
     override val impurity: Double) extends Node {
@@ -118,6 +122,7 @@ final class LeafNode private[ml] (
 }
 
 /**
+ * :: DeveloperApi ::
  * Internal Decision Tree node.
  * @param prediction  Prediction this node would make if it were a leaf node
  * @param impurity  Impurity measure at this node (for training data)
@@ -127,6 +132,7 @@ final class LeafNode private[ml] (
  * @param rightChild  Right-hand child node
  * @param split  Information about the test used to split to the left or right child.
  */
+@DeveloperApi
 final class InternalNode private[ml] (
     override val prediction: Double,
     override val impurity: Double,
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala
index 90f1d052764d3..7acdeeee72d23 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala
@@ -17,15 +17,18 @@
 
 package org.apache.spark.ml.tree
 
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.tree.configuration.{FeatureType => OldFeatureType}
 import org.apache.spark.mllib.tree.model.{Split => OldSplit}
 
 
 /**
+ * :: DeveloperApi ::
  * Interface for a "Split," which specifies a test made at a decision tree node
  * to choose the left or right path.
  */
+@DeveloperApi
 sealed trait Split extends Serializable {
 
   /** Index of feature which this split tests */
@@ -52,12 +55,14 @@ private[tree] object Split {
 }
 
 /**
+ * :: DeveloperApi ::
  * Split which tests a categorical feature.
  * @param featureIndex  Index of the feature to test
  * @param _leftCategories  If the feature value is in this set of categories, then the split goes
  *                         left. Otherwise, it goes right.
  * @param numCategories  Number of categories for this feature.
  */
+@DeveloperApi
 final class CategoricalSplit private[ml] (
     override val featureIndex: Int,
     _leftCategories: Array[Double],
@@ -125,11 +130,13 @@ final class CategoricalSplit private[ml] (
 }
 
 /**
+ * :: DeveloperApi ::
  * Split which tests a continuous feature.
  * @param featureIndex  Index of the feature to test
  * @param threshold  If the feature value is <= this threshold, then the split goes left.
  *                    Otherwise, it goes right.
  */
+@DeveloperApi
 final class ContinuousSplit private[ml] (override val featureIndex: Int, val threshold: Double)
   extends Split {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 816fcedf2efb3..a0c5238d966bf 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.ml.tree
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasMaxIter, HasSeed}
@@ -26,12 +25,10 @@ import org.apache.spark.mllib.tree.impurity.{Entropy => OldEntropy, Gini => OldG
 import org.apache.spark.mllib.tree.loss.{Loss => OldLoss}
 
 /**
- * :: DeveloperApi ::
  * Parameters for Decision Tree-based algorithms.
  *
  * Note: Marked as private and DeveloperApi since this may be made public in the future.
  */
-@DeveloperApi
 private[ml] trait DecisionTreeParams extends PredictorParams {
 
   /**
@@ -265,12 +262,10 @@ private[ml] object TreeRegressorParams {
 }
 
 /**
- * :: DeveloperApi ::
  * Parameters for Decision Tree-based ensemble algorithms.
  *
  * Note: Marked as private and DeveloperApi since this may be made public in the future.
  */
-@DeveloperApi
 private[ml] trait TreeEnsembleParams extends DecisionTreeParams with HasSeed {
 
   /**
@@ -307,12 +302,10 @@ private[ml] trait TreeEnsembleParams extends DecisionTreeParams with HasSeed {
 }
 
 /**
- * :: DeveloperApi ::
  * Parameters for Random Forest algorithms.
  *
  * Note: Marked as private and DeveloperApi since this may be made public in the future.
  */
-@DeveloperApi
 private[ml] trait RandomForestParams extends TreeEnsembleParams {
 
   /**
@@ -377,12 +370,10 @@ private[ml] object RandomForestParams {
 }
 
 /**
- * :: DeveloperApi ::
  * Parameters for Gradient-Boosted Tree algorithms.
  *
  * Note: Marked as private and DeveloperApi since this may be made public in the future.
  */
-@DeveloperApi
 private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter {
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index e21ff94a20f54..2e5a629561180 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.tuning
 import com.github.fommil.netlib.F2jBLAS
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml._
 import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param._
@@ -79,10 +79,10 @@ private[ml] trait CrossValidatorParams extends Params {
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * K-fold cross validation.
  */
-@AlphaComponent
+@Experimental
 class CrossValidator(override val uid: String) extends Estimator[CrossValidatorModel]
   with CrossValidatorParams with Logging {
 
@@ -150,10 +150,10 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM
 }
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Model from k-fold cross validation.
  */
-@AlphaComponent
+@Experimental
 class CrossValidatorModel private[ml] (
     override val uid: String,
     val bestModel: Model[_])
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala
index dafe73d82c00a..98a8f0330ca45 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala
@@ -20,14 +20,14 @@ package org.apache.spark.ml.tuning
 import scala.annotation.varargs
 import scala.collection.mutable
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.param._
 
 /**
- * :: AlphaComponent ::
+ * :: Experimental ::
  * Builder for a param grid used in grid search-based model selection.
  */
-@AlphaComponent
+@Experimental
 class ParamGridBuilder {
 
   private val paramGrid = mutable.Map.empty[Param[_], Iterable[_]]

From 8f2082426828c15704426ebca1d015bf956c6841 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Tue, 26 May 2015 16:31:34 -0700
Subject: [PATCH 188/525] [SPARK-7864] [UI] Do not kill innocent stages from
 visualization

**Reproduction.** Run a long-running job, go to the job page, expand the DAG visualization, and click into a stage. Your stage is now killed. Why? This is because the visualization code just reaches into the stage table and grabs the first link it finds. In our case, this first link happens to be the kill link instead of the one to the stage page.

**Fix.** Use proper CSS selectors to avoid ambiguity.

This is an alternative to #6407. Thanks carsonwang for catching this.

Author: Andrew Or <andrew@databricks.com>

Closes #6419 from andrewor14/fix-ui-viz-kill and squashes the following commits:

25203bd [Andrew Or] Do not kill innocent stages
---
 .../main/resources/org/apache/spark/ui/static/spark-dag-viz.js  | 2 +-
 .../main/resources/org/apache/spark/ui/static/timeline-view.js  | 2 +-
 core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
index aaeba5b1027c9..e96af8768daa0 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
@@ -193,7 +193,7 @@ function renderDagVizForJob(svgContainer) {
       // Use the link from the stage table so it also works for the history server
       var attemptId = 0
       var stageLink = d3.select("#stage-" + stageId + "-" + attemptId)
-        .select("a")
+        .select("a.name-link")
         .attr("href") + "&expandDagViz=true";
       container = svgContainer
         .append("a")
diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
index 604c29994145a..28ac998e8d065 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
@@ -105,7 +105,7 @@ function drawJobTimeline(groupArray, eventObjArray, startTime) {
       };
 
       $(this).click(function() {
-        var stagePagePath = $(getSelectorForStageEntry(this)).find("a").attr("href")
+        var stagePagePath = $(getSelectorForStageEntry(this)).find("a.name-link").attr("href")
         window.location.href = stagePagePath
       });
 
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala
index 82ba561eefb16..99812db4912a3 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala
@@ -93,7 +93,7 @@ private[ui] class StageTableBase(
     }
 
     val nameLinkUri = s"$basePathUri/stages/stage?id=${s.stageId}&attempt=${s.attemptId}"
-    val nameLink = <a href={nameLinkUri}>{s.name}</a>
+    val nameLink = <a href={nameLinkUri} class="name-link">{s.name}</a>
 
     val cachedRddInfos = s.rddInfos.filter(_.numCachedPartitions > 0)
     val details = if (s.details.nonEmpty) {

From 0463428b6e8f364f0b1f39445a60cd85ae7c07bc Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Tue, 26 May 2015 18:08:57 -0700
Subject: [PATCH 189/525] [SPARK-7883] [DOCS] [MLLIB] Fixing broken
 trainImplicit Scala example in MLlib Collaborative Filtering documentation.

Fixing broken trainImplicit Scala example in MLlib Collaborative Filtering documentation to match one of the possible ALS.trainImplicit function signatures.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6422 from dusenberrymw/Fix_MLlib_Collab_Filtering_trainImplicit_Example and squashes the following commits:

36492f4 [Mike Dusenberry] Fixing broken trainImplicit example in MLlib Collaborative Filtering documentation to match one of the possible ALS.trainImplicit function signatures.
---
 docs/mllib-collaborative-filtering.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md
index 7b397e30b2d90..dfdf6216b270c 100644
--- a/docs/mllib-collaborative-filtering.md
+++ b/docs/mllib-collaborative-filtering.md
@@ -107,7 +107,8 @@ other signals), you can use the `trainImplicit` method to get better results.
 
 {% highlight scala %}
 val alpha = 0.01
-val model = ALS.trainImplicit(ratings, rank, numIterations, alpha)
+val lambda = 0.01
+val model = ALS.trainImplicit(ratings, rank, numIterations, lambda, alpha)
 {% endhighlight %}
 </div>
 

From 03668348e29eb52c1a7d57a1e0ed7fca6c323890 Mon Sep 17 00:00:00 2001
From: rowan <rowan.chattaway@googlemail.com>
Date: Tue, 26 May 2015 18:17:16 -0700
Subject: [PATCH 190/525] [SPARK-7637] [SQL] O(N) merge implementation for
 StructType merge

Contribution is my original work and I license the work to the project under the projects open source license.

Author: rowan <rowan.chattaway@googlemail.com>

Closes #6259 from rowan000/SPARK-7637 and squashes the following commits:

c479df4 [rowan] SPARK-7637: rename mapFields to fieldsMap as per comments on github.
8d2e419 [rowan] SPARK-7637: fix up whitespace changes
0e9d662 [rowan] SPARK-7637: O(N) merge implementatio for StructType merge
---
 .../apache/spark/sql/types/StructType.scala   | 12 ++-
 .../spark/sql/types/DataTypeSuite.scala       | 73 ++++++++++++++++++-
 2 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index 7e00a27dfe724..a4f30c825befb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -230,10 +230,10 @@ object StructType {
       case (StructType(leftFields), StructType(rightFields)) =>
         val newFields = ArrayBuffer.empty[StructField]
 
+        val rightMapped = fieldsMap(rightFields)
         leftFields.foreach {
           case leftField @ StructField(leftName, leftType, leftNullable, _) =>
-            rightFields
-              .find(_.name == leftName)
+            rightMapped.get(leftName)
               .map { case rightField @ StructField(_, rightType, rightNullable, _) =>
               leftField.copy(
                 dataType = merge(leftType, rightType),
@@ -243,8 +243,9 @@ object StructType {
               .foreach(newFields += _)
         }
 
+        val leftMapped = fieldsMap(leftFields)
         rightFields
-          .filterNot(f => leftFields.map(_.name).contains(f.name))
+          .filterNot(f => leftMapped.get(f.name).nonEmpty)
           .foreach(newFields += _)
 
         StructType(newFields)
@@ -264,4 +265,9 @@ object StructType {
       case _ =>
         throw new SparkException(s"Failed to merge incompatible data types $left and $right")
     }
+  
+  private[sql] def fieldsMap(fields: Array[StructField]): Map[String, StructField] = {
+    import scala.collection.breakOut
+    fields.map(s => (s.name, s))(breakOut)
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index d797510f36685..a73317c86916b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.types
 
+import org.apache.spark.SparkException
 import org.scalatest.FunSuite
 
 class DataTypeSuite extends FunSuite {
@@ -69,6 +70,76 @@ class DataTypeSuite extends FunSuite {
     }
   }
 
+  test("fieldsMap returns map of name to StructField") {
+    val struct = StructType(
+      StructField("a", LongType) :: 
+      StructField("b", FloatType) :: Nil)
+
+    val mapped = StructType.fieldsMap(struct.fields)
+
+    val expected = Map(
+      "a" -> StructField("a", LongType),
+      "b" -> StructField("b", FloatType))
+
+    assert(mapped === expected)
+  }
+
+  test("merge where right is empty") {
+    val left = StructType(
+      StructField("a", LongType) ::
+      StructField("b", FloatType) :: Nil)
+
+    val right = StructType(List())
+    val merged = left.merge(right)
+    
+    assert(merged === left)
+  }
+
+  test("merge where left is empty") {
+
+    val left = StructType(List())
+
+    val right = StructType(
+      StructField("a", LongType) ::
+      StructField("b", FloatType) :: Nil)
+
+    val merged = left.merge(right)
+
+    assert(right === merged)
+
+  }
+
+  test("merge where both are non-empty") {
+    val left = StructType(
+      StructField("a", LongType) ::
+      StructField("b", FloatType) :: Nil)
+
+    val right = StructType(
+      StructField("c", LongType) :: Nil)
+
+    val expected = StructType(
+      StructField("a", LongType) ::
+      StructField("b", FloatType) ::
+      StructField("c", LongType) :: Nil)
+
+    val merged = left.merge(right)
+
+    assert(merged === expected)
+  }
+
+  test("merge where right contains type conflict") {
+    val left = StructType(
+      StructField("a", LongType) ::
+      StructField("b", FloatType) :: Nil)
+
+    val right = StructType(
+      StructField("b", LongType) :: Nil)
+    
+    intercept[SparkException] {
+      left.merge(right)
+    }
+  }
+
   def checkDataTypeJsonRepr(dataType: DataType): Unit = {
     test(s"JSON - $dataType") {
       assert(DataType.fromJson(dataType.json) === dataType)
@@ -120,7 +191,7 @@ class DataTypeSuite extends FunSuite {
   checkDefaultSize(DecimalType(10, 5), 4096)
   checkDefaultSize(DecimalType.Unlimited, 4096)
   checkDefaultSize(DateType, 4)
-  checkDefaultSize(TimestampType,12)
+  checkDefaultSize(TimestampType, 12)
   checkDefaultSize(StringType, 4096)
   checkDefaultSize(BinaryType, 4096)
   checkDefaultSize(ArrayType(DoubleType, true), 800)

From 0c33c7b4a66e47f6246f1b7f2b96f2c33126ec63 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 26 May 2015 20:24:35 -0700
Subject: [PATCH 191/525] [SPARK-7858] [SQL] Use output schema, not relation
 schema, for data source input conversion

In `DataSourceStrategy.createPhysicalRDD`, we use the relation schema as the target schema for converting incoming rows into Catalyst rows.  However, we should be using the output schema instead, since our scan might return a subset of the relation's columns.

This patch incorporates #6414 by liancheng, which fixes an issue in `SimpleTestRelation` that prevented this bug from being caught by our old tests:

> In `SimpleTextRelation`, we specified `needsConversion` to `true`, indicating that values produced by this testing relation should be of Scala types, and need to be converted to Catalyst types when necessary. However, we also used `Cast` to convert strings to expected data types. And `Cast` always produces values of Catalyst types, thus no conversion is done at all. This PR makes `SimpleTextRelation` produce Scala values so that data conversion code paths can be properly tested.

Closes #5986.

Author: Josh Rosen <joshrosen@databricks.com>
Author: Cheng Lian <lian@databricks.com>
Author: Cheng Lian <liancheng@users.noreply.github.com>

Closes #6400 from JoshRosen/SPARK-7858 and squashes the following commits:

e71c866 [Josh Rosen] Re-fix bug so that the tests pass again
56b13e5 [Josh Rosen] Add regression test to hadoopFsRelationSuites
2169a0f [Josh Rosen] Remove use of SpecificMutableRow and BufferedIterator
6cd7366 [Josh Rosen] Fix SPARK-7858 by using output types for conversion.
5a00e66 [Josh Rosen] Add assertions in order to reproduce SPARK-7858
8ba195c [Cheng Lian] Merge 9968fba9979287aaa1f141ba18bfb9d4c116a3b3 into 61664732b25b35f94be35a42cde651cbfd0e02b7
9968fba [Cheng Lian] Tests the data type conversion code paths
---
 .../org/apache/spark/sql/SQLContext.scala     |  2 +-
 .../spark/sql/execution/ExistingRDD.scala     | 62 +++++++------------
 .../sql/sources/DataSourceStrategy.scala      |  2 +-
 .../sql/sources/SimpleTextRelation.scala      |  6 +-
 .../sql/sources/hadoopFsRelationSuites.scala  |  6 ++
 5 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 1ea596dddff02..3935f7b321b85 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -392,7 +392,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
     SparkPlan.currentContext.set(self)
     val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType]
     val attributeSeq = schema.toAttributes
-    val rowRDD = RDDConversions.productToRowRdd(rdd, schema)
+    val rowRDD = RDDConversions.productToRowRdd(rdd, schema.map(_.dataType))
     DataFrame(self, LogicalRDD(attributeSeq, rowRDD)(self))
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
index a500269f3cdcf..f931dc95ef575 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
@@ -21,9 +21,9 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
-import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow, SpecificMutableRow}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.DataType
 import org.apache.spark.sql.{Row, SQLContext}
 
 /**
@@ -31,26 +31,19 @@ import org.apache.spark.sql.{Row, SQLContext}
  */
 @DeveloperApi
 object RDDConversions {
-  def productToRowRdd[A <: Product](data: RDD[A], schema: StructType): RDD[Row] = {
+  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[Row] = {
     data.mapPartitions { iterator =>
-      if (iterator.isEmpty) {
-        Iterator.empty
-      } else {
-        val bufferedIterator = iterator.buffered
-        val mutableRow = new SpecificMutableRow(schema.fields.map(_.dataType))
-        val schemaFields = schema.fields.toArray
-        val converters = schemaFields.map {
-          f => CatalystTypeConverters.createToCatalystConverter(f.dataType)
-        }
-        bufferedIterator.map { r =>
-          var i = 0
-          while (i < mutableRow.length) {
-            mutableRow(i) = converters(i)(r.productElement(i))
-            i += 1
-          }
-
-          mutableRow
+      val numColumns = outputTypes.length
+      val mutableRow = new GenericMutableRow(numColumns)
+      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
+      iterator.map { r =>
+        var i = 0
+        while (i < numColumns) {
+          mutableRow(i) = converters(i)(r.productElement(i))
+          i += 1
         }
+
+        mutableRow
       }
     }
   }
@@ -58,26 +51,19 @@ object RDDConversions {
   /**
    * Convert the objects inside Row into the types Catalyst expected.
    */
-  def rowToRowRdd(data: RDD[Row], schema: StructType): RDD[Row] = {
+  def rowToRowRdd(data: RDD[Row], outputTypes: Seq[DataType]): RDD[Row] = {
     data.mapPartitions { iterator =>
-      if (iterator.isEmpty) {
-        Iterator.empty
-      } else {
-        val bufferedIterator = iterator.buffered
-        val mutableRow = new GenericMutableRow(bufferedIterator.head.toSeq.toArray)
-        val schemaFields = schema.fields.toArray
-        val converters = schemaFields.map {
-          f => CatalystTypeConverters.createToCatalystConverter(f.dataType)
-        }
-        bufferedIterator.map { r =>
-          var i = 0
-          while (i < mutableRow.length) {
-            mutableRow(i) = converters(i)(r(i))
-            i += 1
-          }
-
-          mutableRow
+      val numColumns = outputTypes.length
+      val mutableRow = new GenericMutableRow(numColumns)
+      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
+      iterator.map { r =>
+        var i = 0
+        while (i < numColumns) {
+          mutableRow(i) = converters(i)(r(i))
+          i += 1
         }
+
+        mutableRow
       }
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index dacd967cff856..c6a4dabbab05e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -309,7 +309,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       output: Seq[Attribute],
       rdd: RDD[Row]): SparkPlan = {
     val converted = if (relation.needConversion) {
-      execution.RDDConversions.rowToRowRdd(rdd, relation.schema)
+      execution.RDDConversions.rowToRowRdd(rdd, output.map(_.dataType))
     } else {
       rdd
     }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
index de907846b9180..0f959b3d0b86d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
@@ -27,6 +27,7 @@ import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputForma
 import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.sql.{Row, SQLContext}
@@ -108,7 +109,10 @@ class SimpleTextRelation(
 
     sparkContext.textFile(inputStatuses.map(_.getPath).mkString(",")).map { record =>
       Row(record.split(",").zip(fields).map { case (value, dataType) =>
-        Cast(Literal(value), dataType).eval()
+        // `Cast`ed values are always of Catalyst types (i.e. UTF8String instead of String, etc.)
+        val catalystValue = Cast(Literal(value), dataType).eval()
+        // Here we're converting Catalyst values to Scala values to test `needsConversion`
+        CatalystTypeConverters.convertToScala(catalystValue, dataType)
       }: _*)
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index 70328e1ef810d..7c02d563f8d9a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -76,6 +76,12 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       df.filter('a > 1 && 'p1 < 2).select('b, 'p1),
       for (i <- 2 to 3; _ <- Seq("foo", "bar")) yield Row(s"val_$i", 1))
 
+    // Project many copies of columns with different types (reproduction for SPARK-7858)
+    checkAnswer(
+      df.filter('a > 1 && 'p1 < 2).select('b, 'b, 'b, 'b, 'p1, 'p1, 'p1, 'p1),
+      for (i <- 2 to 3; _ <- Seq("foo", "bar"))
+        yield Row(s"val_$i", s"val_$i", s"val_$i", s"val_$i", 1, 1, 1, 1))
+
     // Self-join
     df.registerTempTable("t")
     withTempTable("t") {

From b463e6d618e69c535297e51f41eca4f91bd33cc8 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Tue, 26 May 2015 20:48:56 -0700
Subject: [PATCH 192/525] [SPARK-7868] [SQL] Ignores _temporary directories in
 HadoopFsRelation

So that potential partial/corrupted data files left by failed tasks/jobs won't affect normal data scan.

Author: Cheng Lian <lian@databricks.com>

Closes #6411 from liancheng/spark-7868 and squashes the following commits:

273ea36 [Cheng Lian] Ignores _temporary directories
---
 .../apache/spark/sql/sources/interfaces.scala | 20 ++++++++++++-------
 .../sql/sources/hadoopFsRelationSuites.scala  | 16 +++++++++++++++
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index aaabbadcd651b..c06026e042d9f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -31,7 +31,7 @@ import org.apache.spark.SerializableWritable
 import org.apache.spark.sql.{Row, _}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
-import org.apache.spark.sql.types.{StructField, StructType}
+import org.apache.spark.sql.types.StructType
 
 /**
  * ::DeveloperApi::
@@ -378,16 +378,22 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
     var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]
 
     def refresh(): Unit = {
+      // We don't filter files/directories whose name start with "_" or "." here, as specific data
+      // sources may take advantages over them (e.g. Parquet _metadata and _common_metadata files).
+      // But "_temporary" directories are explicitly ignored since failed tasks/jobs may leave
+      // partial/corrupted data files there.
       def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
-        val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
-        val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
-        files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+        if (status.getPath.getName.toLowerCase == "_temporary") {
+          Set.empty
+        } else {
+          val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
+          val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
+          files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+        }
       }
 
       leafFiles.clear()
 
-      // We don't filter files/directories like _temporary/_SUCCESS here, as specific data sources
-      // may take advantages over them (e.g. Parquet _metadata and _common_metadata files).
       val statuses = paths.flatMap { path =>
         val hdfsPath = new Path(path)
         val fs = hdfsPath.getFileSystem(hadoopConf)
@@ -395,7 +401,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
         Try(fs.getFileStatus(qualified)).toOption.toArray.flatMap(listLeafFilesAndDirs(fs, _))
       }
 
-      val (dirs, files) = statuses.partition(_.isDir)
+      val files = statuses.filterNot(_.isDir)
       leafFiles ++= files.map(f => f.getPath -> f).toMap
       leafDirToChildrenFiles ++= files.groupBy(_.getPath.getParent)
     }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index 7c02d563f8d9a..cf5ae88dc4bee 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -548,4 +548,20 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
       checkAnswer(table("t"), df.select('b, 'c, 'a).collect())
     }
   }
+
+  test("SPARK-7868: _temporary directories should be ignored") {
+    withTempPath { dir =>
+      val df = Seq("a", "b", "c").zipWithIndex.toDF()
+
+      df.write
+        .format("parquet")
+        .save(dir.getCanonicalPath)
+
+      df.write
+        .format("parquet")
+        .save(s"${dir.getCanonicalPath}/_temporary")
+
+      checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df.collect())
+    }
+  }
 }

From a9f1c0c57b9be586dbada09dab91dcfce31141d9 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Tue, 26 May 2015 23:51:32 -0700
Subject: [PATCH 193/525] [SPARK-7535] [.1] [MLLIB] minor changes to the
 pipeline API

1. removed `Params.validateParams(extra)`
2. added `Evaluate.evaluate(dataset, paramPairs*)`
3. updated `RegressionEvaluator` doc

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6392 from mengxr/SPARK-7535.1 and squashes the following commits:

5ff5af8 [Xiangrui Meng] add unit test for CV.validateParams
f1f8369 [Xiangrui Meng] update CV.validateParams() to test estimatorParamMaps
607445d [Xiangrui Meng] merge master
8716f5f [Xiangrui Meng] specify default metric name in RegressionEvaluator
e4e5631 [Xiangrui Meng] update RegressionEvaluator doc
801e864 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7535.1
fcbd3e2 [Xiangrui Meng] Merge branch 'master' into SPARK-7535.1
2192316 [Xiangrui Meng] remove validateParams(extra); add evaluate(dataset, extra*)
---
 .../scala/org/apache/spark/ml/Pipeline.scala  |  9 ++--
 .../ml/evaluation/RegressionEvaluator.scala   |  4 +-
 .../org/apache/spark/ml/param/params.scala    | 13 -----
 .../spark/ml/tuning/CrossValidator.scala      | 23 +++++---
 .../apache/spark/ml/param/ParamsSuite.scala   |  2 +-
 .../spark/ml/tuning/CrossValidatorSuite.scala | 52 ++++++++++++++++++-
 6 files changed, 71 insertions(+), 32 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index 9da3ff65c744e..11a4722722ea1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -97,12 +97,9 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
   /** @group getParam */
   def getStages: Array[PipelineStage] = $(stages).clone()
 
-  override def validateParams(paramMap: ParamMap): Unit = {
-    val map = extractParamMap(paramMap)
-    getStages.foreach {
-      case pStage: Params => pStage.validateParams(map)
-      case _ =>
-    }
+  override def validateParams(): Unit = {
+    super.validateParams()
+    $(stages).foreach(_.validateParams())
   }
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
index 1771177e1ea91..abb1b35bedea5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -36,8 +36,8 @@ final class RegressionEvaluator(override val uid: String)
   def this() = this(Identifiable.randomUID("regEval"))
 
   /**
-   * param for metric name in evaluation
-   * @group param supports mse, rmse, r2, mae as valid metric names.
+   * param for metric name in evaluation (supports `"rmse"` (default), `"mse"`, `"r2"`, and `"mae"`)
+   * @group param
    */
   val metricName: Param[String] = {
     val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae"))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 1afa59c994c2b..473488dce9b0d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -333,19 +333,6 @@ trait Params extends Identifiable with Serializable {
       .map(m => m.invoke(this).asInstanceOf[Param[_]])
   }
 
-  /**
-   * Validates parameter values stored internally plus the input parameter map.
-   * Raises an exception if any parameter is invalid.
-   *
-   * This only needs to check for interactions between parameters.
-   * Parameter value checks which do not depend on other parameters are handled by
-   * [[Param.validate()]].  This method does not handle input/output column parameters;
-   * those are checked during schema validation.
-   */
-  def validateParams(paramMap: ParamMap): Unit = {
-    copy(paramMap).validateParams()
-  }
-
   /**
    * Validates parameter values stored internally.
    * Raise an exception if any parameter value is invalid.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 2e5a629561180..6434b64aed15d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -102,12 +102,6 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM
   /** @group setParam */
   def setNumFolds(value: Int): this.type = set(numFolds, value)
 
-  override def validateParams(paramMap: ParamMap): Unit = {
-    getEstimatorParamMaps.foreach { eMap =>
-      getEstimator.validateParams(eMap ++ paramMap)
-    }
-  }
-
   override def fit(dataset: DataFrame): CrossValidatorModel = {
     val schema = dataset.schema
     transformSchema(schema, logging = true)
@@ -147,6 +141,14 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM
   override def transformSchema(schema: StructType): StructType = {
     $(estimator).transformSchema(schema)
   }
+
+  override def validateParams(): Unit = {
+    super.validateParams()
+    val est = $(estimator)
+    for (paramMap <- $(estimatorParamMaps)) {
+      est.copy(paramMap).validateParams()
+    }
+  }
 }
 
 /**
@@ -159,8 +161,8 @@ class CrossValidatorModel private[ml] (
     val bestModel: Model[_])
   extends Model[CrossValidatorModel] with CrossValidatorParams {
 
-  override def validateParams(paramMap: ParamMap): Unit = {
-    bestModel.validateParams(paramMap)
+  override def validateParams(): Unit = {
+    bestModel.validateParams()
   }
 
   override def transform(dataset: DataFrame): DataFrame = {
@@ -171,4 +173,9 @@ class CrossValidatorModel private[ml] (
   override def transformSchema(schema: StructType): StructType = {
     bestModel.transformSchema(schema)
   }
+
+  override def copy(extra: ParamMap): CrossValidatorModel = {
+    val copied = new CrossValidatorModel(uid, bestModel.copy(extra).asInstanceOf[Model[_]])
+    copyValues(copied, extra)
+  }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index d270ad7613af1..04f2af4727ea4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -135,7 +135,7 @@ class ParamsSuite extends FunSuite {
     intercept[IllegalArgumentException] {
       solver.validateParams()
     }
-    solver.validateParams(ParamMap(inputCol -> "input"))
+    solver.copy(ParamMap(inputCol -> "input")).validateParams()
     solver.setInputCol("input")
     assert(solver.isSet(inputCol))
     assert(solver.isDefined(inputCol))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 05313d440fbf6..65972ec79b9a5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -19,11 +19,15 @@ package org.apache.spark.ml.tuning
 
 import org.scalatest.FunSuite
 
+import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
+import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.param.shared.HasInputCol
 import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput
 import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{SQLContext, DataFrame}
+import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.types.StructType
 
 class CrossValidatorSuite extends FunSuite with MLlibTestSparkContext {
 
@@ -53,4 +57,48 @@ class CrossValidatorSuite extends FunSuite with MLlibTestSparkContext {
     assert(parent.getRegParam === 0.001)
     assert(parent.getMaxIter === 10)
   }
+
+  test("validateParams should check estimatorParamMaps") {
+    import CrossValidatorSuite._
+
+    val est = new MyEstimator("est")
+    val eval = new MyEvaluator
+    val paramMaps = new ParamGridBuilder()
+      .addGrid(est.inputCol, Array("input1", "input2"))
+      .build()
+
+    val cv = new CrossValidator()
+      .setEstimator(est)
+      .setEstimatorParamMaps(paramMaps)
+      .setEvaluator(eval)
+
+    cv.validateParams() // This should pass.
+
+    val invalidParamMaps = paramMaps :+ ParamMap(est.inputCol -> "")
+    cv.setEstimatorParamMaps(invalidParamMaps)
+    intercept[IllegalArgumentException] {
+      cv.validateParams()
+    }
+  }
+}
+
+object CrossValidatorSuite {
+
+  abstract class MyModel extends Model[MyModel]
+
+  class MyEstimator(override val uid: String) extends Estimator[MyModel] with HasInputCol {
+
+    override def validateParams(): Unit = require($(inputCol).nonEmpty)
+
+    override def fit(dataset: DataFrame): MyModel = ???
+
+    override def transformSchema(schema: StructType): StructType = ???
+  }
+
+  class MyEvaluator extends Evaluator {
+
+    override def evaluate(dataset: DataFrame): Double = ???
+
+    override val uid: String = "eval"
+  }
 }

From 6dd645870d34d97ac992032bfd6cf39f20a0c50f Mon Sep 17 00:00:00 2001
From: Cheolsoo Park <cheolsoop@netflix.com>
Date: Wed, 27 May 2015 00:18:42 -0700
Subject: [PATCH 194/525] [SPARK-7850][BUILD] Hive 0.12.0 profile in POM should
 be removed

I grep'ed hive-0.12.0 in the source code and removed all the profiles and doc references.

Author: Cheolsoo Park <cheolsoop@netflix.com>

Closes #6393 from piaozhexiu/SPARK-7850 and squashes the following commits:

fb429ce [Cheolsoo Park] Remove hive-0.13.1 profile
82bf09a [Cheolsoo Park] Remove hive 0.12.0 shim code
f3722da [Cheolsoo Park] Remove hive-0.12.0 profile and references from POM and build docs
---
 docs/building-spark.md                        |   6 +-
 pom.xml                                       |  16 -
 .../spark/sql/hive/thriftserver/Shim12.scala  | 278 ------------------
 sql/hive/pom.xml                              |  10 -
 .../org/apache/spark/sql/hive/Shim12.scala    | 265 -----------------
 5 files changed, 1 insertion(+), 574 deletions(-)
 delete mode 100644 sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala
 delete mode 100644 sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala

diff --git a/docs/building-spark.md b/docs/building-spark.md
index 4dbccb9e6e46c..3ca7f2746e678 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -118,14 +118,10 @@ mvn -Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0 -Dyarn.version=2.2.0 -DskipTests
 # Building With Hive and JDBC Support
 To enable Hive integration for Spark SQL along with its JDBC server and CLI,
 add the `-Phive` and `Phive-thriftserver` profiles to your existing build options.
-By default Spark will build with Hive 0.13.1 bindings. You can also build for
-Hive 0.12.0 using the `-Phive-0.12.0` profile.
+By default Spark will build with Hive 0.13.1 bindings.
 {% highlight bash %}
 # Apache Hadoop 2.4.X with Hive 13 support
 mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Phive -Phive-thriftserver -DskipTests clean package
-
-# Apache Hadoop 2.4.X with Hive 12 support
-mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Phive -Phive-0.12.0 -Phive-thriftserver -DskipTests clean package
 {% endhighlight %}
 
 # Building for Scala 2.11
diff --git a/pom.xml b/pom.xml
index c72d7cbf843ef..711edf9efad2b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1753,22 +1753,6 @@
         <module>sql/hive-thriftserver</module>
       </modules>
     </profile>
-    <profile>
-      <id>hive-0.12.0</id>
-      <properties>
-        <hive.version>0.12.0-protobuf-2.5</hive.version>
-        <hive.version.short>0.12.0</hive.version.short>
-        <derby.version>10.4.2.0</derby.version>
-      </properties>
-    </profile>
-    <profile>
-      <id>hive-0.13.1</id>
-      <properties>
-        <hive.version>0.13.1a</hive.version>
-        <hive.version.short>0.13.1</hive.version.short>
-        <derby.version>10.10.1.1</derby.version>
-      </properties>
-    </profile>
 
     <profile>
       <id>scala-2.10</id>
diff --git a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala
deleted file mode 100644
index b3a79ba1c7d6b..0000000000000
--- a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive.thriftserver
-
-import java.sql.{Date, Timestamp}
-import java.util.concurrent.Executors
-import java.util.{ArrayList => JArrayList, Map => JMap, UUID}
-
-import org.apache.commons.logging.Log
-import org.apache.hadoop.hive.conf.HiveConf
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.apache.hive.service.cli.thrift.TProtocolVersion
-import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager
-
-import scala.collection.JavaConversions._
-import scala.collection.mutable.{ArrayBuffer, Map => SMap}
-
-import org.apache.hadoop.hive.common.`type`.HiveDecimal
-import org.apache.hadoop.hive.metastore.api.FieldSchema
-import org.apache.hadoop.hive.shims.ShimLoader
-import org.apache.hadoop.security.UserGroupInformation
-import org.apache.hive.service.cli._
-import org.apache.hive.service.cli.operation.ExecuteStatementOperation
-import org.apache.hive.service.cli.session.{SessionManager, HiveSession}
-
-import org.apache.spark.Logging
-import org.apache.spark.sql.{DataFrame, SQLConf, Row => SparkRow}
-import org.apache.spark.sql.execution.SetCommand
-import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
-import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}
-import org.apache.spark.sql.types._
-
-/**
- * A compatibility layer for interacting with Hive version 0.12.0.
- */
-private[thriftserver] object HiveThriftServerShim {
-  val version = "0.12.0"
-
-  def setServerUserName(sparkServiceUGI: UserGroupInformation, sparkCliService:SparkSQLCLIService) = {
-    val serverUserName = ShimLoader.getHadoopShims.getShortUserName(sparkServiceUGI)
-    setSuperField(sparkCliService, "serverUserName", serverUserName)
-  }
-}
-
-private[hive] class SparkSQLDriver(val _context: HiveContext = SparkSQLEnv.hiveContext)
-  extends AbstractSparkSQLDriver(_context) {
-  override def getResults(res: JArrayList[String]): Boolean = {
-    if (hiveResponse == null) {
-      false
-    } else {
-      res.addAll(hiveResponse)
-      hiveResponse = null
-      true
-    }
-  }
-}
-
-private[hive] class SparkExecuteStatementOperation(
-    parentSession: HiveSession,
-    statement: String,
-    confOverlay: JMap[String, String])(
-    hiveContext: HiveContext,
-    sessionToActivePool: SMap[SessionHandle, String])
-  extends ExecuteStatementOperation(parentSession, statement, confOverlay) with Logging {
-
-  private var result: DataFrame = _
-  private var iter: Iterator[SparkRow] = _
-  private var dataTypes: Array[DataType] = _
-
-  def close(): Unit = {
-    // RDDs will be cleaned automatically upon garbage collection.
-    logDebug("CLOSING")
-  }
-
-  def getNextRowSet(order: FetchOrientation, maxRowsL: Long): RowSet = {
-    if (!iter.hasNext) {
-      new RowSet()
-    } else {
-      // maxRowsL here typically maps to java.sql.Statement.getFetchSize, which is an int
-      val maxRows = maxRowsL.toInt
-      var curRow = 0
-      var rowSet = new ArrayBuffer[Row](maxRows.min(1024))
-
-      while (curRow < maxRows && iter.hasNext) {
-        val sparkRow = iter.next()
-        val row = new Row()
-        var curCol = 0
-
-        while (curCol < sparkRow.length) {
-          if (sparkRow.isNullAt(curCol)) {
-            addNullColumnValue(sparkRow, row, curCol)
-          } else {
-            addNonNullColumnValue(sparkRow, row, curCol)
-          }
-          curCol += 1
-        }
-        rowSet += row
-        curRow += 1
-      }
-      new RowSet(rowSet, 0)
-    }
-  }
-
-  def addNonNullColumnValue(from: SparkRow, to: Row, ordinal: Int) {
-    dataTypes(ordinal) match {
-      case StringType =>
-        to.addString(from(ordinal).asInstanceOf[String])
-      case IntegerType =>
-        to.addColumnValue(ColumnValue.intValue(from.getInt(ordinal)))
-      case BooleanType =>
-        to.addColumnValue(ColumnValue.booleanValue(from.getBoolean(ordinal)))
-      case DoubleType =>
-        to.addColumnValue(ColumnValue.doubleValue(from.getDouble(ordinal)))
-      case FloatType =>
-        to.addColumnValue(ColumnValue.floatValue(from.getFloat(ordinal)))
-      case DecimalType() =>
-        val hiveDecimal = from.getDecimal(ordinal)
-        to.addColumnValue(ColumnValue.stringValue(new HiveDecimal(hiveDecimal)))
-      case LongType =>
-        to.addColumnValue(ColumnValue.longValue(from.getLong(ordinal)))
-      case ByteType =>
-        to.addColumnValue(ColumnValue.byteValue(from.getByte(ordinal)))
-      case ShortType =>
-        to.addColumnValue(ColumnValue.shortValue(from.getShort(ordinal)))
-      case DateType =>
-        to.addColumnValue(ColumnValue.dateValue(from(ordinal).asInstanceOf[Date]))
-      case TimestampType =>
-        to.addColumnValue(
-          ColumnValue.timestampValue(from.get(ordinal).asInstanceOf[Timestamp]))
-      case BinaryType | _: ArrayType | _: StructType | _: MapType =>
-        val hiveString = HiveContext.toHiveString((from.get(ordinal), dataTypes(ordinal)))
-        to.addColumnValue(ColumnValue.stringValue(hiveString))
-    }
-  }
-
-  def addNullColumnValue(from: SparkRow, to: Row, ordinal: Int) {
-    dataTypes(ordinal) match {
-      case StringType =>
-        to.addString(null)
-      case IntegerType =>
-        to.addColumnValue(ColumnValue.intValue(null))
-      case BooleanType =>
-        to.addColumnValue(ColumnValue.booleanValue(null))
-      case DoubleType =>
-        to.addColumnValue(ColumnValue.doubleValue(null))
-      case FloatType =>
-        to.addColumnValue(ColumnValue.floatValue(null))
-      case DecimalType() =>
-        to.addColumnValue(ColumnValue.stringValue(null: HiveDecimal))
-      case LongType =>
-        to.addColumnValue(ColumnValue.longValue(null))
-      case ByteType =>
-        to.addColumnValue(ColumnValue.byteValue(null))
-      case ShortType =>
-        to.addColumnValue(ColumnValue.shortValue(null))
-      case DateType =>
-        to.addColumnValue(ColumnValue.dateValue(null))
-      case TimestampType =>
-        to.addColumnValue(ColumnValue.timestampValue(null))
-      case BinaryType | _: ArrayType | _: StructType | _: MapType =>
-        to.addColumnValue(ColumnValue.stringValue(null: String))
-    }
-  }
-
-  def getResultSetSchema: TableSchema = {
-    logInfo(s"Result Schema: ${result.queryExecution.analyzed.output}")
-    if (result.queryExecution.analyzed.output.size == 0) {
-      new TableSchema(new FieldSchema("Result", "string", "") :: Nil)
-    } else {
-      val schema = result.queryExecution.analyzed.output.map { attr =>
-        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
-      }
-      new TableSchema(schema)
-    }
-  }
-
-  def run(): Unit = {
-    val statementId = UUID.randomUUID().toString
-    logInfo(s"Running query '$statement'")
-    setState(OperationState.RUNNING)
-    HiveThriftServer2.listener.onStatementStart(
-      statementId, parentSession.getSessionHandle.getSessionId.toString, statement, statementId)
-    hiveContext.sparkContext.setJobGroup(statementId, statement)
-    sessionToActivePool.get(parentSession.getSessionHandle).foreach { pool =>
-      hiveContext.sparkContext.setLocalProperty("spark.scheduler.pool", pool)
-    }
-    try {
-      result = hiveContext.sql(statement)
-      logDebug(result.queryExecution.toString())
-      result.queryExecution.logical match {
-        case SetCommand(Some((SQLConf.THRIFTSERVER_POOL, Some(value))), _) =>
-          sessionToActivePool(parentSession.getSessionHandle) = value
-          logInfo(s"Setting spark.scheduler.pool=$value for future statements in this session.")
-        case _ =>
-      }
-      HiveThriftServer2.listener.onStatementParsed(statementId, result.queryExecution.toString())
-      iter = {
-        val useIncrementalCollect =
-          hiveContext.getConf("spark.sql.thriftServer.incrementalCollect", "false").toBoolean
-        if (useIncrementalCollect) {
-          result.rdd.toLocalIterator
-        } else {
-          result.collect().iterator
-        }
-      }
-      dataTypes = result.queryExecution.analyzed.output.map(_.dataType).toArray
-      setHasResultSet(true)
-    } catch {
-      // Actually do need to catch Throwable as some failures don't inherit from Exception and
-      // HiveServer will silently swallow them.
-      case e: Throwable =>
-        setState(OperationState.ERROR)
-        HiveThriftServer2.listener.onStatementError(
-          statementId, e.getMessage, e.getStackTraceString)
-        logError("Error executing query:",e)
-        throw new HiveSQLException(e.toString)
-    }
-    setState(OperationState.FINISHED)
-    HiveThriftServer2.listener.onStatementFinish(statementId)
-  }
-}
-
-private[hive] class SparkSQLSessionManager(hiveContext: HiveContext)
-  extends SessionManager
-  with ReflectedCompositeService {
-
-  private lazy val sparkSqlOperationManager = new SparkSQLOperationManager(hiveContext)
-
-  override def init(hiveConf: HiveConf) {
-    setSuperField(this, "hiveConf", hiveConf)
-
-    val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS)
-    setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize))
-    getAncestorField[Log](this, 3, "LOG").info(
-      s"HiveServer2: Async execution pool size $backgroundPoolSize")
-
-    setSuperField(this, "operationManager", sparkSqlOperationManager)
-    addService(sparkSqlOperationManager)
-
-    initCompositeService(hiveConf)
-  }
-
-  override def openSession(
-      username: String,
-      passwd: String,
-      sessionConf: java.util.Map[String, String],
-      withImpersonation: Boolean,
-      delegationToken: String): SessionHandle = {
-    hiveContext.openSession()
-    val sessionHandle = super.openSession(
-      username, passwd, sessionConf, withImpersonation, delegationToken)
-    HiveThriftServer2.listener.onSessionCreated("UNKNOWN", sessionHandle.getSessionId.toString)
-    sessionHandle
-  }
-
-  override def closeSession(sessionHandle: SessionHandle) {
-    HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString)
-    super.closeSession(sessionHandle)
-    sparkSqlOperationManager.sessionToActivePool -= sessionHandle
-
-    hiveContext.detachSession()
-  }
-}
diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml
index e322340094e6f..615b07e74d535 100644
--- a/sql/hive/pom.xml
+++ b/sql/hive/pom.xml
@@ -136,16 +136,6 @@
         </plugins>
       </build>
     </profile>
-    <profile>
-      <id>hive-0.12.0</id>
-      <dependencies>
-         <dependency>
-           <groupId>com.twitter</groupId>
-           <artifactId>parquet-hive-bundle</artifactId>
-           <version>1.5.0</version>
-        </dependency>
-      </dependencies>
-    </profile>
   </profiles>
 
   <build>
diff --git a/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala b/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala
deleted file mode 100644
index 33e96eaabfbf6..0000000000000
--- a/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive
-
-import java.net.URI
-import java.util.{ArrayList => JArrayList, Properties}
-
-import scala.collection.JavaConversions._
-import scala.language.implicitConversions
-
-import org.apache.hadoop.{io => hadoopIo}
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.hive.common.`type`.HiveDecimal
-import org.apache.hadoop.hive.conf.HiveConf
-import org.apache.hadoop.hive.ql.Context
-import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table}
-import org.apache.hadoop.hive.ql.plan.{CreateTableDesc, FileSinkDesc, TableDesc}
-import org.apache.hadoop.hive.ql.processors._
-import org.apache.hadoop.hive.ql.stats.StatsSetupConst
-import org.apache.hadoop.hive.serde2.{ColumnProjectionUtils, Deserializer, io => hiveIo}
-import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, ObjectInspector, PrimitiveObjectInspector}
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.{HiveDecimalObjectInspector, PrimitiveObjectInspectorFactory}
-import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfo, TypeInfoFactory}
-import org.apache.hadoop.io.{NullWritable, Writable}
-import org.apache.hadoop.mapred.InputFormat
-
-import org.apache.spark.sql.types.{UTF8String, Decimal, DecimalType}
-
-private[hive] case class HiveFunctionWrapper(functionClassName: String)
-  extends java.io.Serializable {
-
-  // for Serialization
-  def this() = this(null)
-
-  import org.apache.spark.util.Utils._
-  def createFunction[UDFType <: AnyRef](): UDFType = {
-    getContextOrSparkClassLoader
-      .loadClass(functionClassName).newInstance.asInstanceOf[UDFType]
-  }
-}
-
-/**
- * A compatibility layer for interacting with Hive version 0.12.0.
- */
-private[hive] object HiveShim {
-  val version = "0.12.0"
-
-  def getTableDesc(
-    serdeClass: Class[_ <: Deserializer],
-    inputFormatClass: Class[_ <: InputFormat[_, _]],
-    outputFormatClass: Class[_],
-    properties: Properties) = {
-    new TableDesc(serdeClass, inputFormatClass, outputFormatClass, properties)
-  }
-
-  def getStringWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.STRING,
-      getStringWritable(value))
-
-  def getIntWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.INT,
-      getIntWritable(value))
-
-  def getDoubleWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.DOUBLE,
-      getDoubleWritable(value))
-
-  def getBooleanWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.BOOLEAN,
-      getBooleanWritable(value))
-
-  def getLongWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.LONG,
-      getLongWritable(value))
-
-  def getFloatWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.FLOAT,
-      getFloatWritable(value))
-
-  def getShortWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.SHORT,
-      getShortWritable(value))
-
-  def getByteWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.BYTE,
-      getByteWritable(value))
-
-  def getBinaryWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.BINARY,
-      getBinaryWritable(value))
-
-  def getDateWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.DATE,
-      getDateWritable(value))
-
-  def getTimestampWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.TIMESTAMP,
-      getTimestampWritable(value))
-
-  def getDecimalWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.DECIMAL,
-      getDecimalWritable(value))
-
-  def getPrimitiveNullWritableConstantObjectInspector: ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      PrimitiveCategory.VOID, null)
-
-  def getStringWritable(value: Any): hadoopIo.Text =
-    if (value == null) null else new hadoopIo.Text(value.asInstanceOf[UTF8String].toString)
-
-  def getIntWritable(value: Any): hadoopIo.IntWritable =
-    if (value == null) null else new hadoopIo.IntWritable(value.asInstanceOf[Int])
-
-  def getDoubleWritable(value: Any): hiveIo.DoubleWritable =
-    if (value == null) null else new hiveIo.DoubleWritable(value.asInstanceOf[Double])
-
-  def getBooleanWritable(value: Any): hadoopIo.BooleanWritable =
-    if (value == null) null else new hadoopIo.BooleanWritable(value.asInstanceOf[Boolean])
-
-  def getLongWritable(value: Any): hadoopIo.LongWritable =
-    if (value == null) null else new hadoopIo.LongWritable(value.asInstanceOf[Long])
-
-  def getFloatWritable(value: Any): hadoopIo.FloatWritable =
-    if (value == null) null else new hadoopIo.FloatWritable(value.asInstanceOf[Float])
-
-  def getShortWritable(value: Any): hiveIo.ShortWritable =
-    if (value == null) null else new hiveIo.ShortWritable(value.asInstanceOf[Short])
-
-  def getByteWritable(value: Any): hiveIo.ByteWritable =
-    if (value == null) null else new hiveIo.ByteWritable(value.asInstanceOf[Byte])
-
-  def getBinaryWritable(value: Any): hadoopIo.BytesWritable =
-    if (value == null) null else new hadoopIo.BytesWritable(value.asInstanceOf[Array[Byte]])
-
-  def getDateWritable(value: Any): hiveIo.DateWritable =
-    if (value == null) null else new hiveIo.DateWritable(value.asInstanceOf[Int])
-
-  def getTimestampWritable(value: Any): hiveIo.TimestampWritable =
-    if (value == null) {
-      null
-    } else {
-      new hiveIo.TimestampWritable(value.asInstanceOf[java.sql.Timestamp])
-    }
-
-  def getDecimalWritable(value: Any): hiveIo.HiveDecimalWritable =
-    if (value == null) {
-      null
-    } else {
-      new hiveIo.HiveDecimalWritable(
-        HiveShim.createDecimal(value.asInstanceOf[Decimal].toJavaBigDecimal))
-    }
-
-  def getPrimitiveNullWritable: NullWritable = NullWritable.get()
-
-  def createDriverResultsArray = new JArrayList[String]
-
-  def processResults(results: JArrayList[String]) = results
-
-  def getStatsSetupConstTotalSize = StatsSetupConst.TOTAL_SIZE
-
-  def getStatsSetupConstRawDataSize = StatsSetupConst.RAW_DATA_SIZE
-
-  def createDefaultDBIfNeeded(context: HiveContext) = {  }
-
-  def getCommandProcessor(cmd: Array[String], conf: HiveConf) = {
-    CommandProcessorFactory.get(cmd(0), conf)
-  }
-
-  def createDecimal(bd: java.math.BigDecimal): HiveDecimal = {
-    new HiveDecimal(bd)
-  }
-
-  def appendReadColumns(conf: Configuration, ids: Seq[Integer], names: Seq[String]) {
-    ColumnProjectionUtils.appendReadColumnIDs(conf, ids)
-    ColumnProjectionUtils.appendReadColumnNames(conf, names)
-  }
-
-  def getExternalTmpPath(context: Context, uri: URI) = {
-    context.getExternalTmpFileURI(uri)
-  }
-
-  def getDataLocationPath(p: Partition) = p.getPartitionPath
-
-  def getAllPartitionsOf(client: Hive, tbl: Table) = client.getAllPartitionsForPruner(tbl)
-
-  def compatibilityBlackList = Seq(
-    "decimal_.*",
-    "udf7",
-    "drop_partitions_filter2",
-    "show_.*",
-    "serde_regex",
-    "udf_to_date",
-    "udaf_collect_set",
-    "udf_concat"
-  )
-
-  def setLocation(tbl: Table, crtTbl: CreateTableDesc): Unit = {
-    tbl.setDataLocation(new Path(crtTbl.getLocation()).toUri())
-  }
-
-  def decimalMetastoreString(decimalType: DecimalType): String = "decimal"
-
-  def decimalTypeInfo(decimalType: DecimalType): TypeInfo =
-    TypeInfoFactory.decimalTypeInfo
-
-  def decimalTypeInfoToCatalyst(inspector: PrimitiveObjectInspector): DecimalType = {
-    DecimalType.Unlimited
-  }
-
-  def toCatalystDecimal(hdoi: HiveDecimalObjectInspector, data: Any): Decimal = {
-    if (hdoi.preferWritable()) {
-      Decimal(hdoi.getPrimitiveWritableObject(data).getHiveDecimal().bigDecimalValue)
-    } else {
-      Decimal(hdoi.getPrimitiveJavaObject(data).bigDecimalValue())
-    }
-  }
-
-  def getConvertedOI(
-      inputOI: ObjectInspector,
-      outputOI: ObjectInspector): ObjectInspector = {
-    ObjectInspectorConverters.getConvertedOI(inputOI, outputOI, true)
-  }
-
-  def prepareWritable(w: Writable): Writable = {
-    w
-  }
-
-  def setTblNullFormat(crtTbl: CreateTableDesc, tbl: Table) = {}
-}
-
-private[hive] class ShimFileSinkDesc(
-    var dir: String,
-    var tableInfo: TableDesc,
-    var compressed: Boolean)
-  extends FileSinkDesc(dir, tableInfo, compressed) {
-}

From 4f98d7a7f1715273bc91f1903bb7e0f287cc7394 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Wed, 27 May 2015 00:27:39 -0700
Subject: [PATCH 195/525] [SPARK-7697][SQL] Use LongType for unsigned int in
 JDBCRDD

JIRA: https://issues.apache.org/jira/browse/SPARK-7697

The reported problem case is mysql. But for h2 db, there is no unsigned int. So it is not able to add corresponding test.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6229 from viirya/unsignedint_as_long and squashes the following commits:

dc4b5d8 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into unsignedint_as_long
608695b [Liang-Chi Hsieh] Use LongType for unsigned int in JDBCRDD.
---
 .../scala/org/apache/spark/sql/jdbc/JDBCRDD.scala     | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index be03a237b6c4e..244bd3ebfeb7e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -46,7 +46,11 @@ private[sql] object JDBCRDD extends Logging {
    * @param sqlType - A field of java.sql.Types
    * @return The Catalyst type corresponding to sqlType.
    */
-  private def getCatalystType(sqlType: Int, precision: Int, scale: Int): DataType = {
+  private def getCatalystType(
+      sqlType: Int,
+      precision: Int,
+      scale: Int,
+      signed: Boolean): DataType = {
     val answer = sqlType match {
       case java.sql.Types.ARRAY         => null
       case java.sql.Types.BIGINT        => LongType
@@ -64,7 +68,7 @@ private[sql] object JDBCRDD extends Logging {
       case java.sql.Types.DISTINCT      => null
       case java.sql.Types.DOUBLE        => DoubleType
       case java.sql.Types.FLOAT         => FloatType
-      case java.sql.Types.INTEGER       => IntegerType
+      case java.sql.Types.INTEGER       => if (signed) { IntegerType } else { LongType }
       case java.sql.Types.JAVA_OBJECT   => null
       case java.sql.Types.LONGNVARCHAR  => StringType
       case java.sql.Types.LONGVARBINARY => BinaryType
@@ -123,11 +127,12 @@ private[sql] object JDBCRDD extends Logging {
           val typeName = rsmd.getColumnTypeName(i + 1)
           val fieldSize = rsmd.getPrecision(i + 1)
           val fieldScale = rsmd.getScale(i + 1)
+          val isSigned = rsmd.isSigned(i + 1)
           val nullable = rsmd.isNullable(i + 1) != ResultSetMetaData.columnNoNulls
           val metadata = new MetadataBuilder().putString("name", columnName)
           val columnType =
             dialect.getCatalystType(dataType, typeName, fieldSize, metadata).getOrElse(
-              getCatalystType(dataType, fieldSize, fieldScale))
+              getCatalystType(dataType, fieldSize, fieldScale, isSigned))
           fields(i) = StructField(columnName, columnType, nullable, metadata.build())
           i = i + 1
         }

From 9f48bf6b3761d66c7dc50f076ed92aff21b7eea0 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 27 May 2015 01:12:59 -0700
Subject: [PATCH 196/525] [SPARK-7887][SQL] Remove EvaluatedType from SQL
 Expression.

This type is not really used. Might as well remove it.

Author: Reynold Xin <rxin@databricks.com>

Closes #6427 from rxin/evalutedType and squashes the following commits:

51a319a [Reynold Xin] [SPARK-7887][SQL] Remove EvaluatedType from SQL Expression.
---
 .../spark/sql/catalyst/analysis/unresolved.scala | 10 +++++-----
 .../catalyst/expressions/BoundAttribute.scala    |  2 --
 .../spark/sql/catalyst/expressions/Cast.scala    |  2 --
 .../sql/catalyst/expressions/Expression.scala    |  8 ++------
 .../sql/catalyst/expressions/ExtractValue.scala  |  2 --
 .../sql/catalyst/expressions/ScalaUdf.scala      |  2 --
 .../sql/catalyst/expressions/SortOrder.scala     |  2 +-
 .../sql/catalyst/expressions/aggregates.scala    |  4 +---
 .../sql/catalyst/expressions/arithmetic.scala    | 16 ++++------------
 .../sql/catalyst/expressions/complexTypes.scala  |  6 ++----
 .../catalyst/expressions/decimalFunctions.scala  |  2 --
 .../sql/catalyst/expressions/generators.scala    |  2 --
 .../sql/catalyst/expressions/literals.scala      |  2 --
 .../catalyst/expressions/mathfuncs/binary.scala  |  3 ++-
 .../catalyst/expressions/mathfuncs/unary.scala   |  1 -
 .../catalyst/expressions/namedExpressions.scala  |  6 ++----
 .../sql/catalyst/expressions/nullFunctions.scala |  1 -
 .../sql/catalyst/expressions/predicates.scala    |  6 ------
 .../spark/sql/catalyst/expressions/random.scala  |  2 --
 .../spark/sql/catalyst/expressions/sets.scala    |  4 ----
 .../catalyst/expressions/stringOperations.scala  |  8 --------
 .../catalyst/expressions/windowExpressions.scala | 12 +++++-------
 .../catalyst/plans/physical/partitioning.scala   |  4 ++--
 .../spark/sql/catalyst/trees/TreeNodeSuite.scala |  9 ++++-----
 .../expressions/MonotonicallyIncreasingID.scala  |  2 --
 .../execution/expressions/SparkPartitionID.scala |  2 --
 .../apache/spark/sql/execution/pythonUdfs.scala  |  2 +-
 .../org/apache/spark/sql/hive/hiveUdfs.scala     |  5 +----
 28 files changed, 32 insertions(+), 95 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index 2999c2ef3efe1..bbb150c1e83c7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -67,7 +67,7 @@ case class UnresolvedAttribute(nameParts: Seq[String])
   override def withName(newName: String): UnresolvedAttribute = UnresolvedAttribute.quoted(newName)
 
   // Unresolved attributes are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"'$name"
@@ -85,7 +85,7 @@ case class UnresolvedFunction(name: String, children: Seq[Expression]) extends E
   override lazy val resolved = false
 
   // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"'$name(${children.mkString(",")})"
@@ -107,7 +107,7 @@ trait Star extends NamedExpression with trees.LeafNode[Expression] {
   override lazy val resolved = false
 
   // Star gets expanded at runtime so we never evaluate a Star.
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression]
@@ -166,7 +166,7 @@ case class MultiAlias(child: Expression, names: Seq[String])
 
   override lazy val resolved = false
 
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$child AS $names"
@@ -200,7 +200,7 @@ case class UnresolvedExtractValue(child: Expression, extraction: Expression)
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
 
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$child[$extraction]"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index c6217f07c452d..1ffc95c676f6f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -30,8 +30,6 @@ import org.apache.spark.sql.catalyst.trees
 case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
   extends NamedExpression with trees.LeafNode[Expression] {
 
-  type EvaluatedType = Any
-
   override def toString: String = s"input[$ordinal]"
 
   override def eval(input: Row): Any = input(ordinal)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index d8cf2b2e32435..df3cdf2cdf992 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -105,8 +105,6 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
   override def toString: String = s"CAST($child, $dataType)"
 
-  type EvaluatedType = Any
-
   // [[func]] assumes the input is no longer null because eval already does the null check.
   @inline private[this] def buildCast[T](a: Any, func: T => Any): Any = func(a.asInstanceOf[T])
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index c7ae9da7fce49..d19928784442e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -25,9 +25,6 @@ import org.apache.spark.sql.types._
 abstract class Expression extends TreeNode[Expression] {
   self: Product =>
 
-  /** The narrowest possible type that is produced when this expression is evaluated. */
-  type EvaluatedType <: Any
-
   /**
    * Returns true when an expression is a candidate for static evaluation before the query is
    * executed.
@@ -44,7 +41,7 @@ abstract class Expression extends TreeNode[Expression] {
   def references: AttributeSet = AttributeSet(children.flatMap(_.references.iterator))
 
   /** Returns the result of evaluating this expression on a given input Row */
-  def eval(input: Row = null): EvaluatedType
+  def eval(input: Row = null): Any
 
   /**
    * Returns `true` if this expression and all its children have been resolved to a specific schema
@@ -117,8 +114,7 @@ abstract class UnaryExpression extends Expression with trees.UnaryNode[Expressio
 // not like a real expressions.
 case class GroupExpression(children: Seq[Expression]) extends Expression {
   self: Product =>
-  type EvaluatedType = Seq[Any]
-  override def eval(input: Row): EvaluatedType = throw new UnsupportedOperationException
+  override def eval(input: Row): Any = throw new UnsupportedOperationException
   override def nullable: Boolean = false
   override def foldable: Boolean = false
   override def dataType: DataType = throw new UnsupportedOperationException
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
index e05926cbfe74b..b5f4e16745c1b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
@@ -92,8 +92,6 @@ object ExtractValue {
 
 trait ExtractValue extends UnaryExpression {
   self: Product =>
-
-  type EvaluatedType = Any
 }
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
index fe2873e0be34d..5b45347872cca 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
@@ -27,8 +27,6 @@ import org.apache.spark.sql.types.DataType
 case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expression])
   extends Expression {
 
-  type EvaluatedType = Any
-
   override def nullable: Boolean = true
 
   override def toString: String = s"scalaUDF(${children.mkString(",")})"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
index 83074eb1e6310..195eec8e5cdc4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
@@ -36,7 +36,7 @@ case class SortOrder(child: Expression, direction: SortDirection) extends Expres
   override def nullable: Boolean = child.nullable
 
   // SortOrder itself is never evaluated.
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$child ${if (direction == Ascending) "ASC" else "DESC"}"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index f3830c6d3bcf2..72eff5fe961f0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -37,7 +37,7 @@ abstract class AggregateExpression extends Expression {
    * [[AggregateExpression.eval]] should never be invoked because [[AggregateExpression]]'s are
    * replaced with a physical aggregate operator at runtime.
    */
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
@@ -74,8 +74,6 @@ abstract class AggregateFunction
   extends AggregateExpression with Serializable with trees.LeafNode[Expression] {
   self: Product =>
 
-  override type EvaluatedType = Any
-
   /** Base should return the generic aggregate expression that this function is computing */
   val base: AggregateExpression
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index c7a37ad966df6..34c833b260dc0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.types._
 
 case class UnaryMinus(child: Expression) extends UnaryExpression {
-  type EvaluatedType = Any
 
   override def dataType: DataType = child.dataType
   override def foldable: Boolean = child.foldable
@@ -45,7 +44,6 @@ case class UnaryMinus(child: Expression) extends UnaryExpression {
 }
 
 case class Sqrt(child: Expression) extends UnaryExpression {
-  type EvaluatedType = Any
 
   override def dataType: DataType = DoubleType
   override def foldable: Boolean = child.foldable
@@ -72,8 +70,6 @@ case class Sqrt(child: Expression) extends UnaryExpression {
 abstract class BinaryArithmetic extends BinaryExpression {
   self: Product =>
 
-  type EvaluatedType = Any
-
   override lazy val resolved =
     left.resolved && right.resolved &&
     left.dataType == right.dataType &&
@@ -101,7 +97,7 @@ abstract class BinaryArithmetic extends BinaryExpression {
     }
   }
 
-  def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any =
+  def evalInternal(evalE1: Any, evalE2: Any): Any =
     sys.error(s"BinaryExpressions must either override eval or evalInternal")
 }
 
@@ -244,7 +240,7 @@ case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithme
     case other => sys.error(s"Unsupported bitwise & operation on $other")
   }
 
-  override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = and(evalE1, evalE2)
+  override def evalInternal(evalE1: Any, evalE2: Any): Any = and(evalE1, evalE2)
 }
 
 /**
@@ -265,7 +261,7 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
     case other => sys.error(s"Unsupported bitwise | operation on $other")
   }
 
-  override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = or(evalE1, evalE2)
+  override def evalInternal(evalE1: Any, evalE2: Any): Any = or(evalE1, evalE2)
 }
 
 /**
@@ -286,14 +282,13 @@ case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithme
     case other => sys.error(s"Unsupported bitwise ^ operation on $other")
   }
 
-  override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = xor(evalE1, evalE2)
+  override def evalInternal(evalE1: Any, evalE2: Any): Any = xor(evalE1, evalE2)
 }
 
 /**
  * A function that calculates bitwise not(~) of a number.
  */
 case class BitwiseNot(child: Expression) extends UnaryExpression {
-  type EvaluatedType = Any
 
   override def dataType: DataType = child.dataType
   override def foldable: Boolean = child.foldable
@@ -323,7 +318,6 @@ case class BitwiseNot(child: Expression) extends UnaryExpression {
 }
 
 case class MaxOf(left: Expression, right: Expression) extends Expression {
-  type EvaluatedType = Any
 
   override def foldable: Boolean = left.foldable && right.foldable
 
@@ -368,7 +362,6 @@ case class MaxOf(left: Expression, right: Expression) extends Expression {
 }
 
 case class MinOf(left: Expression, right: Expression) extends Expression {
-  type EvaluatedType = Any
 
   override def foldable: Boolean = left.foldable && right.foldable
 
@@ -416,7 +409,6 @@ case class MinOf(left: Expression, right: Expression) extends Expression {
  * A function that get the absolute value of the numeric value.
  */
 case class Abs(child: Expression) extends UnaryExpression  {
-  type EvaluatedType = Any
 
   override def dataType: DataType = child.dataType
   override def foldable: Boolean = child.foldable
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
index 956a2429b0b61..e7cd7131a9e56 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
@@ -24,8 +24,7 @@ import org.apache.spark.sql.types._
  * Returns an Array containing the evaluation of all children expressions.
  */
 case class CreateArray(children: Seq[Expression]) extends Expression {
-  override type EvaluatedType = Any
-  
+
   override def foldable: Boolean = children.forall(_.foldable)
   
   lazy val childTypes = children.map(_.dataType).distinct
@@ -54,7 +53,6 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
  * TODO: [[CreateStruct]] does not support codegen.
  */
 case class CreateStruct(children: Seq[NamedExpression]) extends Expression {
-  override type EvaluatedType = Row
 
   override def foldable: Boolean = children.forall(_.foldable)
 
@@ -71,7 +69,7 @@ case class CreateStruct(children: Seq[NamedExpression]) extends Expression {
 
   override def nullable: Boolean = false
 
-  override def eval(input: Row): EvaluatedType = {
+  override def eval(input: Row): Any = {
     Row(children.map(_.eval(input)): _*)
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
index adb94df7d1c7b..65ba18924afe1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
@@ -21,7 +21,6 @@ import org.apache.spark.sql.types._
 
 /** Return the unscaled Long value of a Decimal, assuming it fits in a Long */
 case class UnscaledValue(child: Expression) extends UnaryExpression {
-  override type EvaluatedType = Any
 
   override def dataType: DataType = LongType
   override def foldable: Boolean = child.foldable
@@ -40,7 +39,6 @@ case class UnscaledValue(child: Expression) extends UnaryExpression {
 
 /** Create a Decimal from an unscaled Long value */
 case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends UnaryExpression {
-  override type EvaluatedType = Decimal
 
   override def dataType: DataType = DecimalType(precision, scale)
   override def foldable: Boolean = child.foldable
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index 747a47bdde953..cab40feb72d47 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -40,8 +40,6 @@ import org.apache.spark.sql.types._
 abstract class Generator extends Expression {
   self: Product =>
 
-  override type EvaluatedType = TraversableOnce[Row]
-
   // TODO ideally we should return the type of ArrayType(StructType),
   // however, we don't keep the output field names in the Generator.
   override def dataType: DataType = throw new UnsupportedOperationException
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 5f8c7354aede1..d3ca3d9a4b18b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -78,14 +78,12 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
 
   override def toString: String = if (value != null) value.toString else "null"
 
-  type EvaluatedType = Any
   override def eval(input: Row): Any = value
 }
 
 // TODO: Specialize
 case class MutableLiteral(var value: Any, dataType: DataType, nullable: Boolean = true)
     extends LeafExpression {
-  type EvaluatedType = Any
 
   def update(expression: Expression, input: Row): Unit = {
     value = expression.eval(input)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
index fcc06d3aa1036..d5be44626f8e1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
@@ -29,7 +29,7 @@ import org.apache.spark.sql.types._
  */
 abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String) 
   extends BinaryExpression with Serializable with ExpectsInputTypes { self: Product =>
-  type EvaluatedType = Any
+
   override def symbol: String = null
   override def expectedChildTypes: Seq[DataType] = Seq(DoubleType, DoubleType)
 
@@ -68,6 +68,7 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
 case class Atan2(
     left: Expression,
     right: Expression) extends BinaryMathExpression(math.atan2, "ATAN2") {
+
   override def eval(input: Row): Any = {
     val evalE1 = left.eval(input)
     if (evalE1 == null) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
index dc68469e060cb..cdcb8e2840baf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
@@ -28,7 +28,6 @@ import org.apache.spark.sql.types._
 abstract class MathematicalExpression(f: Double => Double, name: String)
   extends UnaryExpression with Serializable with ExpectsInputTypes {
   self: Product =>
-  type EvaluatedType = Any
 
   override def expectedChildTypes: Seq[DataType] = Seq(DoubleType)
   override def dataType: DataType = DoubleType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 50be26d0b08b5..00565ec651a59 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -111,7 +111,6 @@ case class Alias(child: Expression, name: String)(
     val explicitMetadata: Option[Metadata] = None)
   extends NamedExpression with trees.UnaryNode[Expression] {
 
-  override type EvaluatedType = Any
   // Alias(Generator, xx) need to be transformed into Generate(generator, ...)
   override lazy val resolved = childrenResolved && !child.isInstanceOf[Generator]
 
@@ -229,7 +228,7 @@ case class AttributeReference(
   }
 
   // Unresolved attributes are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$name#${exprId.id}$typeSuffix"
@@ -240,7 +239,6 @@ case class AttributeReference(
  * expression id or the unresolved indicator.
  */
 case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[Expression] {
-  type EvaluatedType = Any
 
   override def toString: String = name
 
@@ -252,7 +250,7 @@ case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[E
   override def withName(newName: String): Attribute = throw new UnsupportedOperationException
   override def qualifiers: Seq[String] = throw new UnsupportedOperationException
   override def exprId: ExprId = throw new UnsupportedOperationException
-  override def eval(input: Row): EvaluatedType = throw new UnsupportedOperationException
+  override def eval(input: Row): Any = throw new UnsupportedOperationException
   override def nullable: Boolean = throw new UnsupportedOperationException
   override def dataType: DataType = NullType
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index f9161cf34f0c9..5070570b4740d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.types.DataType
 
 case class Coalesce(children: Seq[Expression]) extends Expression {
-  type EvaluatedType = Any
 
   /** Coalesce is nullable if all of its children are nullable, or if it has no children. */
   override def nullable: Boolean = !children.exists(!_.nullable)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 1d72a9eb834b9..e2d1c8115e051 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -35,8 +35,6 @@ trait Predicate extends Expression {
   self: Product =>
 
   override def dataType: DataType = BooleanType
-
-  type EvaluatedType = Any
 }
 
 trait PredicateHelper {
@@ -341,8 +339,6 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
     trueValue.dataType
   }
 
-  type EvaluatedType = Any
-
   override def eval(input: Row): Any = {
     if (true == predicate.eval(input)) {
       trueValue.eval(input)
@@ -357,8 +353,6 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
 trait CaseWhenLike extends Expression {
   self: Product =>
 
-  type EvaluatedType = Any
-
   // Note that `branches` are considered in consecutive pairs (cond, val), and the optional last
   // element is the value for the default catch-all case (if provided).
   // Hence, `branches` consists of at least two elements, and can have an odd or even length.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 66d7c8b07cce8..de82c15680607 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -38,8 +38,6 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
    */
   @transient protected lazy val rng = new XORShiftRandom(seed + TaskContext.get().partitionId())
 
-  override type EvaluatedType = Double
-
   override def nullable: Boolean = false
 
   override def dataType: DataType = DoubleType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
index 4c44182278207..b65bf165f21db 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
@@ -51,7 +51,6 @@ private[sql] class OpenHashSetUDT(
  * Creates a new set of the specified type
  */
 case class NewSet(elementType: DataType) extends LeafExpression {
-  type EvaluatedType = Any
 
   override def nullable: Boolean = false
 
@@ -69,7 +68,6 @@ case class NewSet(elementType: DataType) extends LeafExpression {
  * For performance, this expression mutates its input during evaluation.
  */
 case class AddItemToSet(item: Expression, set: Expression) extends Expression {
-  type EvaluatedType = Any
 
   override def children: Seq[Expression] = item :: set :: Nil
 
@@ -101,7 +99,6 @@ case class AddItemToSet(item: Expression, set: Expression) extends Expression {
  * For performance, this expression mutates its left input set during evaluation.
  */
 case class CombineSets(left: Expression, right: Expression) extends BinaryExpression {
-  type EvaluatedType = Any
 
   override def nullable: Boolean = left.nullable || right.nullable
 
@@ -133,7 +130,6 @@ case class CombineSets(left: Expression, right: Expression) extends BinaryExpres
  * Returns the number of elements in the input set.
  */
 case class CountSet(child: Expression) extends UnaryExpression {
-  type EvaluatedType = Any
 
   override def nullable: Boolean = child.nullable
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 7683e0990ce80..5da93fe9c6cf9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -25,8 +25,6 @@ import org.apache.spark.sql.types._
 trait StringRegexExpression extends ExpectsInputTypes {
   self: BinaryExpression =>
 
-  type EvaluatedType = Any
-
   def escape(v: String): String
   def matches(regex: Pattern, str: String): Boolean
 
@@ -114,8 +112,6 @@ case class RLike(left: Expression, right: Expression)
 trait CaseConversionExpression extends ExpectsInputTypes {
   self: UnaryExpression =>
 
-  type EvaluatedType = Any
-
   def convert(v: UTF8String): UTF8String
 
   override def foldable: Boolean = child.foldable
@@ -159,8 +155,6 @@ trait StringComparison extends ExpectsInputTypes {
 
   def compare(l: UTF8String, r: UTF8String): Boolean
 
-  override type EvaluatedType = Any
-
   override def nullable: Boolean = left.nullable || right.nullable
 
   override def expectedChildTypes: Seq[DataType] = Seq(StringType, StringType)
@@ -211,8 +205,6 @@ case class EndsWith(left: Expression, right: Expression)
  */
 case class Substring(str: Expression, pos: Expression, len: Expression)
   extends Expression with ExpectsInputTypes {
-  
-  type EvaluatedType = Any
 
   override def foldable: Boolean = str.foldable && pos.foldable && len.foldable
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index 099d67ca7fee3..2729b34a0833f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -66,8 +66,6 @@ case class WindowSpecDefinition(
     }
   }
 
-  type EvaluatedType = Any
-
   override def children: Seq[Expression]  = partitionSpec ++ orderSpec
 
   override lazy val resolved: Boolean =
@@ -76,7 +74,7 @@ case class WindowSpecDefinition(
 
   override def toString: String = simpleString
 
-  override def eval(input: Row): EvaluatedType = throw new UnsupportedOperationException
+  override def eval(input: Row): Any = throw new UnsupportedOperationException
   override def nullable: Boolean = true
   override def foldable: Boolean = false
   override def dataType: DataType = throw new UnsupportedOperationException
@@ -299,7 +297,7 @@ case class UnresolvedWindowFunction(
   override def get(index: Int): Any =
     throw new UnresolvedException(this, "get")
   // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"'$name(${children.mkString(",")})"
@@ -311,25 +309,25 @@ case class UnresolvedWindowFunction(
 case class UnresolvedWindowExpression(
     child: UnresolvedWindowFunction,
     windowSpec: WindowSpecReference) extends UnaryExpression {
+
   override def dataType: DataType = throw new UnresolvedException(this, "dataType")
   override def foldable: Boolean = throw new UnresolvedException(this, "foldable")
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
 
   // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
 case class WindowExpression(
     windowFunction: WindowFunction,
     windowSpec: WindowSpecDefinition) extends Expression {
-  override type EvaluatedType = Any
 
   override def children: Seq[Expression] =
     windowFunction :: windowSpec :: Nil
 
-  override def eval(input: Row): EvaluatedType =
+  override def eval(input: Row): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def dataType: DataType = windowFunction.dataType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
index fb4217a44807b..80ba57a082a60 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -169,7 +169,7 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
 
   override def keyExpressions: Seq[Expression] = expressions
 
-  override def eval(input: Row = null): EvaluatedType =
+  override def eval(input: Row = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
@@ -213,6 +213,6 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
 
   override def keyExpressions: Seq[Expression] = ordering.map(_.child)
 
-  override def eval(input: Row): EvaluatedType =
+  override def eval(input: Row): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
index 3d10dab5ba34c..e5f77dcd962a4 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
@@ -25,12 +25,11 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{IntegerType, StringType, NullType}
 
 case class Dummy(optKey: Option[Expression]) extends Expression {
-  def children: Seq[Expression] = optKey.toSeq
-  def nullable: Boolean = true
-  def dataType: NullType = NullType
+  override def children: Seq[Expression] = optKey.toSeq
+  override def nullable: Boolean = true
+  override def dataType: NullType = NullType
   override lazy val resolved = true
-  type EvaluatedType = Any
-  def eval(input: Row): Any = null.asInstanceOf[Any]
+  override def eval(input: Row): Any = null.asInstanceOf[Any]
 }
 
 class TreeNodeSuite extends FunSuite {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index 9ac732b55b188..e228a60c9029f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -39,8 +39,6 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
    */
   @transient private[this] var count: Long = 0L
 
-  override type EvaluatedType = Long
-
   override def nullable: Boolean = false
 
   override def dataType: DataType = LongType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index c2c6cbd491598..1272793f88cd0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -27,8 +27,6 @@ import org.apache.spark.sql.types.{IntegerType, DataType}
  */
 private[sql] case object SparkPartitionID extends LeafExpression {
 
-  override type EvaluatedType = Int
-
   override def nullable: Boolean = false
 
   override def dataType: DataType = IntegerType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
index 11b2897f76786..55f3ff4709013 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
@@ -56,7 +56,7 @@ private[spark] case class PythonUDF(
 
   def nullable: Boolean = true
 
-  override def eval(input: Row): PythonUDF.this.EvaluatedType = {
+  override def eval(input: Row): Any = {
     sys.error("PythonUDFs can not be directly evaluated.")
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index bc6b3a2d58c38..7ec4f7332502e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -75,7 +75,7 @@ private[hive] abstract class HiveFunctionRegistry
 
 private[hive] case class HiveSimpleUdf(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
   extends Expression with HiveInspectors with Logging {
-  type EvaluatedType = Any
+
   type UDFType = UDF
 
   override def nullable: Boolean = true
@@ -139,7 +139,6 @@ private[hive] class DeferredObjectAdapter(oi: ObjectInspector)
 private[hive] case class HiveGenericUdf(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
   extends Expression with HiveInspectors with Logging {
   type UDFType = GenericUDF
-  type EvaluatedType = Any
 
   override def nullable: Boolean = true
 
@@ -336,8 +335,6 @@ private[hive] case class HiveWindowFunction(
 
   def nullable: Boolean = true
 
-  override type EvaluatedType = Any
-
   override def eval(input: Row): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 

From 3e7d7d6b3d6e07b52b1a138f7aa2ef628597fe05 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 27 May 2015 01:13:57 -0700
Subject: [PATCH 197/525] [SQL] Rename MathematicalExpression
 UnaryMathExpression, and specify BinaryMathExpression's output data type as
 DoubleType.

Two minor changes.

cc brkyvz

Author: Reynold Xin <rxin@databricks.com>

Closes #6428 from rxin/math-func-cleanup and squashes the following commits:

5910df5 [Reynold Xin] [SQL] Rename MathematicalExpression UnaryMathExpression, and specify BinaryMathExpression's output data type as DoubleType.
---
 .../expressions/mathfuncs/binary.scala        |  9 +---
 .../expressions/mathfuncs/unary.scala         | 46 +++++++++----------
 2 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
index d5be44626f8e1..890efc9f52ca3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions.mathfuncs
 
-import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, BinaryExpression, Expression, Row}
 import org.apache.spark.sql.types._
 
@@ -41,13 +40,7 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
       left.dataType == right.dataType &&
       !DecimalType.isFixed(left.dataType)
 
-  override def dataType: DataType = {
-    if (!resolved) {
-      throw new UnresolvedException(this,
-        s"datatype. Can not resolve due to differing types ${left.dataType}, ${right.dataType}")
-    }
-    left.dataType
-  }
+  override def dataType: DataType = DoubleType
 
   override def eval(input: Row): Any = {
     val evalE1 = left.eval(input)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
index cdcb8e2840baf..41b422346a02d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.types._
  * input format, therefore these functions extend `ExpectsInputTypes`.
  * @param name The short name of the function
  */
-abstract class MathematicalExpression(f: Double => Double, name: String)
+abstract class UnaryMathExpression(f: Double => Double, name: String)
   extends UnaryExpression with Serializable with ExpectsInputTypes {
   self: Product =>
 
@@ -46,46 +46,44 @@ abstract class MathematicalExpression(f: Double => Double, name: String)
   }
 }
 
-case class Acos(child: Expression) extends MathematicalExpression(math.acos, "ACOS")
+case class Acos(child: Expression) extends UnaryMathExpression(math.acos, "ACOS")
 
-case class Asin(child: Expression) extends MathematicalExpression(math.asin, "ASIN")
+case class Asin(child: Expression) extends UnaryMathExpression(math.asin, "ASIN")
 
-case class Atan(child: Expression) extends MathematicalExpression(math.atan, "ATAN")
+case class Atan(child: Expression) extends UnaryMathExpression(math.atan, "ATAN")
 
-case class Cbrt(child: Expression) extends MathematicalExpression(math.cbrt, "CBRT")
+case class Cbrt(child: Expression) extends UnaryMathExpression(math.cbrt, "CBRT")
 
-case class Ceil(child: Expression) extends MathematicalExpression(math.ceil, "CEIL")
+case class Ceil(child: Expression) extends UnaryMathExpression(math.ceil, "CEIL")
 
-case class Cos(child: Expression) extends MathematicalExpression(math.cos, "COS")
+case class Cos(child: Expression) extends UnaryMathExpression(math.cos, "COS")
 
-case class Cosh(child: Expression) extends MathematicalExpression(math.cosh, "COSH")
+case class Cosh(child: Expression) extends UnaryMathExpression(math.cosh, "COSH")
 
-case class Exp(child: Expression) extends MathematicalExpression(math.exp, "EXP")
+case class Exp(child: Expression) extends UnaryMathExpression(math.exp, "EXP")
 
-case class Expm1(child: Expression) extends MathematicalExpression(math.expm1, "EXPM1")
+case class Expm1(child: Expression) extends UnaryMathExpression(math.expm1, "EXPM1")
 
-case class Floor(child: Expression) extends MathematicalExpression(math.floor, "FLOOR")
+case class Floor(child: Expression) extends UnaryMathExpression(math.floor, "FLOOR")
 
-case class Log(child: Expression) extends MathematicalExpression(math.log, "LOG")
+case class Log(child: Expression) extends UnaryMathExpression(math.log, "LOG")
 
-case class Log10(child: Expression) extends MathematicalExpression(math.log10, "LOG10")
+case class Log10(child: Expression) extends UnaryMathExpression(math.log10, "LOG10")
 
-case class Log1p(child: Expression) extends MathematicalExpression(math.log1p, "LOG1P")
+case class Log1p(child: Expression) extends UnaryMathExpression(math.log1p, "LOG1P")
 
-case class Rint(child: Expression) extends MathematicalExpression(math.rint, "ROUND")
+case class Rint(child: Expression) extends UnaryMathExpression(math.rint, "ROUND")
 
-case class Signum(child: Expression) extends MathematicalExpression(math.signum, "SIGNUM")
+case class Signum(child: Expression) extends UnaryMathExpression(math.signum, "SIGNUM")
 
-case class Sin(child: Expression) extends MathematicalExpression(math.sin, "SIN")
+case class Sin(child: Expression) extends UnaryMathExpression(math.sin, "SIN")
 
-case class Sinh(child: Expression) extends MathematicalExpression(math.sinh, "SINH")
+case class Sinh(child: Expression) extends UnaryMathExpression(math.sinh, "SINH")
 
-case class Tan(child: Expression) extends MathematicalExpression(math.tan, "TAN")
+case class Tan(child: Expression) extends UnaryMathExpression(math.tan, "TAN")
 
-case class Tanh(child: Expression) extends MathematicalExpression(math.tanh, "TANH")
+case class Tanh(child: Expression) extends UnaryMathExpression(math.tanh, "TANH")
 
-case class ToDegrees(child: Expression) 
-  extends MathematicalExpression(math.toDegrees, "DEGREES")
+case class ToDegrees(child: Expression) extends UnaryMathExpression(math.toDegrees, "DEGREES")
 
-case class ToRadians(child: Expression) 
-  extends MathematicalExpression(math.toRadians, "RADIANS")
+case class ToRadians(child: Expression) extends UnaryMathExpression(math.toRadians, "RADIANS")

From 4615081d7a10b023491e25478d19b8161e030974 Mon Sep 17 00:00:00 2001
From: scwf <wangfei1@huawei.com>
Date: Wed, 27 May 2015 09:12:18 -0500
Subject: [PATCH 198/525] [CORE] [TEST] HistoryServerSuite failed due to
 timezone issue

follow up for #6377
Change time to the equivalent in GMT
/cc squito

Author: scwf <wangfei1@huawei.com>

Closes #6425 from scwf/fix-HistoryServerSuite and squashes the following commits:

4d37935 [scwf] fix HistoryServerSuite
---
 .../org/apache/spark/deploy/history/HistoryServerSuite.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
index 4adb5122bcf1a..e10dd4cf837aa 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
@@ -82,7 +82,7 @@ class HistoryServerSuite extends FunSuite with BeforeAndAfter with Matchers with
     "running app list json" -> "applications?status=running",
     "minDate app list json" -> "applications?minDate=2015-02-10",
     "maxDate app list json" -> "applications?maxDate=2015-02-10",
-    "maxDate2 app list json" -> "applications?maxDate=2015-02-03T10:42:40.000CST",
+    "maxDate2 app list json" -> "applications?maxDate=2015-02-03T16:42:40.000GMT",
     "one app json" -> "applications/local-1422981780767",
     "one app multi-attempt json" -> "applications/local-1426533911241",
     "job list json" -> "applications/local-1422981780767/jobs",

From ff0ddff46935ae3d036b7dbc437fff8a6c19d6a4 Mon Sep 17 00:00:00 2001
From: Kay Ousterhout <kayousterhout@gmail.com>
Date: Wed, 27 May 2015 09:32:29 -0700
Subject: [PATCH 199/525] [SPARK-7878] Rename Stage.jobId to firstJobId

The previous name was confusing, because each stage can be associated with
many jobs, and jobId is just the ID of the first job that was associated
with the Stage. This commit also renames some of the method parameters in
DAGScheduler.scala to clarify when the jobId refers to the first job ID
associated with the stage (as opposed to the jobId associated with a job
that's currently being scheduled).

cc markhamstra JoshRosen (hopefully this will help prevent future bugs like SPARK-6880)

Author: Kay Ousterhout <kayousterhout@gmail.com>

Closes #6418 from kayousterhout/SPARK-7878 and squashes the following commits:

b71a9b8 [Kay Ousterhout] [SPARK-7878] Rename Stage.jobId to firstJobId
---
 .../apache/spark/scheduler/DAGScheduler.scala | 58 +++++++++----------
 .../apache/spark/scheduler/ResultStage.scala  |  4 +-
 .../spark/scheduler/ShuffleMapStage.scala     |  4 +-
 .../org/apache/spark/scheduler/Stage.scala    |  4 +-
 4 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 5d812918a13d1..a083be2448aa3 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -208,19 +208,17 @@ class DAGScheduler(
 
   /**
    * Get or create a shuffle map stage for the given shuffle dependency's map side.
-   * The jobId value passed in will be used if the stage doesn't already exist with
-   * a lower jobId (jobId always increases across jobs.)
    */
   private def getShuffleMapStage(
       shuffleDep: ShuffleDependency[_, _, _],
-      jobId: Int): ShuffleMapStage = {
+      firstJobId: Int): ShuffleMapStage = {
     shuffleToMapStage.get(shuffleDep.shuffleId) match {
       case Some(stage) => stage
       case None =>
         // We are going to register ancestor shuffle dependencies
-        registerShuffleDependencies(shuffleDep, jobId)
+        registerShuffleDependencies(shuffleDep, firstJobId)
         // Then register current shuffleDep
-        val stage = newOrUsedShuffleStage(shuffleDep, jobId)
+        val stage = newOrUsedShuffleStage(shuffleDep, firstJobId)
         shuffleToMapStage(shuffleDep.shuffleId) = stage
 
         stage
@@ -230,15 +228,15 @@ class DAGScheduler(
   /**
    * Helper function to eliminate some code re-use when creating new stages.
    */
-  private def getParentStagesAndId(rdd: RDD[_], jobId: Int): (List[Stage], Int) = {
-    val parentStages = getParentStages(rdd, jobId)
+  private def getParentStagesAndId(rdd: RDD[_], firstJobId: Int): (List[Stage], Int) = {
+    val parentStages = getParentStages(rdd, firstJobId)
     val id = nextStageId.getAndIncrement()
     (parentStages, id)
   }
 
   /**
    * Create a ShuffleMapStage as part of the (re)-creation of a shuffle map stage in
-   * newOrUsedShuffleStage.  The stage will be associated with the provided jobId.
+   * newOrUsedShuffleStage.  The stage will be associated with the provided firstJobId.
    * Production of shuffle map stages should always use newOrUsedShuffleStage, not
    * newShuffleMapStage directly.
    */
@@ -246,21 +244,19 @@ class DAGScheduler(
       rdd: RDD[_],
       numTasks: Int,
       shuffleDep: ShuffleDependency[_, _, _],
-      jobId: Int,
+      firstJobId: Int,
       callSite: CallSite): ShuffleMapStage = {
-    val (parentStages: List[Stage], id: Int) = getParentStagesAndId(rdd, jobId)
+    val (parentStages: List[Stage], id: Int) = getParentStagesAndId(rdd, firstJobId)
     val stage: ShuffleMapStage = new ShuffleMapStage(id, rdd, numTasks, parentStages,
-      jobId, callSite, shuffleDep)
+      firstJobId, callSite, shuffleDep)
 
     stageIdToStage(id) = stage
-    updateJobIdStageIdMaps(jobId, stage)
+    updateJobIdStageIdMaps(firstJobId, stage)
     stage
   }
 
   /**
-   * Create a ResultStage -- either directly for use as a result stage, or as part of the
-   * (re)-creation of a shuffle map stage in newOrUsedShuffleStage.  The stage will be associated
-   * with the provided jobId.
+   * Create a ResultStage associated with the provided jobId.
    */
   private def newResultStage(
       rdd: RDD[_],
@@ -277,16 +273,16 @@ class DAGScheduler(
 
   /**
    * Create a shuffle map Stage for the given RDD.  The stage will also be associated with the
-   * provided jobId.  If a stage for the shuffleId existed previously so that the shuffleId is
+   * provided firstJobId.  If a stage for the shuffleId existed previously so that the shuffleId is
    * present in the MapOutputTracker, then the number and location of available outputs are
    * recovered from the MapOutputTracker
    */
   private def newOrUsedShuffleStage(
       shuffleDep: ShuffleDependency[_, _, _],
-      jobId: Int): ShuffleMapStage = {
+      firstJobId: Int): ShuffleMapStage = {
     val rdd = shuffleDep.rdd
     val numTasks = rdd.partitions.size
-    val stage = newShuffleMapStage(rdd, numTasks, shuffleDep, jobId, rdd.creationSite)
+    val stage = newShuffleMapStage(rdd, numTasks, shuffleDep, firstJobId, rdd.creationSite)
     if (mapOutputTracker.containsShuffle(shuffleDep.shuffleId)) {
       val serLocs = mapOutputTracker.getSerializedMapOutputStatuses(shuffleDep.shuffleId)
       val locs = MapOutputTracker.deserializeMapStatuses(serLocs)
@@ -304,10 +300,10 @@ class DAGScheduler(
   }
 
   /**
-   * Get or create the list of parent stages for a given RDD. The stages will be assigned the
-   * provided jobId if they haven't already been created with a lower jobId.
+   * Get or create the list of parent stages for a given RDD.  The new Stages will be created with
+   * the provided firstJobId.
    */
-  private def getParentStages(rdd: RDD[_], jobId: Int): List[Stage] = {
+  private def getParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
     val parents = new HashSet[Stage]
     val visited = new HashSet[RDD[_]]
     // We are manually maintaining a stack here to prevent StackOverflowError
@@ -321,7 +317,7 @@ class DAGScheduler(
         for (dep <- r.dependencies) {
           dep match {
             case shufDep: ShuffleDependency[_, _, _] =>
-              parents += getShuffleMapStage(shufDep, jobId)
+              parents += getShuffleMapStage(shufDep, firstJobId)
             case _ =>
               waitingForVisit.push(dep.rdd)
           }
@@ -336,11 +332,11 @@ class DAGScheduler(
   }
 
   /** Find ancestor missing shuffle dependencies and register into shuffleToMapStage */
-  private def registerShuffleDependencies(shuffleDep: ShuffleDependency[_, _, _], jobId: Int) {
+  private def registerShuffleDependencies(shuffleDep: ShuffleDependency[_, _, _], firstJobId: Int) {
     val parentsWithNoMapStage = getAncestorShuffleDependencies(shuffleDep.rdd)
     while (parentsWithNoMapStage.nonEmpty) {
       val currentShufDep = parentsWithNoMapStage.pop()
-      val stage = newOrUsedShuffleStage(currentShufDep, jobId)
+      val stage = newOrUsedShuffleStage(currentShufDep, firstJobId)
       shuffleToMapStage(currentShufDep.shuffleId) = stage
     }
   }
@@ -390,7 +386,7 @@ class DAGScheduler(
           for (dep <- rdd.dependencies) {
             dep match {
               case shufDep: ShuffleDependency[_, _, _] =>
-                val mapStage = getShuffleMapStage(shufDep, stage.jobId)
+                val mapStage = getShuffleMapStage(shufDep, stage.firstJobId)
                 if (!mapStage.isAvailable) {
                   missing += mapStage
                 }
@@ -577,7 +573,7 @@ class DAGScheduler(
 
   private[scheduler] def doCancelAllJobs() {
     // Cancel all running jobs.
-    runningStages.map(_.jobId).foreach(handleJobCancellation(_,
+    runningStages.map(_.firstJobId).foreach(handleJobCancellation(_,
       reason = "as part of cancellation of all jobs"))
     activeJobs.clear() // These should already be empty by this point,
     jobIdToActiveJob.clear() // but just in case we lost track of some jobs...
@@ -603,7 +599,7 @@ class DAGScheduler(
       clearCacheLocs()
       val failedStagesCopy = failedStages.toArray
       failedStages.clear()
-      for (stage <- failedStagesCopy.sortBy(_.jobId)) {
+      for (stage <- failedStagesCopy.sortBy(_.firstJobId)) {
         submitStage(stage)
       }
     }
@@ -623,7 +619,7 @@ class DAGScheduler(
     logTrace("failed: " + failedStages)
     val waitingStagesCopy = waitingStages.toArray
     waitingStages.clear()
-    for (stage <- waitingStagesCopy.sortBy(_.jobId)) {
+    for (stage <- waitingStagesCopy.sortBy(_.firstJobId)) {
       submitStage(stage)
     }
   }
@@ -843,7 +839,7 @@ class DAGScheduler(
       }
     }
 
-    val properties = jobIdToActiveJob.get(stage.jobId).map(_.properties).orNull
+    val properties = jobIdToActiveJob.get(stage.firstJobId).map(_.properties).orNull
 
     runningStages += stage
     // SparkListenerStageSubmitted should be posted before testing whether tasks are
@@ -909,7 +905,7 @@ class DAGScheduler(
       stage.pendingTasks ++= tasks
       logDebug("New pending tasks: " + stage.pendingTasks)
       taskScheduler.submitTasks(
-        new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
+        new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.firstJobId, properties))
       stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
     } else {
       // Because we posted SparkListenerStageSubmitted earlier, we should mark
@@ -1323,7 +1319,7 @@ class DAGScheduler(
         for (dep <- rdd.dependencies) {
           dep match {
             case shufDep: ShuffleDependency[_, _, _] =>
-              val mapStage = getShuffleMapStage(shufDep, stage.jobId)
+              val mapStage = getShuffleMapStage(shufDep, stage.firstJobId)
               if (!mapStage.isAvailable) {
                 waitingForVisit.push(mapStage.rdd)
               }  // Otherwise there's no need to follow the dependency back
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala
index c0f3d5a13d623..bf81b9aca4810 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala
@@ -28,9 +28,9 @@ private[spark] class ResultStage(
     rdd: RDD[_],
     numTasks: Int,
     parents: List[Stage],
-    jobId: Int,
+    firstJobId: Int,
     callSite: CallSite)
-  extends Stage(id, rdd, numTasks, parents, jobId, callSite) {
+  extends Stage(id, rdd, numTasks, parents, firstJobId, callSite) {
 
   // The active job for this result stage. Will be empty if the job has already finished
   // (e.g., because the job was cancelled).
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala
index d02210743484c..66c75f325fcde 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala
@@ -30,10 +30,10 @@ private[spark] class ShuffleMapStage(
     rdd: RDD[_],
     numTasks: Int,
     parents: List[Stage],
-    jobId: Int,
+    firstJobId: Int,
     callSite: CallSite,
     val shuffleDep: ShuffleDependency[_, _, _])
-  extends Stage(id, rdd, numTasks, parents, jobId, callSite) {
+  extends Stage(id, rdd, numTasks, parents, firstJobId, callSite) {
 
   override def toString: String = "ShuffleMapStage " + id
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
index 5d0ddb8377c33..c59d6e4f5bc04 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
@@ -34,7 +34,7 @@ import org.apache.spark.util.CallSite
  * initiated a job (e.g. count(), save(), etc). For shuffle map stages, we also track the nodes
  * that each output partition is on.
  *
- * Each Stage also has a jobId, identifying the job that first submitted the stage.  When FIFO
+ * Each Stage also has a firstJobId, identifying the job that first submitted the stage.  When FIFO
  * scheduling is used, this allows Stages from earlier jobs to be computed first or recovered
  * faster on failure.
  *
@@ -51,7 +51,7 @@ private[spark] abstract class Stage(
     val rdd: RDD[_],
     val numTasks: Int,
     val parents: List[Stage],
-    val jobId: Int,
+    val firstJobId: Int,
     val callSite: CallSite)
   extends Logging {
 

From 15459db4f6867e95076cf53fade2fca833c4cf4e Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Wed, 27 May 2015 10:09:12 -0700
Subject: [PATCH 200/525] [SPARK-7847] [SQL] Fixes dynamic partition directory
 escaping

Please refer to [SPARK-7847] [1] for details.

[1]: https://issues.apache.org/jira/browse/SPARK-7847

Author: Cheng Lian <lian@databricks.com>

Closes #6389 from liancheng/spark-7847 and squashes the following commits:

935c652 [Cheng Lian] Adds test case for writing various data types as dynamic partition value
f4fc398 [Cheng Lian] Converts partition columns to Scala type when writing dynamic partitions
d0aeca0 [Cheng Lian] Fixes dynamic partition directory escaping
---
 .../apache/spark/sql/parquet/newParquet.scala | 22 ++++--
 .../spark/sql/sources/PartitioningUtils.scala | 76 ++++++++++++++++++-
 .../apache/spark/sql/sources/commands.scala   | 57 ++------------
 .../ParquetPartitionDiscoverySuite.scala      | 57 +++++++++++++-
 4 files changed, 152 insertions(+), 60 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index cb1e60883df1e..8b3e1b2b59bf6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.parquet
 
+import java.net.URI
 import java.util.{List => JList}
 
 import scala.collection.JavaConversions._
@@ -282,21 +283,28 @@ private[sql] class ParquetRelation2(
         val cacheMetadata = useMetadataCache
 
         @transient val cachedStatuses = inputFiles.map { f =>
-          // In order to encode the authority of a Path containing special characters such as /,
-          // we need to use the string returned by the URI of the path to create a new Path.
-          val pathWithAuthority = new Path(f.getPath.toUri.toString)
-
+          // In order to encode the authority of a Path containing special characters such as '/'
+          // (which does happen in some S3N credentials), we need to use the string returned by the
+          // URI of the path to create a new Path.
+          val pathWithEscapedAuthority = escapePathUserInfo(f.getPath)
           new FileStatus(
             f.getLen, f.isDir, f.getReplication, f.getBlockSize, f.getModificationTime,
-            f.getAccessTime, f.getPermission, f.getOwner, f.getGroup, pathWithAuthority)
+            f.getAccessTime, f.getPermission, f.getOwner, f.getGroup, pathWithEscapedAuthority)
         }.toSeq
 
         @transient val cachedFooters = footers.map { f =>
           // In order to encode the authority of a Path containing special characters such as /,
           // we need to use the string returned by the URI of the path to create a new Path.
-          new Footer(new Path(f.getFile.toUri.toString), f.getParquetMetadata)
+          new Footer(escapePathUserInfo(f.getFile), f.getParquetMetadata)
         }.toSeq
 
+        private def escapePathUserInfo(path: Path): Path = {
+          val uri = path.toUri
+          new Path(new URI(
+            uri.getScheme, uri.getRawUserInfo, uri.getHost, uri.getPort, uri.getPath,
+            uri.getQuery, uri.getFragment))
+        }
+
         // Overridden so we can inject our own cached files statuses.
         override def getPartitions: Array[SparkPartition] = {
           val inputFormat = if (cacheMetadata) {
@@ -377,7 +385,7 @@ private[sql] class ParquetRelation2(
               .orElse(readSchema())
               .orElse(maybeMetastoreSchema)
               .getOrElse(sys.error("Failed to get the schema."))
-        
+
           // If this Parquet relation is converted from a Hive Metastore table, must reconcile case
           // case insensitivity issue and possible schema mismatch (probably caused by schema
           // evolution).
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
index e0ead23d786f9..dafdf0f8b4564 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
@@ -24,6 +24,7 @@ import scala.collection.mutable.ArrayBuffer
 import scala.util.Try
 
 import org.apache.hadoop.fs.Path
+import org.apache.hadoop.util.Shell
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
@@ -221,7 +222,7 @@ private[sql] object PartitioningUtils {
       // Then falls back to string
       .getOrElse {
         if (raw == defaultPartitionName) Literal.create(null, NullType)
-        else Literal.create(raw, StringType)
+        else Literal.create(unescapePathName(raw), StringType)
       }
   }
 
@@ -243,4 +244,77 @@ private[sql] object PartitioningUtils {
       Literal.create(Cast(l, desiredType).eval(), desiredType)
     }
   }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  // The following string escaping code is mainly copied from Hive (o.a.h.h.common.FileUtils).
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+
+  val charToEscape = {
+    val bitSet = new java.util.BitSet(128)
+
+    /**
+     * ASCII 01-1F are HTTP control characters that need to be escaped.
+     * \u000A and \u000D are \n and \r, respectively.
+     */
+    val clist = Array(
+      '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u0009',
+      '\n', '\u000B', '\u000C', '\r', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013',
+      '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C',
+      '\u001D', '\u001E', '\u001F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '\u007F',
+      '{', '[', ']', '^')
+
+    clist.foreach(bitSet.set(_))
+
+    if (Shell.WINDOWS) {
+      Array(' ', '<', '>', '|').foreach(bitSet.set(_))
+    }
+
+    bitSet
+  }
+
+  def needsEscaping(c: Char): Boolean = {
+    c >= 0 && c < charToEscape.size() && charToEscape.get(c)
+  }
+
+  def escapePathName(path: String): String = {
+    val builder = new StringBuilder()
+    path.foreach { c =>
+      if (needsEscaping(c)) {
+        builder.append('%')
+        builder.append(f"${c.asInstanceOf[Int]}%02x")
+      } else {
+        builder.append(c)
+      }
+    }
+
+    builder.toString()
+  }
+
+  def unescapePathName(path: String): String = {
+    val sb = new StringBuilder
+    var i = 0
+
+    while (i < path.length) {
+      val c = path.charAt(i)
+      if (c == '%' && i + 2 < path.length) {
+        val code: Int = try {
+          Integer.valueOf(path.substring(i + 1, i + 3), 16)
+        } catch { case e: Exception =>
+          -1: Integer
+        }
+        if (code >= 0) {
+          sb.append(code.asInstanceOf[Char])
+          i += 3
+        } else {
+          sb.append(c)
+          i += 1
+        }
+      } else {
+        sb.append(c)
+        i += 1
+      }
+    }
+
+    sb.toString()
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index fbd98ef0380e1..3132067d562f6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -24,7 +24,6 @@ import scala.collection.mutable
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat}
-import org.apache.hadoop.util.Shell
 import parquet.hadoop.util.ContextUtil
 
 import org.apache.spark._
@@ -35,7 +34,8 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.RunnableCommand
-import org.apache.spark.sql.{SQLConf, DataFrame, SQLContext, SaveMode}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext, SaveMode}
 
 private[sql] case class InsertIntoDataSource(
     logicalRelation: LogicalRelation,
@@ -208,9 +208,11 @@ private[sql] case class InsertIntoHadoopFsRelation(
           writerContainer.outputWriterForRow(partitionPart).write(convertedDataPart)
         }
       } else {
+        val partitionSchema = StructType.fromAttributes(partitionOutput)
+        val converter = CatalystTypeConverters.createToScalaConverter(partitionSchema)
         while (iterator.hasNext) {
           val row = iterator.next()
-          val partitionPart = partitionProj(row)
+          val partitionPart = converter(partitionProj(row)).asInstanceOf[Row]
           val dataPart = dataProj(row)
           writerContainer.outputWriterForRow(partitionPart).write(dataPart)
         }
@@ -416,7 +418,7 @@ private[sql] class DynamicPartitionWriterContainer(
       val valueString = if (string == null || string.isEmpty) {
         defaultPartitionName
       } else {
-        DynamicPartitionWriterContainer.escapePathName(string)
+        PartitioningUtils.escapePathName(string)
       }
       s"/$col=$valueString"
     }.mkString.stripPrefix(Path.SEPARATOR)
@@ -448,50 +450,3 @@ private[sql] class DynamicPartitionWriterContainer(
     }
   }
 }
-
-private[sql] object DynamicPartitionWriterContainer {
-  //////////////////////////////////////////////////////////////////////////////////////////////////
-  // The following string escaping code is mainly copied from Hive (o.a.h.h.common.FileUtils).
-  //////////////////////////////////////////////////////////////////////////////////////////////////
-
-  val charToEscape = {
-    val bitSet = new java.util.BitSet(128)
-
-    /**
-     * ASCII 01-1F are HTTP control characters that need to be escaped.
-     * \u000A and \u000D are \n and \r, respectively.
-     */
-    val clist = Array(
-      '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u0009',
-      '\n', '\u000B', '\u000C', '\r', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013',
-      '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C',
-      '\u001D', '\u001E', '\u001F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '\u007F',
-      '{', '[', ']', '^')
-
-    clist.foreach(bitSet.set(_))
-
-    if (Shell.WINDOWS) {
-      Array(' ', '<', '>', '|').foreach(bitSet.set(_))
-    }
-
-    bitSet
-  }
-
-  def needsEscaping(c: Char): Boolean = {
-    c >= 0 && c < charToEscape.size() && charToEscape.get(c)
-  }
-
-  def escapePathName(path: String): String = {
-    val builder = new StringBuilder()
-    path.foreach { c =>
-      if (DynamicPartitionWriterContainer.needsEscaping(c)) {
-        builder.append('%')
-        builder.append(f"${c.asInstanceOf[Int]}%02x")
-      } else {
-        builder.append(c)
-      }
-    }
-
-    builder.toString()
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 90d4528efca48..f231589e9674d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -17,6 +17,8 @@
 package org.apache.spark.sql.parquet
 
 import java.io.File
+import java.math.BigInteger
+import java.sql.{Timestamp, Date}
 
 import scala.collection.mutable.ArrayBuffer
 
@@ -27,7 +29,7 @@ import org.apache.spark.sql.sources.PartitioningUtils._
 import org.apache.spark.sql.sources.{LogicalRelation, Partition, PartitionSpec}
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{QueryTest, Row, SQLContext}
+import org.apache.spark.sql.{Column, QueryTest, Row, SQLContext}
 
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
@@ -377,4 +379,57 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       }
     }
   }
+
+  test("SPARK-7847: Dynamic partition directory path escaping and unescaping") {
+    withTempPath { dir =>
+      val df = Seq("/", "[]", "?").zipWithIndex.map(_.swap).toDF("i", "s")
+      df.write.format("parquet").partitionBy("s").save(dir.getCanonicalPath)
+      checkAnswer(read.parquet(dir.getCanonicalPath), df.collect())
+    }
+  }
+
+  test("Various partition value types") {
+    val row =
+      Row(
+        100.toByte,
+        40000.toShort,
+        Int.MaxValue,
+        Long.MaxValue,
+        1.5.toFloat,
+        4.5,
+        new java.math.BigDecimal(new BigInteger("212500"), 5),
+        new java.math.BigDecimal(2.125),
+        java.sql.Date.valueOf("2015-05-23"),
+        new Timestamp(0),
+        "This is a string, /[]?=:",
+        "This is not a partition column")
+
+    // BooleanType is not supported yet
+    val partitionColumnTypes =
+      Seq(
+        ByteType,
+        ShortType,
+        IntegerType,
+        LongType,
+        FloatType,
+        DoubleType,
+        DecimalType(10, 5),
+        DecimalType.Unlimited,
+        DateType,
+        TimestampType,
+        StringType)
+
+    val partitionColumns = partitionColumnTypes.zipWithIndex.map {
+      case (t, index) => StructField(s"p_$index", t)
+    }
+
+    val schema = StructType(partitionColumns :+ StructField(s"i", StringType))
+    val df = createDataFrame(sparkContext.parallelize(row :: Nil), schema)
+
+    withTempPath { dir =>
+      df.write.format("parquet").partitionBy(partitionColumns.map(_.name): _*).save(dir.toString)
+      val fields = schema.map(f => Column(f.name).cast(f.dataType))
+      checkAnswer(read.load(dir.toString).select(fields: _*), row)
+    }
+  }
 }

From 0db76c90ad5f84d7a5640c41de74876b906ddc90 Mon Sep 17 00:00:00 2001
From: Kousuke Saruta <sarutak@oss.nttdata.co.jp>
Date: Wed, 27 May 2015 11:41:35 -0700
Subject: [PATCH 201/525] [SPARK-7864] [UI] Fix the logic grabbing the link
 from table in AllJobPage

This issue is related to #6419 .
Now AllJobPage doesn't have a "kill link" but I think fix the issue mentioned in #6419 just in case to avoid accidents in the future.

So, it's minor issue for now and I don't file this issue in JIRA.

Author: Kousuke Saruta <sarutak@oss.nttdata.co.jp>

Closes #6432 from sarutak/remove-ambiguity-of-link and squashes the following commits:

cd1a503 [Kousuke Saruta] Fixed ambiguity link issue in AllJobPage
---
 .../main/resources/org/apache/spark/ui/static/timeline-view.js  | 2 +-
 core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
index 28ac998e8d065..ca74ef9d7e94e 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
@@ -46,7 +46,7 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime) {
       };
 
       $(this).click(function() {
-        var jobPagePath = $(getSelectorForJobEntry(this)).find("a").attr("href")
+        var jobPagePath = $(getSelectorForJobEntry(this)).find("a.name-link").attr("href")
           window.location.href = jobPagePath
       });
 
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala
index e010ebef3b34a..2ce670ad02e97 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala
@@ -231,7 +231,7 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") {
         </td>
         <td>
           <span class="description-input" title={lastStageDescription}>{lastStageDescription}</span>
-          <a href={detailUrl}>{lastStageName}</a>
+          <a href={detailUrl} class="name-link">{lastStageName}</a>
         </td>
         <td sorttable_customkey={job.submissionTime.getOrElse(-1).toString}>
           {formattedSubmissionTime}

From 6fec1a9409b34d8ce58ea1c330b52cc7ef3e7e7e Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 27 May 2015 11:54:35 -0700
Subject: [PATCH 202/525] Removed Guava dependency from JavaTypeInference's
 type signature.

This should also close #6243.

Author: Reynold Xin <rxin@databricks.com>

Closes #6431 from rxin/JavaTypeInference-guava and squashes the following commits:

e58df3c [Reynold Xin] Removed Gauva dependency from JavaTypeInference's type signature.
---
 .../apache/spark/sql/catalyst/JavaTypeInference.scala | 11 ++++++++++-
 .../main/scala/org/apache/spark/sql/SQLContext.scala  |  4 +---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
index 625c8d3a62125..9a3f9694e4c48 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
@@ -38,12 +38,21 @@ private [sql] object JavaTypeInference {
   private val keySetReturnType = classOf[JMap[_, _]].getMethod("keySet").getGenericReturnType
   private val valuesReturnType = classOf[JMap[_, _]].getMethod("values").getGenericReturnType
 
+  /**
+   * Infers the corresponding SQL data type of a JavaClean class.
+   * @param beanClass Java type
+   * @return (SQL data type, nullable)
+   */
+  def inferDataType(beanClass: Class[_]): (DataType, Boolean) = {
+    inferDataType(TypeToken.of(beanClass))
+  }
+
   /**
    * Infers the corresponding SQL data type of a Java type.
    * @param typeToken Java type
    * @return (SQL data type, nullable)
    */
-  private [sql] def inferDataType(typeToken: TypeToken[_]): (DataType, Boolean) = {
+  private def inferDataType(typeToken: TypeToken[_]): (DataType, Boolean) = {
     // TODO: All of this could probably be moved to Catalyst as it is mostly not Spark specific.
     typeToken.getRawType match {
       case c: Class[_] if c.isAnnotationPresent(classOf[SQLUserDefinedType]) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 3935f7b321b85..15c30352bee69 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -27,8 +27,6 @@ import scala.language.implicitConversions
 import scala.reflect.runtime.universe.TypeTag
 import scala.util.control.NonFatal
 
-import com.google.common.reflect.TypeToken
-
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
@@ -1011,7 +1009,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * Returns a Catalyst Schema for the given java bean class.
    */
   protected def getSchema(beanClass: Class[_]): Seq[AttributeReference] = {
-    val (dataType, _) = JavaTypeInference.inferDataType(TypeToken.of(beanClass))
+    val (dataType, _) = JavaTypeInference.inferDataType(beanClass)
     dataType.asInstanceOf[StructType].fields.map { f =>
       AttributeReference(f.name, f.dataType, f.nullable)()
     }

From 8161562eabc1eff430cfd9d8eaf413a8c4ef2cfb Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Wed, 27 May 2015 12:42:13 -0700
Subject: [PATCH 203/525] [SPARK-7790] [SQL] date and decimal conversion for
 dynamic partition key

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #6318 from adrian-wang/dynpart and squashes the following commits:

ad73b61 [Daoyuan Wang] not use sqlTestUtils for try catch because dont have sqlcontext here
6c33b51 [Daoyuan Wang] fix according to liancheng
f0f8074 [Daoyuan Wang] some specific types as dynamic partition
---
 .../hive/execution/InsertIntoHiveTable.scala  |  2 +-
 .../spark/sql/hive/hiveWriterContainers.scala | 17 ++++++++--
 .../sql/hive/execution/SQLQuerySuite.scala    | 33 +++++++++++++++++++
 3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index c0b0b104e9142..7a6ca48b54a24 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -106,7 +106,7 @@ case class InsertIntoHiveTable(
         }
 
         writerContainer
-          .getLocalFileWriter(row)
+          .getLocalFileWriter(row, table.schema)
           .write(serializer.serialize(outputData, standardOI))
       }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
index cbc381cc81b59..50b209f7ccbb8 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
@@ -34,8 +34,10 @@ import org.apache.hadoop.hive.common.FileUtils
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.sql.Row
 import org.apache.spark.{Logging, SerializableWritable, SparkHadoopWriter}
+import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.hive.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive.HiveShim._
+import org.apache.spark.sql.types._
 
 /**
  * Internal helper class that saves an RDD using a Hive OutputFormat.
@@ -92,7 +94,7 @@ private[hive] class SparkHiveWriterContainer(
     "part-" + numberFormat.format(splitID) + extension
   }
 
-  def getLocalFileWriter(row: Row): FileSinkOperator.RecordWriter = writer
+  def getLocalFileWriter(row: Row, schema: StructType): FileSinkOperator.RecordWriter = writer
 
   def close() {
     // Seems the boolean value passed into close does not matter.
@@ -195,11 +197,20 @@ private[spark] class SparkHiveDynamicPartitionWriterContainer(
     jobConf.setBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, oldMarker)
   }
 
-  override def getLocalFileWriter(row: Row): FileSinkOperator.RecordWriter = {
+  override def getLocalFileWriter(row: Row, schema: StructType): FileSinkOperator.RecordWriter = {
+    def convertToHiveRawString(col: String, value: Any): String = {
+      val raw = String.valueOf(value)
+      schema(col).dataType match {
+        case DateType => DateUtils.toString(raw.toInt)
+        case _: DecimalType => BigDecimal(raw).toString()
+        case _ => raw
+      }
+    }
+
     val dynamicPartPath = dynamicPartColNames
       .zip(row.toSeq.takeRight(dynamicPartColNames.length))
       .map { case (col, rawVal) =>
-        val string = if (rawVal == null) null else String.valueOf(rawVal)
+        val string = if (rawVal == null) null else convertToHiveRawString(col, rawVal)
         val colString =
           if (string == null || string.isEmpty) {
             defaultPartName
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index b707f5e68489b..538e66125c5fe 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -837,4 +837,37 @@ class SQLQuerySuite extends QueryTest {
         java.lang.Math.exp(1.0).toString,
         java.lang.Math.floor(1.9).toString))
   }
+
+  test("dynamic partition value test") {
+    try {
+      sql("set hive.exec.dynamic.partition.mode=nonstrict")
+      // date
+      sql("drop table if exists dynparttest1")
+      sql("create table dynparttest1 (value int) partitioned by (pdate date)")
+      sql(
+        """
+          |insert into table dynparttest1 partition(pdate)
+          | select count(*), cast('2015-05-21' as date) as pdate from src
+        """.stripMargin)
+      checkAnswer(
+        sql("select * from dynparttest1"),
+        Seq(Row(500, java.sql.Date.valueOf("2015-05-21"))))
+
+      // decimal
+      sql("drop table if exists dynparttest2")
+      sql("create table dynparttest2 (value int) partitioned by (pdec decimal(5, 1))")
+      sql(
+        """
+          |insert into table dynparttest2 partition(pdec)
+          | select count(*), cast('100.12' as decimal(5, 1)) as pdec from src
+        """.stripMargin)
+      checkAnswer(
+        sql("select * from dynparttest2"),
+        Seq(Row(500, new java.math.BigDecimal("100.1"))))
+    } finally {
+      sql("drop table if exists dynparttest1")
+      sql("drop table if exists dynparttest2")
+      sql("set hive.exec.dynamic.partition.mode=strict")
+    }
+  }
 }

From b97ddff000b99adca3dd8fe13d01054fd5014fa0 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Wed, 27 May 2015 13:09:33 -0700
Subject: [PATCH 204/525] [SPARK-7684] [SQL] Refactoring
 MetastoreDataSourcesSuite to workaround SPARK-7684

As stated in SPARK-7684, currently `TestHive.reset` has some execution order specific bug, which makes running specific test suites locally pretty frustrating. This PR refactors `MetastoreDataSourcesSuite` (which relies on `TestHive.reset` heavily) using various `withXxx` utility methods in `SQLTestUtils` to ask each test case to cleanup their own mess so that we can avoid calling `TestHive.reset`.

Author: Cheng Lian <lian@databricks.com>
Author: Yin Huai <yhuai@databricks.com>

Closes #6353 from liancheng/workaround-spark-7684 and squashes the following commits:

26939aa [Yin Huai] Move the initialization of jsonFilePath to beforeAll.
a423d48 [Cheng Lian] Fixes Scala style issue
dfe45d0 [Cheng Lian] Refactors MetastoreDataSourcesSuite to workaround SPARK-7684
92a116d [Cheng Lian] Fixes minor styling issues
---
 .../org/apache/spark/sql/QueryTest.scala      |    4 +
 .../apache/spark/sql/test/SQLTestUtils.scala  |   12 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala  | 1372 +++++++++--------
 3 files changed, 722 insertions(+), 666 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
index bbf9ab113ca43..98ba3c99283a1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
@@ -67,6 +67,10 @@ class QueryTest extends PlanTest {
     checkAnswer(df, Seq(expectedAnswer))
   }
 
+  protected def checkAnswer(df: DataFrame, expectedAnswer: DataFrame): Unit = {
+    checkAnswer(df, expectedAnswer.collect())
+  }
+
   def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext) {
     test(sqlString) {
       checkAnswer(sqlContext.sql(sqlString), expectedAnswer)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
index ca66cdc48272d..17a8b0cca09df 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
@@ -75,14 +75,18 @@ trait SQLTestUtils {
   /**
    * Drops temporary table `tableName` after calling `f`.
    */
-  protected def withTempTable(tableName: String)(f: => Unit): Unit = {
-    try f finally sqlContext.dropTempTable(tableName)
+  protected def withTempTable(tableNames: String*)(f: => Unit): Unit = {
+    try f finally tableNames.foreach(sqlContext.dropTempTable)
   }
 
   /**
    * Drops table `tableName` after calling `f`.
    */
-  protected def withTable(tableName: String)(f: => Unit): Unit = {
-    try f finally sqlContext.sql(s"DROP TABLE IF EXISTS $tableName")
+  protected def withTable(tableNames: String*)(f: => Unit): Unit = {
+    try f finally {
+      tableNames.foreach { name =>
+        sqlContext.sql(s"DROP TABLE IF EXISTS $name")
+      }
+    }
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 9623ef06aa9b0..58e2d1fbfa73e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -21,770 +21,818 @@ import java.io.File
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.scalatest.BeforeAndAfterAll
+
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapred.InvalidInputException
-import org.scalatest.BeforeAndAfterEach
 
 import org.apache.spark.sql._
 import org.apache.spark.sql.hive.client.{HiveTable, ManagedTable}
+import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.parquet.ParquetRelation2
 import org.apache.spark.sql.sources.LogicalRelation
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
 /**
  * Tests for persisting tables created though the data sources API into the metastore.
  */
-class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach {
+class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll {
+  override val sqlContext = TestHive
+
+  var jsonFilePath: String = _
 
-  override def afterEach(): Unit = {
-    reset()
-    Utils.deleteRecursively(tempPath)
+  override def beforeAll(): Unit = {
+    jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile
   }
 
-  val filePath = Utils.getSparkClassLoader.getResource("sample.json").getFile
-  var tempPath: File = Utils.createTempDir()
-  tempPath.delete()
-
-  test ("persistent JSON table") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
-
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      read.json(filePath).collect().toSeq)
+  test("persistent JSON table") {
+    withTable("jsonTable") {
+      sql(
+        s"""CREATE TABLE jsonTable
+           |USING org.apache.spark.sql.json.DefaultSource
+           |OPTIONS (
+           |  path '$jsonFilePath'
+           |)
+         """.stripMargin)
+
+      checkAnswer(
+        sql("SELECT * FROM jsonTable"),
+        read.json(jsonFilePath).collect().toSeq)
+    }
   }
 
-  test ("persistent JSON table with a user specified schema") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable (
-        |a string,
-        |b String,
-        |`c_!@(3)` int,
-        |`<d>` Struct<`d!`:array<int>, `=`:array<struct<Dd2: boolean>>>)
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
-
-    read.json(filePath).registerTempTable("expectedJsonTable")
-
-    checkAnswer(
-      sql("SELECT a, b, `c_!@(3)`, `<d>`.`d!`, `<d>`.`=` FROM jsonTable"),
-      sql("SELECT a, b, `c_!@(3)`, `<d>`.`d!`, `<d>`.`=` FROM expectedJsonTable").collect().toSeq)
+  test("persistent JSON table with a user specified schema") {
+    withTable("jsonTable") {
+      sql(
+        s"""CREATE TABLE jsonTable (
+           |a string,
+           |b String,
+           |`c_!@(3)` int,
+           |`<d>` Struct<`d!`:array<int>, `=`:array<struct<Dd2: boolean>>>)
+           |USING org.apache.spark.sql.json.DefaultSource
+           |OPTIONS (
+           |  path '$jsonFilePath'
+           |)
+         """.stripMargin)
+
+      withTempTable("expectedJsonTable") {
+        read.json(jsonFilePath).registerTempTable("expectedJsonTable")
+        checkAnswer(
+          sql("SELECT a, b, `c_!@(3)`, `<d>`.`d!`, `<d>`.`=` FROM jsonTable"),
+          sql("SELECT a, b, `c_!@(3)`, `<d>`.`d!`, `<d>`.`=` FROM expectedJsonTable"))
+      }
+    }
   }
 
-  test ("persistent JSON table with a user specified schema with a subset of fields") {
-    // This works because JSON objects are self-describing and JSONRelation can get needed
-    // field values based on field names.
-    sql(
-      s"""
-        |CREATE TABLE jsonTable (`<d>` Struct<`=`:array<struct<Dd2: boolean>>>, b String)
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
-
-    val innerStruct = StructType(
-      StructField("=", ArrayType(StructType(StructField("Dd2", BooleanType, true) :: Nil))) :: Nil)
-    val expectedSchema = StructType(
-      StructField("<d>", innerStruct, true) ::
-      StructField("b", StringType, true) :: Nil)
-
-    assert(expectedSchema === table("jsonTable").schema)
-
-    read.json(filePath).registerTempTable("expectedJsonTable")
-
-    checkAnswer(
-      sql("SELECT b, `<d>`.`=` FROM jsonTable"),
-      sql("SELECT b, `<d>`.`=` FROM expectedJsonTable").collect().toSeq)
+  test("persistent JSON table with a user specified schema with a subset of fields") {
+    withTable("jsonTable") {
+      // This works because JSON objects are self-describing and JSONRelation can get needed
+      // field values based on field names.
+      sql(
+        s"""CREATE TABLE jsonTable (`<d>` Struct<`=`:array<struct<Dd2: boolean>>>, b String)
+           |USING org.apache.spark.sql.json.DefaultSource
+           |OPTIONS (
+           |  path '$jsonFilePath'
+           |)
+         """.stripMargin)
+
+      val innerStruct = StructType(Seq(
+        StructField("=", ArrayType(StructType(StructField("Dd2", BooleanType, true) :: Nil)))))
+
+      val expectedSchema = StructType(Seq(
+        StructField("<d>", innerStruct, true),
+        StructField("b", StringType, true)))
+
+      assert(expectedSchema === table("jsonTable").schema)
+
+      withTempTable("expectedJsonTable") {
+        read.json(jsonFilePath).registerTempTable("expectedJsonTable")
+        checkAnswer(
+          sql("SELECT b, `<d>`.`=` FROM jsonTable"),
+          sql("SELECT b, `<d>`.`=` FROM expectedJsonTable"))
+      }
+    }
   }
 
   test("resolve shortened provider names") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
-
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      read.json(filePath).collect().toSeq)
+    withTable("jsonTable") {
+      sql(
+        s"""
+           |CREATE TABLE jsonTable
+           |USING org.apache.spark.sql.json
+           |OPTIONS (
+           |  path '$jsonFilePath'
+           |)
+         """.stripMargin)
+
+      checkAnswer(
+        sql("SELECT * FROM jsonTable"),
+        read.json(jsonFilePath).collect().toSeq)
+    }
   }
 
   test("drop table") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
-
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      read.json(filePath).collect().toSeq)
-
-    sql("DROP TABLE jsonTable")
-
-    intercept[Exception] {
-      sql("SELECT * FROM jsonTable").collect()
-    }
+    withTable("jsonTable") {
+      sql(
+        s"""
+           |CREATE TABLE jsonTable
+           |USING org.apache.spark.sql.json
+           |OPTIONS (
+           |  path '$jsonFilePath'
+           |)
+         """.stripMargin)
+
+      checkAnswer(
+        sql("SELECT * FROM jsonTable"),
+        read.json(jsonFilePath))
+
+      sql("DROP TABLE jsonTable")
 
-    assert(
-      (new File(filePath)).exists(),
-      "The table with specified path is considered as an external table, " +
-        "its data should not deleted after DROP TABLE.")
+      intercept[Exception] {
+        sql("SELECT * FROM jsonTable").collect()
+      }
+
+      assert(
+        new File(jsonFilePath).exists(),
+        "The table with specified path is considered as an external table, " +
+          "its data should not deleted after DROP TABLE.")
+    }
   }
 
   test("check change without refresh") {
-    val tempDir = File.createTempFile("sparksql", "json", Utils.createTempDir())
-    tempDir.delete()
-    sparkContext.parallelize(("a", "b") :: Nil).toDF()
-      .toJSON.saveAsTextFile(tempDir.getCanonicalPath)
-
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json
-        |OPTIONS (
-        |  path '${tempDir.getCanonicalPath}'
-        |)
-      """.stripMargin)
-
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      Row("a", "b"))
-
-    Utils.deleteRecursively(tempDir)
-    sparkContext.parallelize(("a1", "b1", "c1") :: Nil).toDF()
-      .toJSON.saveAsTextFile(tempDir.getCanonicalPath)
-
-    // Schema is cached so the new column does not show. The updated values in existing columns
-    // will show.
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      Row("a1", "b1"))
-
-    sql("REFRESH TABLE jsonTable")
-
-    // Check that the refresh worked
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      Row("a1", "b1", "c1"))
-    Utils.deleteRecursively(tempDir)
+    withTempPath { tempDir =>
+      withTable("jsonTable") {
+        (("a", "b") :: Nil).toDF().toJSON.saveAsTextFile(tempDir.getCanonicalPath)
+
+        sql(
+          s"""CREATE TABLE jsonTable
+             |USING org.apache.spark.sql.json
+             |OPTIONS (
+             |  path '${tempDir.getCanonicalPath}'
+             |)
+           """.stripMargin)
+
+        checkAnswer(
+          sql("SELECT * FROM jsonTable"),
+          Row("a", "b"))
+
+        Utils.deleteRecursively(tempDir)
+        (("a1", "b1", "c1") :: Nil).toDF().toJSON.saveAsTextFile(tempDir.getCanonicalPath)
+
+        // Schema is cached so the new column does not show. The updated values in existing columns
+        // will show.
+        checkAnswer(
+          sql("SELECT * FROM jsonTable"),
+          Row("a1", "b1"))
+
+        sql("REFRESH TABLE jsonTable")
+
+        // Check that the refresh worked
+        checkAnswer(
+          sql("SELECT * FROM jsonTable"),
+          Row("a1", "b1", "c1"))
+      }
+    }
   }
 
   test("drop, change, recreate") {
-    val tempDir = File.createTempFile("sparksql", "json", Utils.createTempDir())
-    tempDir.delete()
-    sparkContext.parallelize(("a", "b") :: Nil).toDF()
-      .toJSON.saveAsTextFile(tempDir.getCanonicalPath)
-
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json
-        |OPTIONS (
-        |  path '${tempDir.getCanonicalPath}'
-        |)
-      """.stripMargin)
-
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      Row("a", "b"))
-
-    Utils.deleteRecursively(tempDir)
-    sparkContext.parallelize(("a", "b", "c") :: Nil).toDF()
-      .toJSON.saveAsTextFile(tempDir.getCanonicalPath)
-
-    sql("DROP TABLE jsonTable")
-
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json
-        |OPTIONS (
-        |  path '${tempDir.getCanonicalPath}'
-        |)
-      """.stripMargin)
-
-    // New table should reflect new schema.
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      Row("a", "b", "c"))
-    Utils.deleteRecursively(tempDir)
+    withTempPath { tempDir =>
+      (("a", "b") :: Nil).toDF().toJSON.saveAsTextFile(tempDir.getCanonicalPath)
+
+      withTable("jsonTable") {
+        sql(
+          s"""CREATE TABLE jsonTable
+             |USING org.apache.spark.sql.json
+             |OPTIONS (
+             |  path '${tempDir.getCanonicalPath}'
+             |)
+           """.stripMargin)
+
+        checkAnswer(
+          sql("SELECT * FROM jsonTable"),
+          Row("a", "b"))
+
+        Utils.deleteRecursively(tempDir)
+        (("a", "b", "c") :: Nil).toDF().toJSON.saveAsTextFile(tempDir.getCanonicalPath)
+
+        sql("DROP TABLE jsonTable")
+
+        sql(
+          s"""CREATE TABLE jsonTable
+             |USING org.apache.spark.sql.json
+             |OPTIONS (
+             |  path '${tempDir.getCanonicalPath}'
+             |)
+           """.stripMargin)
+
+        // New table should reflect new schema.
+        checkAnswer(
+          sql("SELECT * FROM jsonTable"),
+          Row("a", "b", "c"))
+      }
+    }
   }
 
   test("invalidate cache and reload") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable (`c_!@(3)` int)
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
+    withTable("jsonTable") {
+      sql(
+        s"""CREATE TABLE jsonTable (`c_!@(3)` int)
+           |USING org.apache.spark.sql.json.DefaultSource
+           |OPTIONS (
+           |  path '$jsonFilePath'
+           |)
+         """.stripMargin)
 
-    read.json(filePath).registerTempTable("expectedJsonTable")
+      withTempTable("expectedJsonTable") {
+        read.json(jsonFilePath).registerTempTable("expectedJsonTable")
 
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      sql("SELECT `c_!@(3)` FROM expectedJsonTable").collect().toSeq)
+        checkAnswer(
+          sql("SELECT * FROM jsonTable"),
+          sql("SELECT `c_!@(3)` FROM expectedJsonTable").collect().toSeq)
 
-    // Discard the cached relation.
-    invalidateTable("jsonTable")
+        // Discard the cached relation.
+        invalidateTable("jsonTable")
 
-    checkAnswer(
-      sql("SELECT * FROM jsonTable"),
-      sql("SELECT `c_!@(3)` FROM expectedJsonTable").collect().toSeq)
+        checkAnswer(
+          sql("SELECT * FROM jsonTable"),
+          sql("SELECT `c_!@(3)` FROM expectedJsonTable").collect().toSeq)
 
-    invalidateTable("jsonTable")
-    val expectedSchema = StructType(StructField("c_!@(3)", IntegerType, true) :: Nil)
+        invalidateTable("jsonTable")
+        val expectedSchema = StructType(StructField("c_!@(3)", IntegerType, true) :: Nil)
 
-    assert(expectedSchema === table("jsonTable").schema)
+        assert(expectedSchema === table("jsonTable").schema)
+      }
+    }
   }
 
   test("CTAS") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
-
-    sql(
-      s"""
-        |CREATE TABLE ctasJsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${tempPath}'
-        |) AS
-        |SELECT * FROM jsonTable
-      """.stripMargin)
-
-    assert(table("ctasJsonTable").schema === table("jsonTable").schema)
-
-    checkAnswer(
-      sql("SELECT * FROM ctasJsonTable"),
-      sql("SELECT * FROM jsonTable").collect())
+    withTempPath { tempPath =>
+      withTable("jsonTable", "ctasJsonTable") {
+        sql(
+          s"""CREATE TABLE jsonTable
+             |USING org.apache.spark.sql.json.DefaultSource
+             |OPTIONS (
+             |  path '$jsonFilePath'
+             |)
+           """.stripMargin)
+
+        sql(
+          s"""CREATE TABLE ctasJsonTable
+             |USING org.apache.spark.sql.json.DefaultSource
+             |OPTIONS (
+             |  path '$tempPath'
+             |) AS
+             |SELECT * FROM jsonTable
+           """.stripMargin)
+
+        assert(table("ctasJsonTable").schema === table("jsonTable").schema)
+
+        checkAnswer(
+          sql("SELECT * FROM ctasJsonTable"),
+          sql("SELECT * FROM jsonTable").collect())
+      }
+    }
   }
 
   test("CTAS with IF NOT EXISTS") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
-
-    sql(
-      s"""
-        |CREATE TABLE ctasJsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${tempPath}'
-        |) AS
-        |SELECT * FROM jsonTable
-      """.stripMargin)
-
-    // Create the table again should trigger a AnalysisException.
-    val message = intercept[AnalysisException] {
-      sql(
-        s"""
-        |CREATE TABLE ctasJsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${tempPath}'
-        |) AS
-        |SELECT * FROM jsonTable
-      """.stripMargin)
-    }.getMessage
-    assert(message.contains("Table ctasJsonTable already exists."),
-      "We should complain that ctasJsonTable already exists")
-
-    // The following statement should be fine if it has IF NOT EXISTS.
-    // It tries to create a table ctasJsonTable with a new schema.
-    // The actual table's schema and data should not be changed.
-    sql(
-      s"""
-        |CREATE TABLE IF NOT EXISTS ctasJsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${tempPath}'
-        |) AS
-        |SELECT a FROM jsonTable
-      """.stripMargin)
-
-    // Discard the cached relation.
-    invalidateTable("ctasJsonTable")
-
-    // Schema should not be changed.
-    assert(table("ctasJsonTable").schema === table("jsonTable").schema)
-    // Table data should not be changed.
-    checkAnswer(
-      sql("SELECT * FROM ctasJsonTable"),
-      sql("SELECT * FROM jsonTable").collect())
+    withTempPath { path =>
+      val tempPath = path.getCanonicalPath
+
+      withTable("jsonTable", "ctasJsonTable") {
+        sql(
+          s"""CREATE TABLE jsonTable
+             |USING org.apache.spark.sql.json.DefaultSource
+             |OPTIONS (
+             |  path '$jsonFilePath'
+             |)
+           """.stripMargin)
+
+        sql(
+          s"""CREATE TABLE ctasJsonTable
+             |USING org.apache.spark.sql.json.DefaultSource
+             |OPTIONS (
+             |  path '$tempPath'
+             |) AS
+             |SELECT * FROM jsonTable
+           """.stripMargin)
+
+        // Create the table again should trigger a AnalysisException.
+        val message = intercept[AnalysisException] {
+          sql(
+            s"""CREATE TABLE ctasJsonTable
+               |USING org.apache.spark.sql.json.DefaultSource
+               |OPTIONS (
+               |  path '$tempPath'
+               |) AS
+               |SELECT * FROM jsonTable
+             """.stripMargin)
+        }.getMessage
+
+        assert(
+          message.contains("Table ctasJsonTable already exists."),
+          "We should complain that ctasJsonTable already exists")
+
+        // The following statement should be fine if it has IF NOT EXISTS.
+        // It tries to create a table ctasJsonTable with a new schema.
+        // The actual table's schema and data should not be changed.
+        sql(
+          s"""CREATE TABLE IF NOT EXISTS ctasJsonTable
+             |USING org.apache.spark.sql.json.DefaultSource
+             |OPTIONS (
+             |  path '$tempPath'
+             |) AS
+             |SELECT a FROM jsonTable
+           """.stripMargin)
+
+        // Discard the cached relation.
+        invalidateTable("ctasJsonTable")
+
+        // Schema should not be changed.
+        assert(table("ctasJsonTable").schema === table("jsonTable").schema)
+        // Table data should not be changed.
+        checkAnswer(
+          sql("SELECT * FROM ctasJsonTable"),
+          sql("SELECT * FROM jsonTable").collect())
+      }
+    }
   }
 
   test("CTAS a managed table") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${filePath}'
-        |)
-      """.stripMargin)
-
-    val expectedPath = catalog.hiveDefaultTableFilePath("ctasJsonTable")
-    val filesystemPath = new Path(expectedPath)
-    val fs = filesystemPath.getFileSystem(sparkContext.hadoopConfiguration)
-    if (fs.exists(filesystemPath)) fs.delete(filesystemPath, true)
-
-    // It is a managed table when we do not specify the location.
-    sql(
-      s"""
-        |CREATE TABLE ctasJsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |AS
-        |SELECT * FROM jsonTable
-      """.stripMargin)
-
-    assert(fs.exists(filesystemPath), s"$expectedPath should exist after we create the table.")
-
-    sql(
-      s"""
-        |CREATE TABLE loadedTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path '${expectedPath}'
-        |)
-      """.stripMargin)
-
-    assert(table("ctasJsonTable").schema === table("loadedTable").schema)
-
-    checkAnswer(
-      sql("SELECT * FROM ctasJsonTable"),
-      sql("SELECT * FROM loadedTable").collect()
-    )
-
-    sql("DROP TABLE ctasJsonTable")
-    assert(!fs.exists(filesystemPath), s"$expectedPath should not exist after we drop the table.")
-  }
-
-  test("SPARK-5286 Fail to drop an invalid table when using the data source API") {
-    sql(
-      s"""
-        |CREATE TABLE jsonTable
-        |USING org.apache.spark.sql.json.DefaultSource
-        |OPTIONS (
-        |  path 'it is not a path at all!'
-        |)
-      """.stripMargin)
-
-    sql("DROP TABLE jsonTable").collect().foreach(println)
-  }
-
-  test("SPARK-5839 HiveMetastoreCatalog does not recognize table aliases of data source tables.") {
-    val originalDefaultSource = conf.defaultDataSourceName
-
-    val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    val df = read.json(rdd)
-
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
-    // Save the df as a managed table (by not specifiying the path).
-    df.write.saveAsTable("savedJsonTable")
-
-    checkAnswer(
-      sql("SELECT * FROM savedJsonTable where savedJsonTable.a < 5"),
-      (1 to 4).map(i => Row(i, s"str${i}")))
+    withTable("jsonTable", "ctasJsonTable", "loadedTable") {
+      sql(
+        s"""CREATE TABLE jsonTable
+           |USING org.apache.spark.sql.json.DefaultSource
+           |OPTIONS (
+           |  path '$jsonFilePath'
+           |)
+         """.stripMargin)
+
+      val expectedPath = catalog.hiveDefaultTableFilePath("ctasJsonTable")
+      val filesystemPath = new Path(expectedPath)
+      val fs = filesystemPath.getFileSystem(sparkContext.hadoopConfiguration)
+      if (fs.exists(filesystemPath)) fs.delete(filesystemPath, true)
+
+      // It is a managed table when we do not specify the location.
+      sql(
+        s"""CREATE TABLE ctasJsonTable
+           |USING org.apache.spark.sql.json.DefaultSource
+           |AS
+           |SELECT * FROM jsonTable
+         """.stripMargin)
 
-    checkAnswer(
-      sql("SELECT * FROM savedJsonTable tmp where tmp.a > 5"),
-      (6 to 10).map(i => Row(i, s"str${i}")))
+      assert(fs.exists(filesystemPath), s"$expectedPath should exist after we create the table.")
 
-    invalidateTable("savedJsonTable")
+      sql(
+        s"""CREATE TABLE loadedTable
+           |USING org.apache.spark.sql.json.DefaultSource
+           |OPTIONS (
+           |  path '$expectedPath'
+           |)
+         """.stripMargin)
 
-    checkAnswer(
-      sql("SELECT * FROM savedJsonTable where savedJsonTable.a < 5"),
-      (1 to 4).map(i => Row(i, s"str${i}")))
+      assert(table("ctasJsonTable").schema === table("loadedTable").schema)
 
-    checkAnswer(
-      sql("SELECT * FROM savedJsonTable tmp where tmp.a > 5"),
-      (6 to 10).map(i => Row(i, s"str${i}")))
+      checkAnswer(
+        sql("SELECT * FROM ctasJsonTable"),
+        sql("SELECT * FROM loadedTable"))
 
-    // Drop table will also delete the data.
-    sql("DROP TABLE savedJsonTable")
+      sql("DROP TABLE ctasJsonTable")
+      assert(!fs.exists(filesystemPath), s"$expectedPath should not exist after we drop the table.")
+    }
+  }
 
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource)
+  test("SPARK-5286 Fail to drop an invalid table when using the data source API") {
+    withTable("jsonTable") {
+      sql(
+        s"""CREATE TABLE jsonTable
+           |USING org.apache.spark.sql.json.DefaultSource
+           |OPTIONS (
+           |  path 'it is not a path at all!'
+           |)
+         """.stripMargin)
+
+      sql("DROP TABLE jsonTable").collect().foreach(println)
+    }
   }
 
-  test("save table") {
-    val originalDefaultSource = conf.defaultDataSourceName
+  test("SPARK-5839 HiveMetastoreCatalog does not recognize table aliases of data source tables.") {
+    withTable("savedJsonTable") {
+      // Save the df as a managed table (by not specifying the path).
+      (1 to 10)
+        .map(i => i -> s"str$i")
+        .toDF("a", "b")
+        .write
+        .format("json")
+        .saveAsTable("savedJsonTable")
 
-    val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    val df = read.json(rdd)
+      checkAnswer(
+        sql("SELECT * FROM savedJsonTable where savedJsonTable.a < 5"),
+        (1 to 4).map(i => Row(i, s"str$i")))
 
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
-    // Save the df as a managed table (by not specifiying the path).
-    df.write.saveAsTable("savedJsonTable")
+      checkAnswer(
+        sql("SELECT * FROM savedJsonTable tmp where tmp.a > 5"),
+        (6 to 10).map(i => Row(i, s"str$i")))
 
-    checkAnswer(
-      sql("SELECT * FROM savedJsonTable"),
-      df.collect())
+      invalidateTable("savedJsonTable")
 
-    // Right now, we cannot append to an existing JSON table.
-    intercept[RuntimeException] {
-      df.write.mode(SaveMode.Append).saveAsTable("savedJsonTable")
-    }
+      checkAnswer(
+        sql("SELECT * FROM savedJsonTable where savedJsonTable.a < 5"),
+        (1 to 4).map(i => Row(i, s"str$i")))
 
-    // We can overwrite it.
-    df.write.mode(SaveMode.Overwrite).saveAsTable("savedJsonTable")
-    checkAnswer(
-      sql("SELECT * FROM savedJsonTable"),
-      df.collect())
-
-    // When the save mode is Ignore, we will do nothing when the table already exists.
-    df.select("b").write.mode(SaveMode.Ignore).saveAsTable("savedJsonTable")
-    assert(df.schema === table("savedJsonTable").schema)
-    checkAnswer(
-      sql("SELECT * FROM savedJsonTable"),
-      df.collect())
-
-    // Drop table will also delete the data.
-    sql("DROP TABLE savedJsonTable")
-    intercept[InvalidInputException] {
-      read.json(catalog.hiveDefaultTableFilePath("savedJsonTable"))
+      checkAnswer(
+        sql("SELECT * FROM savedJsonTable tmp where tmp.a > 5"),
+        (6 to 10).map(i => Row(i, s"str$i")))
     }
+  }
 
-    // Create an external table by specifying the path.
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    df.write
-      .format("org.apache.spark.sql.json")
-      .mode(SaveMode.Append)
-      .option("path", tempPath.toString)
-      .saveAsTable("savedJsonTable")
-    checkAnswer(
-      sql("SELECT * FROM savedJsonTable"),
-      df.collect())
-
-    // Data should not be deleted after we drop the table.
-    sql("DROP TABLE savedJsonTable")
-    checkAnswer(
-      read.json(tempPath.toString),
-      df.collect())
-
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource)
+  test("save table") {
+    withTempPath { path =>
+      val tempPath = path.getCanonicalPath
+
+      withTable("savedJsonTable") {
+        val df = (1 to 10).map(i => i -> s"str$i").toDF("a", "b")
+
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "json") {
+          // Save the df as a managed table (by not specifying the path).
+          df.write.saveAsTable("savedJsonTable")
+
+          checkAnswer(sql("SELECT * FROM savedJsonTable"), df)
+
+          // Right now, we cannot append to an existing JSON table.
+          intercept[RuntimeException] {
+            df.write.mode(SaveMode.Append).saveAsTable("savedJsonTable")
+          }
+
+          // We can overwrite it.
+          df.write.mode(SaveMode.Overwrite).saveAsTable("savedJsonTable")
+          checkAnswer(sql("SELECT * FROM savedJsonTable"), df)
+
+          // When the save mode is Ignore, we will do nothing when the table already exists.
+          df.select("b").write.mode(SaveMode.Ignore).saveAsTable("savedJsonTable")
+          assert(df.schema === table("savedJsonTable").schema)
+          checkAnswer(sql("SELECT * FROM savedJsonTable"), df)
+
+          // Drop table will also delete the data.
+          sql("DROP TABLE savedJsonTable")
+          intercept[InvalidInputException] {
+            read.json(catalog.hiveDefaultTableFilePath("savedJsonTable"))
+          }
+        }
+
+        // Create an external table by specifying the path.
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "not a source name") {
+          df.write
+            .format("org.apache.spark.sql.json")
+            .mode(SaveMode.Append)
+            .option("path", tempPath.toString)
+            .saveAsTable("savedJsonTable")
+
+          checkAnswer(sql("SELECT * FROM savedJsonTable"), df)
+        }
+
+        // Data should not be deleted after we drop the table.
+        sql("DROP TABLE savedJsonTable")
+        checkAnswer(read.json(tempPath.toString), df)
+      }
+    }
   }
 
   test("create external table") {
-    val originalDefaultSource = conf.defaultDataSourceName
-
-    val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    val df = read.json(rdd)
-
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    df.write.format("org.apache.spark.sql.json")
-      .mode(SaveMode.Append)
-      .option("path", tempPath.toString)
-      .saveAsTable("savedJsonTable")
-
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
-    createExternalTable("createdJsonTable", tempPath.toString)
-    assert(table("createdJsonTable").schema === df.schema)
-    checkAnswer(
-      sql("SELECT * FROM createdJsonTable"),
-      df.collect())
-
-    var message = intercept[AnalysisException] {
-      createExternalTable("createdJsonTable", filePath.toString)
-    }.getMessage
-    assert(message.contains("Table createdJsonTable already exists."),
-      "We should complain that ctasJsonTable already exists")
-
-    // Data should not be deleted.
-    sql("DROP TABLE createdJsonTable")
-    checkAnswer(
-      read.json(tempPath.toString),
-      df.collect())
-
-    // Try to specify the schema.
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    val schema = StructType(StructField("b", StringType, true) :: Nil)
-    createExternalTable(
-      "createdJsonTable",
-      "org.apache.spark.sql.json",
-      schema,
-      Map("path" -> tempPath.toString))
-    checkAnswer(
-      sql("SELECT * FROM createdJsonTable"),
-      sql("SELECT b FROM savedJsonTable").collect())
-
-    sql("DROP TABLE createdJsonTable")
-
-    message = intercept[RuntimeException] {
-      createExternalTable(
-        "createdJsonTable",
-        "org.apache.spark.sql.json",
-        schema,
-        Map.empty[String, String])
-    }.getMessage
-    assert(
-      message.contains("'path' must be specified for json data."),
-      "We should complain that path is not specified.")
-
-    sql("DROP TABLE savedJsonTable")
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource)
+    withTempPath { tempPath =>
+      withTable("savedJsonTable", "createdJsonTable") {
+        val df = read.json(sparkContext.parallelize((1 to 10).map { i =>
+          s"""{ "a": $i, "b": "str$i" }"""
+        }))
+
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "not a source name") {
+          df.write
+            .format("json")
+            .mode(SaveMode.Append)
+            .option("path", tempPath.toString)
+            .saveAsTable("savedJsonTable")
+        }
+
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "json") {
+          createExternalTable("createdJsonTable", tempPath.toString)
+          assert(table("createdJsonTable").schema === df.schema)
+          checkAnswer(sql("SELECT * FROM createdJsonTable"), df)
+
+          assert(
+            intercept[AnalysisException] {
+              createExternalTable("createdJsonTable", jsonFilePath.toString)
+            }.getMessage.contains("Table createdJsonTable already exists."),
+            "We should complain that createdJsonTable already exists")
+        }
+
+        // Data should not be deleted.
+        sql("DROP TABLE createdJsonTable")
+        checkAnswer(read.json(tempPath.toString), df)
+
+        // Try to specify the schema.
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "not a source name") {
+          val schema = StructType(StructField("b", StringType, true) :: Nil)
+          createExternalTable(
+            "createdJsonTable",
+            "org.apache.spark.sql.json",
+            schema,
+            Map("path" -> tempPath.toString))
+
+          checkAnswer(
+            sql("SELECT * FROM createdJsonTable"),
+            sql("SELECT b FROM savedJsonTable"))
+
+          sql("DROP TABLE createdJsonTable")
+
+          assert(
+            intercept[RuntimeException] {
+              createExternalTable(
+                "createdJsonTable",
+                "org.apache.spark.sql.json",
+                schema,
+                Map.empty[String, String])
+            }.getMessage.contains("'path' must be specified for json data."),
+            "We should complain that path is not specified.")
+        }
+      }
+    }
   }
 
   if (HiveShim.version == "0.13.1") {
     test("scan a parquet table created through a CTAS statement") {
-      val originalConvertMetastore = getConf("spark.sql.hive.convertMetastoreParquet", "true")
-      val originalUseDataSource = getConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
-      setConf("spark.sql.hive.convertMetastoreParquet", "true")
-      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
-
-      val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-      read.json(rdd).registerTempTable("jt")
-      sql(
-        """
-          |create table test_parquet_ctas STORED AS parquET
-          |AS select tmp.a from jt tmp where tmp.a < 5
-        """.stripMargin)
-
-      checkAnswer(
-        sql(s"SELECT a FROM test_parquet_ctas WHERE a > 2 "),
-        Row(3) :: Row(4) :: Nil
-      )
-
-      table("test_parquet_ctas").queryExecution.optimizedPlan match {
-        case LogicalRelation(p: ParquetRelation2) => // OK
-        case _ =>
-          fail(
-            "test_parquet_ctas should be converted to " +
-            s"${classOf[ParquetRelation2].getCanonicalName}")
+      withSQLConf(
+        "spark.sql.hive.convertMetastoreParquet" -> "true",
+        SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") {
+
+        withTempTable("jt") {
+          (1 to 10).map(i => i -> s"str$i").toDF("a", "b").registerTempTable("jt")
+
+          withTable("test_parquet_ctas") {
+            sql(
+              """CREATE TABLE test_parquet_ctas STORED AS PARQUET
+                |AS SELECT tmp.a FROM jt tmp WHERE tmp.a < 5
+              """.stripMargin)
+
+            checkAnswer(
+              sql(s"SELECT a FROM test_parquet_ctas WHERE a > 2 "),
+              Row(3) :: Row(4) :: Nil)
+
+            table("test_parquet_ctas").queryExecution.optimizedPlan match {
+              case LogicalRelation(p: ParquetRelation2) => // OK
+              case _ =>
+                fail(s"test_parquet_ctas should have be converted to ${classOf[ParquetRelation2]}")
+            }
+          }
+        }
       }
-
-      // Clenup and reset confs.
-      sql("DROP TABLE IF EXISTS jt")
-      sql("DROP TABLE IF EXISTS test_parquet_ctas")
-      setConf("spark.sql.hive.convertMetastoreParquet", originalConvertMetastore)
-      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalUseDataSource)
     }
   }
 
   test("Pre insert nullability check (ArrayType)") {
-    val df1 =
-      createDataFrame(Tuple1(Seq(Int.box(1), null.asInstanceOf[Integer])) :: Nil).toDF("a")
-    val expectedSchema1 =
-      StructType(
-        StructField("a", ArrayType(IntegerType, containsNull = true), nullable = true) :: Nil)
-    assert(df1.schema === expectedSchema1)
-    df1.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("arrayInParquet")
-
-    val df2 =
-      createDataFrame(Tuple1(Seq(2, 3)) :: Nil).toDF("a")
-    val expectedSchema2 =
-      StructType(
-        StructField("a", ArrayType(IntegerType, containsNull = false), nullable = true) :: Nil)
-    assert(df2.schema === expectedSchema2)
-    df2.write.mode(SaveMode.Append).insertInto("arrayInParquet")
-    createDataFrame(Tuple1(Seq(4, 5)) :: Nil).toDF("a").write.mode(SaveMode.Append)
-      .saveAsTable("arrayInParquet") // This one internally calls df2.insertInto.
-    createDataFrame(Tuple1(Seq(Int.box(6), null.asInstanceOf[Integer])) :: Nil).toDF("a").write
-      .mode(SaveMode.Append).saveAsTable("arrayInParquet")
-    refreshTable("arrayInParquet")
-
-    checkAnswer(
-      sql("SELECT a FROM arrayInParquet"),
-      Row(ArrayBuffer(1, null)) ::
-        Row(ArrayBuffer(2, 3)) ::
-        Row(ArrayBuffer(4, 5)) ::
-        Row(ArrayBuffer(6, null)) :: Nil)
-
-    sql("DROP TABLE arrayInParquet")
+    withTable("arrayInParquet") {
+      {
+        val df = (Tuple1(Seq(Int.box(1), null: Integer)) :: Nil).toDF("a")
+        val expectedSchema =
+          StructType(
+            StructField(
+              "a",
+              ArrayType(IntegerType, containsNull = true),
+              nullable = true) :: Nil)
+
+        assert(df.schema === expectedSchema)
+
+        df.write
+          .format("parquet")
+          .mode(SaveMode.Overwrite)
+          .saveAsTable("arrayInParquet")
+      }
+
+      {
+        val df = (Tuple1(Seq(2, 3)) :: Nil).toDF("a")
+        val expectedSchema =
+          StructType(
+            StructField(
+              "a",
+              ArrayType(IntegerType, containsNull = false),
+              nullable = true) :: Nil)
+
+        assert(df.schema === expectedSchema)
+
+        df.write
+          .format("parquet")
+          .mode(SaveMode.Append)
+          .insertInto("arrayInParquet")
+      }
+
+      (Tuple1(Seq(4, 5)) :: Nil).toDF("a")
+        .write
+        .mode(SaveMode.Append)
+        .saveAsTable("arrayInParquet") // This one internally calls df2.insertInto.
+
+      (Tuple1(Seq(Int.box(6), null: Integer)) :: Nil).toDF("a")
+        .write
+        .mode(SaveMode.Append)
+        .saveAsTable("arrayInParquet")
+
+      refreshTable("arrayInParquet")
+
+      checkAnswer(
+        sql("SELECT a FROM arrayInParquet"),
+        Row(ArrayBuffer(1, null)) ::
+          Row(ArrayBuffer(2, 3)) ::
+          Row(ArrayBuffer(4, 5)) ::
+          Row(ArrayBuffer(6, null)) :: Nil)
+    }
   }
 
   test("Pre insert nullability check (MapType)") {
-    val df1 =
-      createDataFrame(Tuple1(Map(1 -> null.asInstanceOf[Integer])) :: Nil).toDF("a")
-    val mapType1 = MapType(IntegerType, IntegerType, valueContainsNull = true)
-    val expectedSchema1 =
-      StructType(
-        StructField("a", mapType1, nullable = true) :: Nil)
-    assert(df1.schema === expectedSchema1)
-    df1.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("mapInParquet")
-
-    val df2 =
-      createDataFrame(Tuple1(Map(2 -> 3)) :: Nil).toDF("a")
-    val mapType2 = MapType(IntegerType, IntegerType, valueContainsNull = false)
-    val expectedSchema2 =
-      StructType(
-        StructField("a", mapType2, nullable = true) :: Nil)
-    assert(df2.schema === expectedSchema2)
-    df2.write.mode(SaveMode.Append).insertInto("mapInParquet")
-    createDataFrame(Tuple1(Map(4 -> 5)) :: Nil).toDF("a").write.mode(SaveMode.Append)
-      .saveAsTable("mapInParquet") // This one internally calls df2.insertInto.
-    createDataFrame(Tuple1(Map(6 -> null.asInstanceOf[Integer])) :: Nil).toDF("a").write
-      .format("parquet").mode(SaveMode.Append).saveAsTable("mapInParquet")
-    refreshTable("mapInParquet")
-
-    checkAnswer(
-      sql("SELECT a FROM mapInParquet"),
-      Row(Map(1 -> null)) ::
-        Row(Map(2 -> 3)) ::
-        Row(Map(4 -> 5)) ::
-        Row(Map(6 -> null)) :: Nil)
-
-    sql("DROP TABLE mapInParquet")
+    withTable("mapInParquet") {
+      {
+        val df = (Tuple1(Map(1 -> (null: Integer))) :: Nil).toDF("a")
+        val expectedSchema =
+          StructType(
+            StructField(
+              "a",
+              MapType(IntegerType, IntegerType, valueContainsNull = true),
+              nullable = true) :: Nil)
+
+        assert(df.schema === expectedSchema)
+
+        df.write
+          .format("parquet")
+          .mode(SaveMode.Overwrite)
+          .saveAsTable("mapInParquet")
+      }
+
+      {
+        val df = (Tuple1(Map(2 -> 3)) :: Nil).toDF("a")
+        val expectedSchema =
+          StructType(
+            StructField(
+              "a",
+              MapType(IntegerType, IntegerType, valueContainsNull = false),
+              nullable = true) :: Nil)
+
+        assert(df.schema === expectedSchema)
+
+        df.write
+          .format("parquet")
+          .mode(SaveMode.Append)
+          .insertInto("mapInParquet")
+      }
+
+      (Tuple1(Map(4 -> 5)) :: Nil).toDF("a")
+        .write
+        .format("parquet")
+        .mode(SaveMode.Append)
+        .saveAsTable("mapInParquet") // This one internally calls df2.insertInto.
+
+      (Tuple1(Map(6 -> null.asInstanceOf[Integer])) :: Nil).toDF("a")
+        .write
+        .format("parquet")
+        .mode(SaveMode.Append)
+        .saveAsTable("mapInParquet")
+
+      refreshTable("mapInParquet")
+
+      checkAnswer(
+        sql("SELECT a FROM mapInParquet"),
+        Row(Map(1 -> null)) ::
+          Row(Map(2 -> 3)) ::
+          Row(Map(4 -> 5)) ::
+          Row(Map(6 -> null)) :: Nil)
+    }
   }
 
   test("SPARK-6024 wide schema support") {
-    // We will need 80 splits for this schema if the threshold is 4000.
-    val schema = StructType((1 to 5000).map(i => StructField(s"c_${i}", StringType, true)))
-    assert(
-      schema.json.size > conf.schemaStringLengthThreshold,
-      "To correctly test the fix of SPARK-6024, the value of " +
-      s"spark.sql.sources.schemaStringLengthThreshold needs to be less than ${schema.json.size}")
-    // Manually create a metastore data source table.
-    catalog.createDataSourceTable(
-      tableName = "wide_schema",
-      userSpecifiedSchema = Some(schema),
-      partitionColumns = Array.empty[String],
-      provider = "json",
-      options = Map("path" -> "just a dummy path"),
-      isExternal = false)
-
-    invalidateTable("wide_schema")
-
-    val actualSchema = table("wide_schema").schema
-    assert(schema === actualSchema)
+    withSQLConf(SQLConf.SCHEMA_STRING_LENGTH_THRESHOLD -> "4000") {
+      withTable("wide_schema") {
+        // We will need 80 splits for this schema if the threshold is 4000.
+        val schema = StructType((1 to 5000).map(i => StructField(s"c_$i", StringType, true)))
+
+        // Manually create a metastore data source table.
+        catalog.createDataSourceTable(
+          tableName = "wide_schema",
+          userSpecifiedSchema = Some(schema),
+          partitionColumns = Array.empty[String],
+          provider = "json",
+          options = Map("path" -> "just a dummy path"),
+          isExternal = false)
+
+        invalidateTable("wide_schema")
+
+        val actualSchema = table("wide_schema").schema
+        assert(schema === actualSchema)
+      }
+    }
   }
 
   test("SPARK-6655 still support a schema stored in spark.sql.sources.schema") {
     val tableName = "spark6655"
-    val schema = StructType(StructField("int", IntegerType, true) :: Nil)
-
-    val hiveTable = HiveTable(
-      specifiedDatabase = Some("default"),
-      name = tableName,
-      schema = Seq.empty,
-      partitionColumns = Seq.empty,
-      properties = Map(
-        "spark.sql.sources.provider" -> "json",
-        "spark.sql.sources.schema" -> schema.json,
-        "EXTERNAL" -> "FALSE"),
-      tableType = ManagedTable,
-      serdeProperties = Map(
-        "path" -> catalog.hiveDefaultTableFilePath(tableName)))
-
-    catalog.client.createTable(hiveTable)
-
-    invalidateTable(tableName)
-    val actualSchema = table(tableName).schema
-    assert(schema === actualSchema)
-    sql(s"drop table $tableName")
+    withTable(tableName) {
+      val schema = StructType(StructField("int", IntegerType, true) :: Nil)
+      val hiveTable = HiveTable(
+        specifiedDatabase = Some("default"),
+        name = tableName,
+        schema = Seq.empty,
+        partitionColumns = Seq.empty,
+        properties = Map(
+          "spark.sql.sources.provider" -> "json",
+          "spark.sql.sources.schema" -> schema.json,
+          "EXTERNAL" -> "FALSE"),
+        tableType = ManagedTable,
+        serdeProperties = Map(
+          "path" -> catalog.hiveDefaultTableFilePath(tableName)))
+
+      catalog.client.createTable(hiveTable)
+
+      invalidateTable(tableName)
+      val actualSchema = table(tableName).schema
+      assert(schema === actualSchema)
+    }
   }
 
   test("Saving partition columns information") {
-    val df =
-      sparkContext.parallelize(1 to 10, 4).map { i =>
-        Tuple4(i, i + 1, s"str$i", s"str${i + 1}")
-      }.toDF("a", "b", "c", "d")
-
+    val df = (1 to 10).map(i => (i, i + 1, s"str$i", s"str${i + 1}")).toDF("a", "b", "c", "d")
     val tableName = s"partitionInfo_${System.currentTimeMillis()}"
-    df.write.format("parquet").partitionBy("d", "b").saveAsTable(tableName)
-    invalidateTable(tableName)
-    val metastoreTable = catalog.client.getTable("default", tableName)
-    val expectedPartitionColumns =
-      StructType(df.schema("d") :: df.schema("b") :: Nil)
-    val actualPartitionColumns =
-      StructType(
-        metastoreTable.partitionColumns.map(c =>
-          StructField(c.name, HiveMetastoreTypes.toDataType(c.hiveType))))
-    // Make sure partition columns are correctly stored in metastore.
-    assert(
-      expectedPartitionColumns.sameType(actualPartitionColumns),
-      s"Partitions columns stored in metastore $actualPartitionColumns is not the " +
-        s"partition columns defined by the saveAsTable operation $expectedPartitionColumns.")
-
-    // Check the content of the saved table.
-    checkAnswer(
-      table(tableName).selectExpr("c", "b", "d", "a"),
-      df.selectExpr("c", "b", "d", "a").collect())
-
-    sql(s"drop table $tableName")
+
+    withTable(tableName) {
+      df.write.format("parquet").partitionBy("d", "b").saveAsTable(tableName)
+      invalidateTable(tableName)
+      val metastoreTable = catalog.client.getTable("default", tableName)
+      val expectedPartitionColumns = StructType(df.schema("d") :: df.schema("b") :: Nil)
+      val actualPartitionColumns =
+        StructType(
+          metastoreTable.partitionColumns.map(c =>
+            StructField(c.name, HiveMetastoreTypes.toDataType(c.hiveType))))
+      // Make sure partition columns are correctly stored in metastore.
+      assert(
+        expectedPartitionColumns.sameType(actualPartitionColumns),
+        s"Partitions columns stored in metastore $actualPartitionColumns is not the " +
+          s"partition columns defined by the saveAsTable operation $expectedPartitionColumns.")
+
+      // Check the content of the saved table.
+      checkAnswer(
+        table(tableName).select("c", "b", "d", "a"),
+        df.select("c", "b", "d", "a"))
+    }
   }
 
   test("insert into a table") {
-    def createDF(from: Int, to: Int): DataFrame =
-      createDataFrame((from to to).map(i => Tuple2(i, s"str$i"))).toDF("c1", "c2")
+    def createDF(from: Int, to: Int): DataFrame = {
+      (from to to).map(i => i -> s"str$i").toDF("c1", "c2")
+    }
 
-    createDF(0, 9).write.format("parquet").saveAsTable("insertParquet")
-    checkAnswer(
-      sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"),
-      (6 to 9).map(i => Row(i, s"str$i")))
+    withTable("insertParquet") {
+      createDF(0, 9).write.format("parquet").saveAsTable("insertParquet")
+      checkAnswer(
+        sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"),
+        (6 to 9).map(i => Row(i, s"str$i")))
 
-    intercept[AnalysisException] {
-      createDF(10, 19).write.format("parquet").saveAsTable("insertParquet")
-    }
+      intercept[AnalysisException] {
+        createDF(10, 19).write.format("parquet").saveAsTable("insertParquet")
+      }
 
-    createDF(10, 19).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet")
-    checkAnswer(
-      sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"),
-      (6 to 19).map(i => Row(i, s"str$i")))
+      createDF(10, 19).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet")
+      checkAnswer(
+        sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"),
+        (6 to 19).map(i => Row(i, s"str$i")))
 
-    createDF(20, 29).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet")
-    checkAnswer(
-      sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 25"),
-      (6 to 24).map(i => Row(i, s"str$i")))
+      createDF(20, 29).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet")
+      checkAnswer(
+        sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 25"),
+        (6 to 24).map(i => Row(i, s"str$i")))
 
-    intercept[AnalysisException] {
-      createDF(30, 39).write.saveAsTable("insertParquet")
-    }
+      intercept[AnalysisException] {
+        createDF(30, 39).write.saveAsTable("insertParquet")
+      }
+
+      createDF(30, 39).write.mode(SaveMode.Append).saveAsTable("insertParquet")
+      checkAnswer(
+        sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 35"),
+        (6 to 34).map(i => Row(i, s"str$i")))
 
-    createDF(30, 39).write.mode(SaveMode.Append).saveAsTable("insertParquet")
-    checkAnswer(
-      sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 35"),
-      (6 to 34).map(i => Row(i, s"str$i")))
-
-    createDF(40, 49).write.mode(SaveMode.Append).insertInto("insertParquet")
-    checkAnswer(
-      sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 45"),
-      (6 to 44).map(i => Row(i, s"str$i")))
-
-    createDF(50, 59).write.mode(SaveMode.Overwrite).saveAsTable("insertParquet")
-    checkAnswer(
-      sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 51 AND p.c1 < 55"),
-      (52 to 54).map(i => Row(i, s"str$i")))
-    createDF(60, 69).write.mode(SaveMode.Ignore).saveAsTable("insertParquet")
-    checkAnswer(
-      sql("SELECT p.c1, c2 FROM insertParquet p"),
-      (50 to 59).map(i => Row(i, s"str$i")))
-
-    createDF(70, 79).write.mode(SaveMode.Overwrite).insertInto("insertParquet")
-    checkAnswer(
-      sql("SELECT p.c1, c2 FROM insertParquet p"),
-      (70 to 79).map(i => Row(i, s"str$i")))
+      createDF(40, 49).write.mode(SaveMode.Append).insertInto("insertParquet")
+      checkAnswer(
+        sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 45"),
+        (6 to 44).map(i => Row(i, s"str$i")))
+
+      createDF(50, 59).write.mode(SaveMode.Overwrite).saveAsTable("insertParquet")
+      checkAnswer(
+        sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 51 AND p.c1 < 55"),
+        (52 to 54).map(i => Row(i, s"str$i")))
+      createDF(60, 69).write.mode(SaveMode.Ignore).saveAsTable("insertParquet")
+      checkAnswer(
+        sql("SELECT p.c1, c2 FROM insertParquet p"),
+        (50 to 59).map(i => Row(i, s"str$i")))
+
+      createDF(70, 79).write.mode(SaveMode.Overwrite).insertInto("insertParquet")
+      checkAnswer(
+        sql("SELECT p.c1, c2 FROM insertParquet p"),
+        (70 to 79).map(i => Row(i, s"str$i")))
+    }
   }
 }

From db3fd054f240c7e38aba0732e471df65cd14011a Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Wed, 27 May 2015 14:21:00 -0700
Subject: [PATCH 205/525] [SPARK-7853] [SQL] Fixes a class loader issue in
 Spark SQL

This PR is based on PR #6396 authored by chenghao-intel. Essentially, Spark SQL should use context classloader to load SerDe classes.

yhuai helped updating the test case, and I fixed a bug in the original `CliSuite`: while testing the CLI tool with `runCliWithin`, we don't append `\n` to the last query, thus the last query is never executed.

Original PR description is pasted below.

----

```
bin/spark-sql --jars ./sql/hive/src/test/resources/hive-hcatalog-core-0.13.1.jar
CREATE TABLE t1(a string, b string) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
```

Throws exception like

```
15/05/26 00:16:33 ERROR SparkSQLDriver: Failed in [CREATE TABLE t1(a string, b string) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe']
org.apache.spark.sql.execution.QueryExecutionException: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Cannot validate serde: org.apache.hive.hcatalog.data.JsonSerDe
        at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$runHive$1.apply(ClientWrapper.scala:333)
        at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$runHive$1.apply(ClientWrapper.scala:310)
        at org.apache.spark.sql.hive.client.ClientWrapper.withHiveState(ClientWrapper.scala:139)
        at org.apache.spark.sql.hive.client.ClientWrapper.runHive(ClientWrapper.scala:310)
        at org.apache.spark.sql.hive.client.ClientWrapper.runSqlHive(ClientWrapper.scala:300)
        at org.apache.spark.sql.hive.HiveContext.runSqlHive(HiveContext.scala:457)
        at org.apache.spark.sql.hive.execution.HiveNativeCommand.run(HiveNativeCommand.scala:33)
        at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57)
        at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57)
        at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:68)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:148)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:87)
        at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:922)
        at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:922)
        at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:147)
        at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:131)
        at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:51)
        at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:727)
        at org.apache.spark.sql.hive.thriftserver.AbstractSparkSQLDriver.run(AbstractSparkSQLDriver.scala:57)
```

Author: Cheng Hao <hao.cheng@intel.com>
Author: Cheng Lian <lian@databricks.com>
Author: Yin Huai <yhuai@databricks.com>

Closes #6435 from liancheng/classLoader and squashes the following commits:

d4c4845 [Cheng Lian] Fixes CliSuite
75e80e2 [Yin Huai] Update the fix.
fd26533 [Cheng Hao] scalastyle
dd78775 [Cheng Hao] workaround for classloader of IsolatedClientLoader
---
 .../sql/hive/thriftserver/CliSuite.scala      | 41 +++++++++++++++++--
 .../apache/spark/sql/hive/HiveContext.scala   | 18 ++++++--
 .../apache/spark/sql/hive/TableReader.scala   |  2 +-
 3 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index b070fa8eaa469..cc07db827d359 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -25,11 +25,15 @@ import scala.concurrent.{Await, Promise}
 import scala.sys.process.{Process, ProcessLogger}
 
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
+import org.scalatest.{BeforeAndAfter, FunSuite}
 
 import org.apache.spark.Logging
 import org.apache.spark.util.Utils
 
+/**
+ * A test suite for the `spark-sql` CLI tool.  Note that all test cases share the same temporary
+ * Hive metastore and warehouse.
+ */
 class CliSuite extends FunSuite with BeforeAndAfter with Logging {
   val warehousePath = Utils.createTempDir()
   val metastorePath = Utils.createTempDir()
@@ -58,13 +62,13 @@ class CliSuite extends FunSuite with BeforeAndAfter with Logging {
          |  --master local
          |  --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$jdbcUrl
          |  --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath
-         |  --driver-class-path ${sys.props("java.class.path")}
        """.stripMargin.split("\\s+").toSeq ++ extraArgs
     }
 
     var next = 0
     val foundAllExpectedAnswers = Promise.apply[Unit]()
-    val queryStream = new ByteArrayInputStream(queries.mkString("\n").getBytes)
+    // Explicitly adds ENTER for each statement to make sure they are actually entered into the CLI.
+    val queryStream = new ByteArrayInputStream(queries.map(_ + "\n").mkString.getBytes)
     val buffer = new ArrayBuffer[String]()
     val lock = new Object
 
@@ -124,7 +128,7 @@ class CliSuite extends FunSuite with BeforeAndAfter with Logging {
       "SELECT COUNT(*) FROM hive_test;"
         -> "5",
       "DROP TABLE hive_test;"
-        -> "Time taken: "
+        -> "OK"
     )
   }
 
@@ -151,4 +155,33 @@ class CliSuite extends FunSuite with BeforeAndAfter with Logging {
         -> "hive_test"
     )
   }
+
+  test("Commands using SerDe provided in --jars") {
+    val jarFile =
+      "../hive/src/test/resources/hive-hcatalog-core-0.13.1.jar"
+        .split("/")
+        .mkString(File.separator)
+
+    val dataFilePath =
+      Thread.currentThread().getContextClassLoader.getResource("data/files/small_kv.txt")
+
+    runCliWithin(1.minute, Seq("--jars", s"$jarFile"))(
+      """CREATE TABLE t1(key string, val string)
+        |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
+      """.stripMargin
+        -> "OK",
+      "CREATE TABLE sourceTable (key INT, val STRING);"
+        -> "OK",
+      s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE sourceTable;"
+        -> "OK",
+      "INSERT INTO TABLE t1 SELECT key, val FROM sourceTable;"
+        -> "Time taken:",
+      "SELECT count(key) FROM t1;"
+        -> "5",
+      "DROP TABLE t1;"
+        -> "OK",
+      "DROP TABLE sourceTable;"
+        -> "OK"
+    )
+  }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index b64768ababef9..9ab98fdcce725 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.hive
 
 import java.io.{BufferedReader, File, InputStreamReader, PrintStream}
+import java.net.{URL, URLClassLoader}
 import java.sql.Timestamp
 import java.util.{ArrayList => JArrayList}
 
@@ -25,7 +26,7 @@ import org.apache.hadoop.hive.ql.parse.VariableSubstitution
 import org.apache.spark.sql.catalyst.ParserDialect
 
 import scala.collection.JavaConversions._
-import scala.collection.mutable.HashMap
+import scala.collection.mutable.{ArrayBuffer, HashMap}
 import scala.language.implicitConversions
 
 import org.apache.hadoop.fs.{FileSystem, Path}
@@ -188,8 +189,19 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
           "Specify a vaild path to the correct hive jars using $HIVE_METASTORE_JARS " +
           s"or change $HIVE_METASTORE_VERSION to $hiveExecutionVersion.")
       }
-      val jars = getClass.getClassLoader match {
-        case urlClassLoader: java.net.URLClassLoader => urlClassLoader.getURLs
+      // We recursively add all jars in the class loader chain,
+      // starting from the given urlClassLoader.
+      def addJars(urlClassLoader: URLClassLoader): Array[URL] = {
+        val jarsInParent = urlClassLoader.getParent match {
+          case parent: URLClassLoader => addJars(parent)
+          case other => Array.empty[URL]
+        }
+
+        urlClassLoader.getURLs ++ jarsInParent
+      }
+
+      val jars = Utils.getContextOrSparkClassLoader match {
+        case urlClassLoader: URLClassLoader => addJars(urlClassLoader)
         case other =>
           throw new IllegalArgumentException(
             "Unable to locate hive jars to connect to metastore " +
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 0b6f7a334a715..294fc3bd7d5e9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -79,7 +79,7 @@ class HadoopTableReader(
     makeRDDForTable(
       hiveTable,
       Class.forName(
-        relation.tableDesc.getSerdeClassName, true, Utils.getSparkClassLoader)
+        relation.tableDesc.getSerdeClassName, true, Utils.getContextOrSparkClassLoader)
         .asInstanceOf[Class[Deserializer]],
       filterOpt = None)
 

From a1e092eae57172909ff2af06d8b461742595734c Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Wed, 27 May 2015 18:51:36 -0700
Subject: [PATCH 206/525] [SPARK-7897][SQL] Use DecimalType to represent
 unsigned bigint in JDBCRDD

JIRA: https://issues.apache.org/jira/browse/SPARK-7897

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6438 from viirya/jdbc_unsigned_bigint and squashes the following commits:

ccb3c3f [Liang-Chi Hsieh] Use DecimalType to represent unsigned bigint.
---
 sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 244bd3ebfeb7e..88f1b02549e21 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -53,7 +53,7 @@ private[sql] object JDBCRDD extends Logging {
       signed: Boolean): DataType = {
     val answer = sqlType match {
       case java.sql.Types.ARRAY         => null
-      case java.sql.Types.BIGINT        => LongType
+      case java.sql.Types.BIGINT        => if (signed) { LongType } else { DecimalType.Unlimited }
       case java.sql.Types.BINARY        => BinaryType
       case java.sql.Types.BIT           => BooleanType // @see JdbcDialect for quirks
       case java.sql.Types.BLOB          => BinaryType

From 3c1f1baaf003d50786d3eee1e288f4bac69096f2 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Wed, 27 May 2015 20:04:29 -0700
Subject: [PATCH 207/525] [SPARK-7907] [SQL] [UI] Rename tab ThriftServer to
 SQL.

This PR has three changes:
1. Renaming the table of `ThriftServer` to `SQL`;
2. Renaming the title of the tab from `ThriftServer` to `JDBC/ODBC Server`; and
3. Renaming the title of the session page from `ThriftServer` to `JDBC/ODBC Session`.

https://issues.apache.org/jira/browse/SPARK-7907

Author: Yin Huai <yhuai@databricks.com>

Closes #6448 from yhuai/JDBCServer and squashes the following commits:

eadcc3d [Yin Huai] Update test.
9168005 [Yin Huai] Use SQL as the tab name.
221831e [Yin Huai] Rename ThriftServer to JDBCServer.
---
 .../spark/sql/hive/thriftserver/ui/ThriftServerPage.scala     | 4 ++--
 .../sql/hive/thriftserver/ui/ThriftServerSessionPage.scala    | 2 +-
 .../spark/sql/hive/thriftserver/ui/ThriftServerTab.scala      | 4 +++-
 .../apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala  | 4 ++--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala
index 6a2be4a58e5cb..7c48ff4b35df5 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala
@@ -47,7 +47,7 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage(""
       </h4> ++
       generateSessionStatsTable() ++
       generateSQLStatsTable()
-    UIUtils.headerSparkPage("ThriftServer", content, parent, Some(5000))
+    UIUtils.headerSparkPage("JDBC/ODBC Server", content, parent, Some(5000))
   }
 
   /** Generate basic stats of the thrift server program */
@@ -143,7 +143,7 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage(""
       val headerRow = Seq("User", "IP", "Session ID", "Start Time", "Finish Time", "Duration",
         "Total Execute")
       def generateDataRow(session: SessionInfo): Seq[Node] = {
-        val sessionLink = "%s/ThriftServer/session?id=%s"
+        val sessionLink = "%s/sql/session?id=%s"
           .format(UIUtils.prependBaseUri(parent.basePath), session.sessionId)
         <tr>
           <td> {session.userName} </td>
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala
index 33ba038ecce73..d9d66dcd8517e 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala
@@ -55,7 +55,7 @@ private[ui] class ThriftServerSessionPage(parent: ThriftServerTab)
         Total run {sessionStat._2.totalExecution} SQL
       </h4> ++
       generateSQLStatsTable(sessionStat._2.sessionId)
-    UIUtils.headerSparkPage("ThriftServer", content, parent, Some(5000))
+    UIUtils.headerSparkPage("JDBC/ODBC Session", content, parent, Some(5000))
   }
 
   /** Generate basic stats of the streaming program */
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala
index 343031f10c75c..94fd8a6bb60b9 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala
@@ -27,7 +27,9 @@ import org.apache.spark.{SparkContext, Logging, SparkException}
  * This assumes the given SparkContext has enabled its SparkUI.
  */
 private[thriftserver] class ThriftServerTab(sparkContext: SparkContext)
-  extends SparkUITab(getSparkUI(sparkContext), "ThriftServer") with Logging {
+  extends SparkUITab(getSparkUI(sparkContext), "sql") with Logging {
+
+  override val name = "SQL"
 
   val parent = getSparkUI(sparkContext)
   val listener = HiveThriftServer2.listener
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
index a286dc5825f77..e1466e0423033 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
@@ -84,11 +84,11 @@ class UISeleniumSuite
 
       eventually(timeout(10 seconds), interval(50 milliseconds)) {
         go to baseURL
-        find(cssSelector("""ul li a[href*="ThriftServer"]""")) should not be None
+        find(cssSelector("""ul li a[href*="sql"]""")) should not be None
       }
 
       eventually(timeout(10 seconds), interval(50 milliseconds)) {
-        go to (baseURL + "/ThriftServer")
+        go to (baseURL + "/sql")
         find(id("sessionstat")) should not be None
         find(id("sqlstat")) should not be None
 

From 852f4de2d3d0c5fff2fa66000a7a3088bb3dbe74 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Wed, 27 May 2015 20:19:53 -0700
Subject: [PATCH 208/525] [SPARK-7873] Allow KryoSerializerInstance to create
 multiple streams at the same time

This is a somewhat obscure bug, but I think that it will seriously impact KryoSerializer users who use custom registrators which disabled auto-reset. When auto-reset is disabled, then this breaks things in some of our shuffle paths which actually end up creating multiple OutputStreams from the same shared SerializerInstance (which is unsafe).

This was introduced by a patch (SPARK-3386) which enables serializer re-use in some of the shuffle paths, since constructing new serializer instances is actually pretty costly for KryoSerializer.  We had already fixed another corner-case (SPARK-7766) bug related to this, but missed this one.

I think that the root problem here is that KryoSerializerInstance can be used in a way which is unsafe even within a single thread, e.g. by creating multiple open OutputStreams from the same instance or by interleaving deserialize and deserializeStream calls. I considered a smaller patch which adds assertions to guard against this type of "misuse" but abandoned that approach after I realized how convoluted the Scaladoc became.

This patch fixes this bug by making it legal to create multiple streams from the same KryoSerializerInstance.  Internally, KryoSerializerInstance now implements a  `borrowKryo()` / `releaseKryo()` API that's backed by a "pool" of capacity 1. Each call to a KryoSerializerInstance method will borrow the Kryo, do its work, then release the serializer instance back to the pool. If the pool is empty and we need an instance, it will allocate a new Kryo on-demand. This makes it safe for multiple OutputStreams to be opened from the same serializer. If we try to release a Kryo back to the pool but the pool already contains a Kryo, then we'll just discard the new Kryo. I don't think there's a clear benefit to having a larger pool since our usages tend to fall into two cases, a) where we only create a single OutputStream and b) where we create a huge number of OutputStreams with the same lifecycle, then destroy the KryoSerializerInstance (this is what's happening in the bypassMergeSort code path that my regression test hits).

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6415 from JoshRosen/SPARK-7873 and squashes the following commits:

00b402e [Josh Rosen] Initialize eagerly to fix a failing test
ba55d20 [Josh Rosen] Add explanatory comments
3f1da96 [Josh Rosen] Guard against duplicate close()
ab457ca [Josh Rosen] Sketch a loan/release based solution.
9816e8f [Josh Rosen] Add a failing test showing how deserialize() and deserializeStream() can interfere.
7350886 [Josh Rosen] Add failing regression test for SPARK-7873
---
 .../spark/serializer/KryoSerializer.scala     | 129 ++++++++++++++----
 .../apache/spark/serializer/Serializer.scala  |   5 +
 .../serializer/KryoSerializerSuite.scala      |  37 ++++-
 3 files changed, 147 insertions(+), 24 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
index 217957963437d..3f909885dbd66 100644
--- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
@@ -17,8 +17,9 @@
 
 package org.apache.spark.serializer
 
-import java.io.{EOFException, InputStream, OutputStream}
+import java.io.{EOFException, IOException, InputStream, OutputStream}
 import java.nio.ByteBuffer
+import javax.annotation.Nullable
 
 import scala.reflect.ClassTag
 
@@ -136,21 +137,45 @@ class KryoSerializer(conf: SparkConf)
 }
 
 private[spark]
-class KryoSerializationStream(kryo: Kryo, outStream: OutputStream) extends SerializationStream {
-  val output = new KryoOutput(outStream)
+class KryoSerializationStream(
+    serInstance: KryoSerializerInstance,
+    outStream: OutputStream) extends SerializationStream {
+
+  private[this] var output: KryoOutput = new KryoOutput(outStream)
+  private[this] var kryo: Kryo = serInstance.borrowKryo()
 
   override def writeObject[T: ClassTag](t: T): SerializationStream = {
     kryo.writeClassAndObject(output, t)
     this
   }
 
-  override def flush() { output.flush() }
-  override def close() { output.close() }
+  override def flush() {
+    if (output == null) {
+      throw new IOException("Stream is closed")
+    }
+    output.flush()
+  }
+
+  override def close() {
+    if (output != null) {
+      try {
+        output.close()
+      } finally {
+        serInstance.releaseKryo(kryo)
+        kryo = null
+        output = null
+      }
+    }
+  }
 }
 
 private[spark]
-class KryoDeserializationStream(kryo: Kryo, inStream: InputStream) extends DeserializationStream {
-  private val input = new KryoInput(inStream)
+class KryoDeserializationStream(
+    serInstance: KryoSerializerInstance,
+    inStream: InputStream) extends DeserializationStream {
+
+  private[this] var input: KryoInput = new KryoInput(inStream)
+  private[this] var kryo: Kryo = serInstance.borrowKryo()
 
   override def readObject[T: ClassTag](): T = {
     try {
@@ -163,52 +188,105 @@ class KryoDeserializationStream(kryo: Kryo, inStream: InputStream) extends Deser
   }
 
   override def close() {
-    // Kryo's Input automatically closes the input stream it is using.
-    input.close()
+    if (input != null) {
+      try {
+        // Kryo's Input automatically closes the input stream it is using.
+        input.close()
+      } finally {
+        serInstance.releaseKryo(kryo)
+        kryo = null
+        input = null
+      }
+    }
   }
 }
 
 private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends SerializerInstance {
-  private val kryo = ks.newKryo()
 
-  // Make these lazy vals to avoid creating a buffer unless we use them
+  /**
+   * A re-used [[Kryo]] instance. Methods will borrow this instance by calling `borrowKryo()`, do
+   * their work, then release the instance by calling `releaseKryo()`. Logically, this is a caching
+   * pool of size one. SerializerInstances are not thread-safe, hence accesses to this field are
+   * not synchronized.
+   */
+  @Nullable private[this] var cachedKryo: Kryo = borrowKryo()
+
+  /**
+   * Borrows a [[Kryo]] instance. If possible, this tries to re-use a cached Kryo instance;
+   * otherwise, it allocates a new instance.
+   */
+  private[serializer] def borrowKryo(): Kryo = {
+    if (cachedKryo != null) {
+      val kryo = cachedKryo
+      // As a defensive measure, call reset() to clear any Kryo state that might have been modified
+      // by the last operation to borrow this instance (see SPARK-7766 for discussion of this issue)
+      kryo.reset()
+      cachedKryo = null
+      kryo
+    } else {
+      ks.newKryo()
+    }
+  }
+
+  /**
+   * Release a borrowed [[Kryo]] instance. If this serializer instance already has a cached Kryo
+   * instance, then the given Kryo instance is discarded; otherwise, the Kryo is stored for later
+   * re-use.
+   */
+  private[serializer] def releaseKryo(kryo: Kryo): Unit = {
+    if (cachedKryo == null) {
+      cachedKryo = kryo
+    }
+  }
+
+  // Make these lazy vals to avoid creating a buffer unless we use them.
   private lazy val output = ks.newKryoOutput()
   private lazy val input = new KryoInput()
 
   override def serialize[T: ClassTag](t: T): ByteBuffer = {
     output.clear()
-    kryo.reset() // We must reset in case this serializer instance was reused (see SPARK-7766)
+    val kryo = borrowKryo()
     try {
       kryo.writeClassAndObject(output, t)
     } catch {
       case e: KryoException if e.getMessage.startsWith("Buffer overflow") =>
         throw new SparkException(s"Kryo serialization failed: ${e.getMessage}. To avoid this, " +
           "increase spark.kryoserializer.buffer.max value.")
+    } finally {
+      releaseKryo(kryo)
     }
     ByteBuffer.wrap(output.toBytes)
   }
 
   override def deserialize[T: ClassTag](bytes: ByteBuffer): T = {
-    input.setBuffer(bytes.array)
-    kryo.readClassAndObject(input).asInstanceOf[T]
+    val kryo = borrowKryo()
+    try {
+      input.setBuffer(bytes.array)
+      kryo.readClassAndObject(input).asInstanceOf[T]
+    } finally {
+      releaseKryo(kryo)
+    }
   }
 
   override def deserialize[T: ClassTag](bytes: ByteBuffer, loader: ClassLoader): T = {
+    val kryo = borrowKryo()
     val oldClassLoader = kryo.getClassLoader
-    kryo.setClassLoader(loader)
-    input.setBuffer(bytes.array)
-    val obj = kryo.readClassAndObject(input).asInstanceOf[T]
-    kryo.setClassLoader(oldClassLoader)
-    obj
+    try {
+      kryo.setClassLoader(loader)
+      input.setBuffer(bytes.array)
+      kryo.readClassAndObject(input).asInstanceOf[T]
+    } finally {
+      kryo.setClassLoader(oldClassLoader)
+      releaseKryo(kryo)
+    }
   }
 
   override def serializeStream(s: OutputStream): SerializationStream = {
-    kryo.reset() // We must reset in case this serializer instance was reused (see SPARK-7766)
-    new KryoSerializationStream(kryo, s)
+    new KryoSerializationStream(this, s)
   }
 
   override def deserializeStream(s: InputStream): DeserializationStream = {
-    new KryoDeserializationStream(kryo, s)
+    new KryoDeserializationStream(this, s)
   }
 
   /**
@@ -218,7 +296,12 @@ private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends Serializ
   def getAutoReset(): Boolean = {
     val field = classOf[Kryo].getDeclaredField("autoReset")
     field.setAccessible(true)
-    field.get(kryo).asInstanceOf[Boolean]
+    val kryo = borrowKryo()
+    try {
+      field.get(kryo).asInstanceOf[Boolean]
+    } finally {
+      releaseKryo(kryo)
+    }
   }
 }
 
diff --git a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
index 6078c9d433ebf..f1bdff96d3df1 100644
--- a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
@@ -19,6 +19,7 @@ package org.apache.spark.serializer
 
 import java.io._
 import java.nio.ByteBuffer
+import javax.annotation.concurrent.NotThreadSafe
 
 import scala.reflect.ClassTag
 
@@ -114,8 +115,12 @@ object Serializer {
 /**
  * :: DeveloperApi ::
  * An instance of a serializer, for use by one thread at a time.
+ *
+ * It is legal to create multiple serialization / deserialization streams from the same
+ * SerializerInstance as long as those streams are all used within the same thread.
  */
 @DeveloperApi
+@NotThreadSafe
 abstract class SerializerInstance {
   def serialize[T: ClassTag](t: T): ByteBuffer
 
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
index 8c384bd358ebc..ef50bc9438f95 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.serializer
 
-import java.io.ByteArrayOutputStream
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
 
 import scala.collection.mutable
 import scala.reflect.ClassTag
@@ -361,6 +361,41 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext {
   }
 }
 
+class KryoSerializerAutoResetDisabledSuite extends FunSuite with SharedSparkContext {
+  conf.set("spark.serializer", classOf[KryoSerializer].getName)
+  conf.set("spark.kryo.registrator", classOf[RegistratorWithoutAutoReset].getName)
+  conf.set("spark.kryo.referenceTracking", "true")
+  conf.set("spark.shuffle.manager", "sort")
+  conf.set("spark.shuffle.sort.bypassMergeThreshold", "200")
+
+  test("sort-shuffle with bypassMergeSort (SPARK-7873)") {
+    val myObject = ("Hello", "World")
+    assert(sc.parallelize(Seq.fill(100)(myObject)).repartition(2).collect().toSet === Set(myObject))
+  }
+
+  test("calling deserialize() after deserializeStream()") {
+    val serInstance = new KryoSerializer(conf).newInstance().asInstanceOf[KryoSerializerInstance]
+    assert(!serInstance.getAutoReset())
+    val hello = "Hello"
+    val world = "World"
+    // Here, we serialize the same value twice, so the reference-tracking should cause us to store
+    // references to some of these values
+    val helloHello = serInstance.serialize((hello, hello))
+    // Here's a stream which only contains one value
+    val worldWorld: Array[Byte] = {
+      val baos = new ByteArrayOutputStream()
+      val serStream = serInstance.serializeStream(baos)
+      serStream.writeObject(world)
+      serStream.writeObject(world)
+      serStream.close()
+      baos.toByteArray
+    }
+    val deserializationStream = serInstance.deserializeStream(new ByteArrayInputStream(worldWorld))
+    assert(deserializationStream.readValue[Any]() === world)
+    deserializationStream.close()
+    assert(serInstance.deserialize[Any](helloHello) === (hello, hello))
+  }
+}
 
 class ClassLoaderTestingObject
 

From bd11b01ebaf62df8b0d8c0b63b51b66e58f50960 Mon Sep 17 00:00:00 2001
From: Sandy Ryza <sandy@cloudera.com>
Date: Wed, 27 May 2015 22:23:22 -0700
Subject: [PATCH 209/525] [SPARK-7896] Allow ChainedBuffer to store more than 2
 GB

Author: Sandy Ryza <sandy@cloudera.com>

Closes #6440 from sryza/sandy-spark-7896 and squashes the following commits:

49d8a0d [Sandy Ryza] Fix bug introduced when reading over record boundaries
6006856 [Sandy Ryza] Fix overflow issues
006b4b2 [Sandy Ryza] Fix scalastyle by removing non ascii characters
8b000ca [Sandy Ryza] Add ascii art to describe layout of data in metaBuffer
f2053c0 [Sandy Ryza] Fix negative overflow issue
0368c78 [Sandy Ryza] Initialize size as 0
a5a4820 [Sandy Ryza] Use explicit types for all numbers in ChainedBuffer
b7e0213 [Sandy Ryza] SPARK-7896. Allow ChainedBuffer to store more than 2 GB
---
 .../spark/util/collection/ChainedBuffer.scala | 46 +++++++++--------
 .../PartitionedSerializedPairBuffer.scala     | 51 +++++++++++--------
 2 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/collection/ChainedBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/ChainedBuffer.scala
index a60bffe611f14..516aaa44d03fc 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ChainedBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ChainedBuffer.scala
@@ -28,11 +28,13 @@ import scala.collection.mutable.ArrayBuffer
  * occupy a contiguous segment of memory.
  */
 private[spark] class ChainedBuffer(chunkSize: Int) {
-  private val chunkSizeLog2 = (math.log(chunkSize) / math.log(2)).toInt
-  assert(math.pow(2, chunkSizeLog2).toInt == chunkSize,
+
+  private val chunkSizeLog2: Int = java.lang.Long.numberOfTrailingZeros(
+    java.lang.Long.highestOneBit(chunkSize))
+  assert((1 << chunkSizeLog2) == chunkSize,
     s"ChainedBuffer chunk size $chunkSize must be a power of two")
   private val chunks: ArrayBuffer[Array[Byte]] = new ArrayBuffer[Array[Byte]]()
-  private var _size: Int = _
+  private var _size: Long = 0
 
   /**
    * Feed bytes from this buffer into a BlockObjectWriter.
@@ -41,16 +43,16 @@ private[spark] class ChainedBuffer(chunkSize: Int) {
    * @param os OutputStream to read into.
    * @param len Number of bytes to read.
    */
-  def read(pos: Int, os: OutputStream, len: Int): Unit = {
+  def read(pos: Long, os: OutputStream, len: Int): Unit = {
     if (pos + len > _size) {
       throw new IndexOutOfBoundsException(
         s"Read of $len bytes at position $pos would go past size ${_size} of buffer")
     }
-    var chunkIndex = pos >> chunkSizeLog2
-    var posInChunk = pos - (chunkIndex << chunkSizeLog2)
-    var written = 0
+    var chunkIndex: Int = (pos >> chunkSizeLog2).toInt
+    var posInChunk: Int = (pos - (chunkIndex.toLong << chunkSizeLog2)).toInt
+    var written: Int = 0
     while (written < len) {
-      val toRead = math.min(len - written, chunkSize - posInChunk)
+      val toRead: Int = math.min(len - written, chunkSize - posInChunk)
       os.write(chunks(chunkIndex), posInChunk, toRead)
       written += toRead
       chunkIndex += 1
@@ -66,16 +68,16 @@ private[spark] class ChainedBuffer(chunkSize: Int) {
    * @param offs Offset in the byte array to read to.
    * @param len Number of bytes to read.
    */
-  def read(pos: Int, bytes: Array[Byte], offs: Int, len: Int): Unit = {
+  def read(pos: Long, bytes: Array[Byte], offs: Int, len: Int): Unit = {
     if (pos + len > _size) {
       throw new IndexOutOfBoundsException(
         s"Read of $len bytes at position $pos would go past size of buffer")
     }
-    var chunkIndex = pos >> chunkSizeLog2
-    var posInChunk = pos - (chunkIndex << chunkSizeLog2)
-    var written = 0
+    var chunkIndex: Int = (pos >> chunkSizeLog2).toInt
+    var posInChunk: Int = (pos - (chunkIndex.toLong << chunkSizeLog2)).toInt
+    var written: Int = 0
     while (written < len) {
-      val toRead = math.min(len - written, chunkSize - posInChunk)
+      val toRead: Int = math.min(len - written, chunkSize - posInChunk)
       System.arraycopy(chunks(chunkIndex), posInChunk, bytes, offs + written, toRead)
       written += toRead
       chunkIndex += 1
@@ -91,22 +93,22 @@ private[spark] class ChainedBuffer(chunkSize: Int) {
    * @param offs Offset in the byte array to write from.
    * @param len Number of bytes to write.
    */
-  def write(pos: Int, bytes: Array[Byte], offs: Int, len: Int): Unit = {
+  def write(pos: Long, bytes: Array[Byte], offs: Int, len: Int): Unit = {
     if (pos > _size) {
       throw new IndexOutOfBoundsException(
         s"Write at position $pos starts after end of buffer ${_size}")
     }
     // Grow if needed
-    val endChunkIndex = (pos + len - 1) >> chunkSizeLog2
+    val endChunkIndex: Int = ((pos + len - 1) >> chunkSizeLog2).toInt
     while (endChunkIndex >= chunks.length) {
       chunks += new Array[Byte](chunkSize)
     }
 
-    var chunkIndex = pos >> chunkSizeLog2
-    var posInChunk = pos - (chunkIndex << chunkSizeLog2)
-    var written = 0
+    var chunkIndex: Int = (pos >> chunkSizeLog2).toInt
+    var posInChunk: Int = (pos - (chunkIndex.toLong << chunkSizeLog2)).toInt
+    var written: Int = 0
     while (written < len) {
-      val toWrite = math.min(len - written, chunkSize - posInChunk)
+      val toWrite: Int = math.min(len - written, chunkSize - posInChunk)
       System.arraycopy(bytes, offs + written, chunks(chunkIndex), posInChunk, toWrite)
       written += toWrite
       chunkIndex += 1
@@ -119,19 +121,19 @@ private[spark] class ChainedBuffer(chunkSize: Int) {
   /**
    * Total size of buffer that can be written to without allocating additional memory.
    */
-  def capacity: Int = chunks.size * chunkSize
+  def capacity: Long = chunks.size.toLong * chunkSize
 
   /**
    * Size of the logical buffer.
    */
-  def size: Int = _size
+  def size: Long = _size
 }
 
 /**
  * Output stream that writes to a ChainedBuffer.
  */
 private[spark] class ChainedBufferOutputStream(chainedBuffer: ChainedBuffer) extends OutputStream {
-  private var pos = 0
+  private var pos: Long = 0
 
   override def write(b: Int): Unit = {
     throw new UnsupportedOperationException()
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
index ac9ea6393628f..554d88206e221 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
@@ -41,6 +41,13 @@ import org.apache.spark.util.collection.PartitionedSerializedPairBuffer._
  *
  * Currently, only sorting by partition is supported.
  *
+ * Each record is laid out inside the the metaBuffer as follows. keyStart, a long, is split across
+ * two integers:
+ *
+ *   +-------------+------------+------------+-------------+
+ *   |         keyStart         | keyValLen  | partitionId |
+ *   +-------------+------------+------------+-------------+
+ *
  * @param metaInitialRecords The initial number of entries in the metadata buffer.
  * @param kvBlockSize The size of each byte buffer in the ChainedBuffer used to store the records.
  * @param serializerInstance the serializer used for serializing inserted records.
@@ -68,19 +75,15 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
     }
 
     val keyStart = kvBuffer.size
-    if (keyStart < 0) {
-      throw new Exception(s"Can't grow buffer beyond ${1 << 31} bytes")
-    }
     kvSerializationStream.writeKey[Any](key)
-    kvSerializationStream.flush()
-    val valueStart = kvBuffer.size
     kvSerializationStream.writeValue[Any](value)
     kvSerializationStream.flush()
-    val valueEnd = kvBuffer.size
+    val keyValLen = (kvBuffer.size - keyStart).toInt
 
-    metaBuffer.put(keyStart)
-    metaBuffer.put(valueStart)
-    metaBuffer.put(valueEnd)
+    // keyStart, a long, gets split across two ints
+    metaBuffer.put(keyStart.toInt)
+    metaBuffer.put((keyStart >> 32).toInt)
+    metaBuffer.put(keyValLen)
     metaBuffer.put(partition)
   }
 
@@ -114,7 +117,7 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
     }
   }
 
-  override def estimateSize: Long = metaBuffer.capacity * 4 + kvBuffer.capacity
+  override def estimateSize: Long = metaBuffer.capacity * 4L + kvBuffer.capacity
 
   override def destructiveSortedWritablePartitionedIterator(keyComparator: Option[Comparator[K]])
     : WritablePartitionedIterator = {
@@ -128,10 +131,10 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
       var pos = 0
 
       def writeNext(writer: BlockObjectWriter): Unit = {
-        val keyStart = metaBuffer.get(pos + KEY_START)
-        val valueEnd = metaBuffer.get(pos + VAL_END)
+        val keyStart = getKeyStartPos(metaBuffer, pos)
+        val keyValLen = metaBuffer.get(pos + KEY_VAL_LEN)
         pos += RECORD_SIZE
-        kvBuffer.read(keyStart, writer, valueEnd - keyStart)
+        kvBuffer.read(keyStart, writer, keyValLen)
         writer.recordWritten()
       }
       def nextPartition(): Int = metaBuffer.get(pos + PARTITION)
@@ -163,9 +166,11 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
 private[spark] class OrderedInputStream(metaBuffer: IntBuffer, kvBuffer: ChainedBuffer)
     extends InputStream {
 
+  import PartitionedSerializedPairBuffer._
+
   private var metaBufferPos = 0
   private var kvBufferPos =
-    if (metaBuffer.position > 0) metaBuffer.get(metaBufferPos + KEY_START) else 0
+    if (metaBuffer.position > 0) getKeyStartPos(metaBuffer, metaBufferPos) else 0
 
   override def read(bytes: Array[Byte]): Int = read(bytes, 0, bytes.length)
 
@@ -173,13 +178,14 @@ private[spark] class OrderedInputStream(metaBuffer: IntBuffer, kvBuffer: Chained
     if (metaBufferPos >= metaBuffer.position) {
       return -1
     }
-    val bytesRemainingInRecord = metaBuffer.get(metaBufferPos + VAL_END) - kvBufferPos
+    val bytesRemainingInRecord = (metaBuffer.get(metaBufferPos + KEY_VAL_LEN) -
+      (kvBufferPos - getKeyStartPos(metaBuffer, metaBufferPos))).toInt
     val toRead = math.min(bytesRemainingInRecord, len)
     kvBuffer.read(kvBufferPos, bytes, offs, toRead)
     if (toRead == bytesRemainingInRecord) {
       metaBufferPos += RECORD_SIZE
       if (metaBufferPos < metaBuffer.position) {
-        kvBufferPos = metaBuffer.get(metaBufferPos + KEY_START)
+        kvBufferPos = getKeyStartPos(metaBuffer, metaBufferPos)
       }
     } else {
       kvBufferPos += toRead
@@ -246,9 +252,14 @@ private[spark] class SerializedSortDataFormat extends SortDataFormat[Int, IntBuf
 }
 
 private[spark] object PartitionedSerializedPairBuffer {
-  val KEY_START = 0
-  val VAL_START = 1
-  val VAL_END = 2
+  val KEY_START = 0 // keyStart, a long, gets split across two ints
+  val KEY_VAL_LEN = 2
   val PARTITION = 3
-  val RECORD_SIZE = Seq(KEY_START, VAL_START, VAL_END, PARTITION).size // num ints of metadata
+  val RECORD_SIZE = PARTITION + 1 // num ints of metadata
+
+  def getKeyStartPos(metaBuffer: IntBuffer, metaBufferPos: Int): Long = {
+    val lower32 = metaBuffer.get(metaBufferPos + KEY_START)
+    val upper32 = metaBuffer.get(metaBufferPos + KEY_START + 1)
+    (upper32.toLong << 32) | (lower32 & 0xFFFFFFFFL)
+  }
 }

From 35410614deb7feea1c9d5cca00a6fa7970404f21 Mon Sep 17 00:00:00 2001
From: Matt Wise <mwise@quixey.com>
Date: Wed, 27 May 2015 22:39:19 -0700
Subject: [PATCH 210/525] [DOCS] Fix typo in documentation for Java UDF
 registration

This contribution is my original work and I license the work to the project under the project's open source license

Author: Matt Wise <mwise@quixey.com>

Closes #6447 from wisematthew/fix-typo-in-java-udf-registration-doc and squashes the following commits:

e7ef5f7 [Matt Wise] Fix typo in documentation for Java UDF registration
---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 5b41c0ee6e430..ab646f65bb5eb 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1939,7 +1939,7 @@ sqlContext.udf.register("strLen", (s: String) => s.length())
 <div data-lang="java"  markdown="1">
 {% highlight java %}
 
-sqlContext.udf().register("strLen", (String s) -> { s.length(); });
+sqlContext.udf().register("strLen", (String s) -> s.length(), DataTypes.IntegerType);
 
 {% endhighlight %}
 </div>

From e838a25bdb5603ef05e779225704c972ce436145 Mon Sep 17 00:00:00 2001
From: zuxqoj <sbshekhar@gmail.com>
Date: Wed, 27 May 2015 23:13:13 -0700
Subject: [PATCH 211/525] [SPARK-7782] fixed sort arrow issue

Current behaviour::
In spark UI
![screen shot 2015-05-27 at 3 27 51 pm](https://cloud.githubusercontent.com/assets/3919211/7837541/47d330ba-04a5-11e5-89d1-e5b11da1a513.png)

In YARN
![screen shot 2015-05-27 at 3](https://cloud.githubusercontent.com/assets/3919211/7837594/aebd1d36-04a5-11e5-8216-86e03c07d2bd.png)

In jira
![screen shot 2015-05-27 at 3_2](https://cloud.githubusercontent.com/assets/3919211/7837616/d3fedce2-04a5-11e5-9e68-960ed54e5d83.png)

Author: zuxqoj <sbshekhar@gmail.com>

Closes #6437 from zuxqoj/SPARK-7782_PR and squashes the following commits:

cd068b9 [zuxqoj] [SPARK-7782] fixed sort arrow issue
---
 .../main/resources/org/apache/spark/ui/static/sorttable.js  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/sorttable.js b/core/src/main/resources/org/apache/spark/ui/static/sorttable.js
index dbacbf19beee5..dde6069000bc4 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/sorttable.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/sorttable.js
@@ -100,7 +100,7 @@ sorttable = {
             this.removeChild(document.getElementById('sorttable_sortfwdind'));
             sortrevind = document.createElement('span');
             sortrevind.id = "sorttable_sortrevind";
-            sortrevind.innerHTML = stIsIE ? '&nbsp<font face="webdings">5</font>' : '&nbsp;&#x25B4;';
+            sortrevind.innerHTML = stIsIE ? '&nbsp<font face="webdings">5</font>' : '&nbsp;&#x25BE;';
             this.appendChild(sortrevind);
             return;
           }
@@ -113,7 +113,7 @@ sorttable = {
             this.removeChild(document.getElementById('sorttable_sortrevind'));
             sortfwdind = document.createElement('span');
             sortfwdind.id = "sorttable_sortfwdind";
-            sortfwdind.innerHTML = stIsIE ? '&nbsp<font face="webdings">6</font>' : '&nbsp;&#x25BE;';
+            sortfwdind.innerHTML = stIsIE ? '&nbsp<font face="webdings">6</font>' : '&nbsp;&#x25B4;';
             this.appendChild(sortfwdind);
             return;
           }
@@ -134,7 +134,7 @@ sorttable = {
           this.className += ' sorttable_sorted';
           sortfwdind = document.createElement('span');
           sortfwdind.id = "sorttable_sortfwdind";
-          sortfwdind.innerHTML = stIsIE ? '&nbsp<font face="webdings">6</font>' : '&nbsp;&#x25BE;';
+          sortfwdind.innerHTML = stIsIE ? '&nbsp<font face="webdings">6</font>' : '&nbsp;&#x25B4;';
           this.appendChild(sortfwdind);
 
           // build an array to sort. This is a Schwartzian transform thing,

From 000df2f0d6af068bb188e81bbb207f0c2f43bf16 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Thu, 28 May 2015 09:04:12 -0700
Subject: [PATCH 212/525] [SPARK-7895] [STREAMING] [EXAMPLES] Move Kafka
 examples from scala-2.10/src to src

Since `spark-streaming-kafka` now is published for both Scala 2.10 and 2.11, we can move `KafkaWordCount` and `DirectKafkaWordCount` from `examples/scala-2.10/src/` to `examples/src/` so that they will appear in `spark-examples-***-jar` for Scala 2.11.

Author: zsxwing <zsxwing@gmail.com>

Closes #6436 from zsxwing/SPARK-7895 and squashes the following commits:

c6052f1 [zsxwing] Update examples/pom.xml
0bcfa87 [zsxwing] Fix the sleep time
b9d1256 [zsxwing] Move Kafka examples from scala-2.10/src to src
---
 examples/pom.xml                              | 44 +++----------------
 .../streaming/JavaDirectKafkaWordCount.java   |  0
 .../streaming/JavaKafkaWordCount.java         |  0
 .../streaming/DirectKafkaWordCount.scala      |  0
 .../examples/streaming/KafkaWordCount.scala   |  2 +-
 5 files changed, 6 insertions(+), 40 deletions(-)
 rename examples/{scala-2.10 => }/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java (100%)
 rename examples/{scala-2.10 => }/src/main/java/org/apache/spark/examples/streaming/JavaKafkaWordCount.java (100%)
 rename examples/{scala-2.10 => }/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala (100%)
 rename examples/{scala-2.10 => }/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala (99%)

diff --git a/examples/pom.xml b/examples/pom.xml
index 5b04b4f8d6ca0..e4efee7b5e647 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -97,6 +97,11 @@
         </exclusion>
       </exclusions>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.hbase</groupId>
       <artifactId>hbase-testing-util</artifactId>
@@ -392,45 +397,6 @@
         </dependency>
       </dependencies>
     </profile>
-    <profile>
-      <!-- We add a source directory specific to Scala 2.10 since Kafka
-           only works with it -->
-      <id>scala-2.10</id>
-      <activation>
-        <property><name>!scala-2.11</name></property>
-      </activation>
-      <dependencies>
-        <dependency>
-          <groupId>org.apache.spark</groupId>
-          <artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
-          <version>${project.version}</version>
-        </dependency>
-      </dependencies>
-      <build>
-        <plugins>
-          <plugin>
-            <groupId>org.codehaus.mojo</groupId>
-            <artifactId>build-helper-maven-plugin</artifactId>
-            <executions>
-              <execution>
-                <id>add-scala-sources</id>
-                <phase>generate-sources</phase>
-                <goals>
-                  <goal>add-source</goal>
-                </goals>
-                <configuration>
-                  <sources>
-                    <source>src/main/scala</source>
-                    <source>scala-2.10/src/main/scala</source>
-                    <source>scala-2.10/src/main/java</source>
-                  </sources>
-                </configuration>
-              </execution>
-            </executions>
-          </plugin>
-        </plugins>
-      </build>
-    </profile>
 
     <!-- Profiles that disable inclusion of certain dependencies. -->
     <profile>
diff --git a/examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java
similarity index 100%
rename from examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java
rename to examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java
diff --git a/examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaKafkaWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaKafkaWordCount.java
similarity index 100%
rename from examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaKafkaWordCount.java
rename to examples/src/main/java/org/apache/spark/examples/streaming/JavaKafkaWordCount.java
diff --git a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
similarity index 100%
rename from examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
rename to examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
diff --git a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
similarity index 99%
rename from examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
rename to examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
index f407367a54f6c..9ae1b045c2c76 100644
--- a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
@@ -96,7 +96,7 @@ object KafkaWordCountProducer {
         producer.send(message)
       }
 
-      Thread.sleep(100)
+      Thread.sleep(1000)
     }
   }
 

From 530efe3e80c62b25c869b85167e00330eb1ddea6 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 28 May 2015 12:03:46 -0700
Subject: [PATCH 213/525] [SPARK-7911] [MLLIB] A workaround for VectorUDT
 serialize (or deserialize) being called multiple times

~~A PythonUDT shouldn't be serialized into external Scala types in PythonRDD. I'm not sure whether this should fix one of the bugs related to SQL UDT/UDF in PySpark.~~

The fix above didn't work. So I added a workaround for this. If a Python UDF is applied to a Python UDT. This will put the Python SQL types as inputs. Still incorrect, but at least it doesn't throw exceptions on the Scala side. davies harsha2010

Author: Xiangrui Meng <meng@databricks.com>

Closes #6442 from mengxr/SPARK-7903 and squashes the following commits:

c257d2a [Xiangrui Meng] add a workaround for VectorUDT
---
 .../apache/spark/mllib/linalg/Vectors.scala   | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index f6bcdf83cd337..2ffa497a99d93 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -176,27 +176,31 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
   }
 
   override def serialize(obj: Any): Row = {
-    val row = new GenericMutableRow(4)
     obj match {
       case SparseVector(size, indices, values) =>
+        val row = new GenericMutableRow(4)
         row.setByte(0, 0)
         row.setInt(1, size)
         row.update(2, indices.toSeq)
         row.update(3, values.toSeq)
+        row
       case DenseVector(values) =>
+        val row = new GenericMutableRow(4)
         row.setByte(0, 1)
         row.setNullAt(1)
         row.setNullAt(2)
         row.update(3, values.toSeq)
+        row
+      // TODO: There are bugs in UDT serialization because we don't have a clear separation between
+      // TODO: internal SQL types and language specific types (including UDT). UDT serialize and
+      // TODO: deserialize may get called twice. See SPARK-7186.
+      case row: Row =>
+        row
     }
-    row
   }
 
   override def deserialize(datum: Any): Vector = {
     datum match {
-      // TODO: something wrong with UDT serialization
-      case v: Vector =>
-        v
       case row: Row =>
         require(row.length == 4,
           s"VectorUDT.deserialize given row with length ${row.length} but requires length == 4")
@@ -211,6 +215,11 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
             val values = row.getAs[Iterable[Double]](3).toArray
             new DenseVector(values)
         }
+      // TODO: There are bugs in UDT serialization because we don't have a clear separation between
+      // TODO: internal SQL types and language specific types (including UDT). UDT serialize and
+      // TODO: deserialize may get called twice. See SPARK-7186.
+      case v: Vector =>
+        v
     }
   }
 

From c771589c96403b2a518fb77d5162eca8f495f37b Mon Sep 17 00:00:00 2001
From: Li Yao <hnkfliyao@gmail.com>
Date: Thu, 28 May 2015 13:39:39 -0700
Subject: [PATCH 214/525] [MINOR] Fix the a minor bug in PageRank Example.

Fix the bug that entering only 1 arg will cause array out of bounds exception in PageRank example.

Author: Li Yao <hnkfliyao@gmail.com>

Closes #6455 from lastland/patch-1 and squashes the following commits:

de06128 [Li Yao] Fix the bug that entering only 1 arg will cause array out of bounds exception.
---
 .../main/scala/org/apache/spark/examples/SparkPageRank.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
index 8d092b6506d33..bd7894f184c4c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
@@ -51,7 +51,7 @@ object SparkPageRank {
     showWarning()
 
     val sparkConf = new SparkConf().setAppName("PageRank")
-    val iters = if (args.length > 0) args(1).toInt else 10
+    val iters = if (args.length > 1) args(1).toInt else 10
     val ctx = new SparkContext(sparkConf)
     val lines = ctx.textFile(args(0), 1)
     val links = lines.map{ s =>

From 3e312a5ed0154527c66eeeee0d2cc3bfce0a820e Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Thu, 28 May 2015 17:15:10 -0400
Subject: [PATCH 215/525] [DOCS] Fixing broken "IDE setup" link in the Building
 Spark documentation.

The location of the IDE setup information has changed, so this just updates the link on the Building Spark page.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6467 from dusenberrymw/Fix_Broken_Link_On_Building_Spark_Doc and squashes the following commits:

75c533a [Mike Dusenberry] Fixing broken "IDE setup" link in the Building Spark documentation by pointing to new location.
---
 docs/building-spark.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/building-spark.md b/docs/building-spark.md
index 3ca7f2746e678..b2649d1ee2a53 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -176,7 +176,7 @@ Thus, the full flow for running continuous-compilation of the `core` submodule m
 # Building Spark with IntelliJ IDEA or Eclipse
 
 For help in setting up IntelliJ IDEA or Eclipse for Spark development, and troubleshooting, refer to the
-[wiki page for IDE setup](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-IDESetup).
+[wiki page for IDE setup](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-IDESetup).
 
 # Running Java 8 Test Suites
 

From 7859ab659eecbcf2d8b9a274a4e9e4f5186a528c Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 28 May 2015 16:32:51 -0700
Subject: [PATCH 216/525] [SPARK-7198] [MLLIB] VectorAssembler should output ML
 attributes

`VectorAssembler` should carry over ML attributes. For unknown attributes, we assume numeric values. This PR handles the following cases:

1. DoubleType with ML attribute: carry over
2. DoubleType without ML attribute: numeric value
3. Scalar type: numeric value
4. VectorType with all ML attributes: carry over and update names
5. VectorType with number of ML attributes: assume all numeric
6. VectorType without ML attributes: check the first row and get the number of attributes

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6452 from mengxr/SPARK-7198 and squashes the following commits:

a9d2469 [Xiangrui Meng] add space
facdb1f [Xiangrui Meng] VectorAssembler should output ML attributes
---
 .../spark/ml/feature/VectorAssembler.scala    | 51 +++++++++++++++++--
 .../ml/feature/VectorAssemblerSuite.scala     | 37 ++++++++++++++
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 514ffb03c0509..229ee27ec5942 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -22,6 +22,7 @@ import scala.collection.mutable.ArrayBuilder
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute, UnresolvedAttribute}
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
@@ -37,7 +38,7 @@ import org.apache.spark.sql.types._
 class VectorAssembler(override val uid: String)
   extends Transformer with HasInputCols with HasOutputCol {
 
-  def this() = this(Identifiable.randomUID("va"))
+  def this() = this(Identifiable.randomUID("vecAssembler"))
 
   /** @group setParam */
   def setInputCols(value: Array[String]): this.type = set(inputCols, value)
@@ -46,19 +47,59 @@ class VectorAssembler(override val uid: String)
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
   override def transform(dataset: DataFrame): DataFrame = {
+    // Schema transformation.
+    val schema = dataset.schema
+    lazy val first = dataset.first()
+    val attrs = $(inputCols).flatMap { c =>
+      val field = schema(c)
+      val index = schema.fieldIndex(c)
+      field.dataType match {
+        case DoubleType =>
+          val attr = Attribute.fromStructField(field)
+          // If the input column doesn't have ML attribute, assume numeric.
+          if (attr == UnresolvedAttribute) {
+            Some(NumericAttribute.defaultAttr.withName(c))
+          } else {
+            Some(attr.withName(c))
+          }
+        case _: NumericType | BooleanType =>
+          // If the input column type is a compatible scalar type, assume numeric.
+          Some(NumericAttribute.defaultAttr.withName(c))
+        case _: VectorUDT =>
+          val group = AttributeGroup.fromStructField(field)
+          if (group.attributes.isDefined) {
+            // If attributes are defined, copy them with updated names.
+            group.attributes.get.map { attr =>
+              if (attr.name.isDefined) {
+                // TODO: Define a rigorous naming scheme.
+                attr.withName(c + "_" + attr.name.get)
+              } else {
+                attr
+              }
+            }
+          } else {
+            // Otherwise, treat all attributes as numeric. If we cannot get the number of attributes
+            // from metadata, check the first row.
+            val numAttrs = group.numAttributes.getOrElse(first.getAs[Vector](index).size)
+            Array.fill(numAttrs)(NumericAttribute.defaultAttr)
+          }
+      }
+    }
+    val metadata = new AttributeGroup($(outputCol), attrs).toMetadata()
+
+    // Data transformation.
     val assembleFunc = udf { r: Row =>
       VectorAssembler.assemble(r.toSeq: _*)
     }
-    val schema = dataset.schema
-    val inputColNames = $(inputCols)
-    val args = inputColNames.map { c =>
+    val args = $(inputCols).map { c =>
       schema(c).dataType match {
         case DoubleType => dataset(c)
         case _: VectorUDT => dataset(c)
         case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid")
       }
     }
-    dataset.select(col("*"), assembleFunc(struct(args : _*)).as($(outputCol)))
+
+    dataset.select(col("*"), assembleFunc(struct(args : _*)).as($(outputCol), metadata))
   }
 
   override def transformSchema(schema: StructType): StructType = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
index d0cd62c5e4864..43534e89928b1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
@@ -20,9 +20,11 @@ package org.apache.spark.ml.feature
 import org.scalatest.FunSuite
 
 import org.apache.spark.SparkException
+import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.Row
+import org.apache.spark.sql.functions.col
 
 class VectorAssemblerSuite extends FunSuite with MLlibTestSparkContext {
 
@@ -61,4 +63,39 @@ class VectorAssemblerSuite extends FunSuite with MLlibTestSparkContext {
         assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0)))
     }
   }
+
+  test("ML attributes") {
+    val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari")
+    val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0)
+    val user = new AttributeGroup("user", Array(
+      NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"),
+      NumericAttribute.defaultAttr.withName("salary")))
+    val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0)))
+    val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad")
+      .select(
+        col("browser").as("browser", browser.toMetadata()),
+        col("hour").as("hour", hour.toMetadata()),
+        col("count"), // "count" is an integer column without ML attribute
+        col("user").as("user", user.toMetadata()),
+        col("ad")) // "ad" is a vector column without ML attribute
+    val assembler = new VectorAssembler()
+      .setInputCols(Array("browser", "hour", "count", "user", "ad"))
+      .setOutputCol("features")
+    val output = assembler.transform(df)
+    val schema = output.schema
+    val features = AttributeGroup.fromStructField(schema("features"))
+    assert(features.size === 7)
+    val browserOut = features.getAttr(0)
+    assert(browserOut === browser.withIndex(0).withName("browser"))
+    val hourOut = features.getAttr(1)
+    assert(hourOut === hour.withIndex(1).withName("hour"))
+    val countOut = features.getAttr(2)
+    assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2))
+    val userGenderOut = features.getAttr(3)
+    assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3))
+    val userSalaryOut = features.getAttr(4)
+    assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4))
+    assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5))
+    assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6))
+  }
 }

From 0077af22ca5fcb2e50dcf7daa4f6804ae722bfbe Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 16:56:59 -0700
Subject: [PATCH 217/525] Remove SizeEstimator from o.a.spark package.

See comments on https://github.com/apache/spark/pull/3913

Author: Reynold Xin <rxin@databricks.com>

Closes #6471 from rxin/sizeestimator and squashes the following commits:

c057095 [Reynold Xin] Fixed import.
2da478b [Reynold Xin] Remove SizeEstimator from o.a.spark package.
---
 .../org/apache/spark/SizeEstimator.scala      | 44 -------------------
 .../org/apache/spark/util/SizeEstimator.scala | 20 +++++++--
 2 files changed, 17 insertions(+), 47 deletions(-)
 delete mode 100644 core/src/main/scala/org/apache/spark/SizeEstimator.scala

diff --git a/core/src/main/scala/org/apache/spark/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/SizeEstimator.scala
deleted file mode 100644
index 54fc3a856adfa..0000000000000
--- a/core/src/main/scala/org/apache/spark/SizeEstimator.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark
-
-import org.apache.spark.annotation.DeveloperApi
-
-/**
- * Estimates the sizes of Java objects (number of bytes of memory they occupy), for use in
- * memory-aware caches.
- *
- * Based on the following JavaWorld article:
- * http://www.javaworld.com/javaworld/javaqa/2003-12/02-qa-1226-sizeof.html
- */
-@DeveloperApi
-object SizeEstimator {
-  /**
-   * :: DeveloperApi ::
-   * Estimate the number of bytes that the given object takes up on the JVM heap. The estimate
-   * includes space taken up by objects referenced by the given object, their references, and so on
-   * and so forth.
-   *
-   * This is useful for determining the amount of heap space a broadcast variable will occupy on
-   * each executor or the amount of space each object will take when caching objects in
-   * deserialized form. This is not the same as the serialized size of the object, which will
-   * typically be much smaller.
-   */
-  @DeveloperApi
-  def estimate(obj: AnyRef): Long = org.apache.spark.util.SizeEstimator.estimate(obj)
-}
diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
index 968a72d5adae9..f38949c3cb846 100644
--- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
+++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
@@ -21,21 +21,37 @@ import java.lang.management.ManagementFactory
 import java.lang.reflect.{Field, Modifier}
 import java.util.{IdentityHashMap, Random}
 import java.util.concurrent.ConcurrentHashMap
+
 import scala.collection.mutable.ArrayBuffer
 import scala.runtime.ScalaRunTime
 
 import org.apache.spark.Logging
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.util.collection.OpenHashSet
 
 
 /**
+ * :: DeveloperApi ::
  * Estimates the sizes of Java objects (number of bytes of memory they occupy), for use in
  * memory-aware caches.
  *
  * Based on the following JavaWorld article:
  * http://www.javaworld.com/javaworld/javaqa/2003-12/02-qa-1226-sizeof.html
  */
-private[spark] object SizeEstimator extends Logging {
+@DeveloperApi
+object SizeEstimator extends Logging {
+
+  /**
+   * Estimate the number of bytes that the given object takes up on the JVM heap. The estimate
+   * includes space taken up by objects referenced by the given object, their references, and so on
+   * and so forth.
+   *
+   * This is useful for determining the amount of heap space a broadcast variable will occupy on
+   * each executor or the amount of space each object will take when caching objects in
+   * deserialized form. This is not the same as the serialized size of the object, which will
+   * typically be much smaller.
+   */
+  def estimate(obj: AnyRef): Long = estimate(obj, new IdentityHashMap[AnyRef, AnyRef])
 
   // Sizes of primitive types
   private val BYTE_SIZE    = 1
@@ -161,8 +177,6 @@ private[spark] object SizeEstimator extends Logging {
     val shellSize: Long,
     val pointerFields: List[Field]) {}
 
-  def estimate(obj: AnyRef): Long = estimate(obj, new IdentityHashMap[AnyRef, AnyRef])
-
   private def estimate(obj: AnyRef, visited: IdentityHashMap[AnyRef, AnyRef]): Long = {
     val state = new SearchState(visited)
     state.enqueue(obj)

From 572b62cafe4bc7b1d464c9dcfb449c9d53456826 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Thu, 28 May 2015 17:12:30 -0700
Subject: [PATCH 218/525] [SPARK-7853] [SQL] Fix HiveContext in Spark Shell

https://issues.apache.org/jira/browse/SPARK-7853

This fixes the problem introduced by my change in https://github.com/apache/spark/pull/6435, which causes that Hive Context fails to create in spark shell because of the class loader issue.

Author: Yin Huai <yhuai@databricks.com>

Closes #6459 from yhuai/SPARK-7853 and squashes the following commits:

37ad33e [Yin Huai] Do not use hiveQlTable at all.
47cdb6d [Yin Huai] Move hiveconf.set to the end of setConf.
005649b [Yin Huai] Update comment.
35d86f3 [Yin Huai] Access TTable directly to make sure Hive will not internally use any metastore utility functions.
3737766 [Yin Huai] Recursively find all jars.
---
 .../apache/spark/sql/hive/HiveContext.scala   | 35 ++++++++++---------
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 12 +++----
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 9ab98fdcce725..2ed71d3d52880 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -189,24 +189,22 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
           "Specify a vaild path to the correct hive jars using $HIVE_METASTORE_JARS " +
           s"or change $HIVE_METASTORE_VERSION to $hiveExecutionVersion.")
       }
-      // We recursively add all jars in the class loader chain,
-      // starting from the given urlClassLoader.
-      def addJars(urlClassLoader: URLClassLoader): Array[URL] = {
-        val jarsInParent = urlClassLoader.getParent match {
-          case parent: URLClassLoader => addJars(parent)
-          case other => Array.empty[URL]
-        }
 
-        urlClassLoader.getURLs ++ jarsInParent
+      // We recursively find all jars in the class loader chain,
+      // starting from the given classLoader.
+      def allJars(classLoader: ClassLoader): Array[URL] = classLoader match {
+        case null => Array.empty[URL]
+        case urlClassLoader: URLClassLoader =>
+          urlClassLoader.getURLs ++ allJars(urlClassLoader.getParent)
+        case other => allJars(other.getParent)
       }
 
-      val jars = Utils.getContextOrSparkClassLoader match {
-        case urlClassLoader: URLClassLoader => addJars(urlClassLoader)
-        case other =>
-          throw new IllegalArgumentException(
-            "Unable to locate hive jars to connect to metastore " +
-            s"using classloader ${other.getClass.getName}. " +
-            "Please set spark.sql.hive.metastore.jars")
+      val classLoader = Utils.getContextOrSparkClassLoader
+      val jars = allJars(classLoader)
+      if (jars.length == 0) {
+        throw new IllegalArgumentException(
+          "Unable to locate hive jars to connect to metastore. " +
+            "Please set spark.sql.hive.metastore.jars.")
       }
 
       logInfo(
@@ -356,9 +354,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
 
   override def setConf(key: String, value: String): Unit = {
     super.setConf(key, value)
-    hiveconf.set(key, value)
     executionHive.runSqlHive(s"SET $key=$value")
     metadataHive.runSqlHive(s"SET $key=$value")
+    // If users put any Spark SQL setting in the spark conf (e.g. spark-defaults.conf),
+    // this setConf will be called in the constructor of the SQLContext.
+    // Also, calling hiveconf will create a default session containing a HiveConf, which
+    // will interfer with the creation of executionHive (which is a lazy val). So,
+    // we put hiveconf.set at the end of this method.
+    hiveconf.set(key, value)
   }
 
   /* A catalyst metadata catalog that points to the Hive Metastore. */
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 425a4005aa2c3..95117f7a6847e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -707,20 +707,20 @@ private[hive] case class MetastoreRelation
     hiveQlTable.getMetadata
   )
 
-  implicit class SchemaAttribute(f: FieldSchema) {
+  implicit class SchemaAttribute(f: HiveColumn) {
     def toAttribute: AttributeReference = AttributeReference(
-      f.getName,
-      HiveMetastoreTypes.toDataType(f.getType),
+      f.name,
+      HiveMetastoreTypes.toDataType(f.hiveType),
       // Since data can be dumped in randomly with no validation, everything is nullable.
       nullable = true
     )(qualifiers = Seq(alias.getOrElse(tableName)))
   }
 
-  // Must be a stable value since new attributes are born here.
-  val partitionKeys = hiveQlTable.getPartitionKeys.map(_.toAttribute)
+  /** PartitionKey attributes */
+  val partitionKeys = table.partitionColumns.map(_.toAttribute)
 
   /** Non-partitionKey attributes */
-  val attributes = hiveQlTable.getCols.map(_.toAttribute)
+  val attributes = table.schema.map(_.toAttribute)
 
   val output = attributes ++ partitionKeys
 

From 1bd63e82fdb6ee57c61051430d63685b801df016 Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Thu, 28 May 2015 17:30:12 -0700
Subject: [PATCH 219/525] [SPARK-7577] [ML] [DOC] add bucketizer doc

CC jkbradley

Author: Xusen Yin <yinxusen@gmail.com>

Closes #6451 from yinxusen/SPARK-7577 and squashes the following commits:

e2dc32e [Xusen Yin] rename colums
e350e49 [Xusen Yin] add all demos
006ddf1 [Xusen Yin] add java test
3238481 [Xusen Yin] add bucketizer
---
 docs/ml-features.md                           | 86 +++++++++++++++++++
 .../spark/ml/feature/JavaBucketizerSuite.java | 80 +++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java

diff --git a/docs/ml-features.md b/docs/ml-features.md
index efe9b3b8edb6e..d7851a55fabfe 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -789,6 +789,92 @@ scaledData = scalerModel.transform(dataFrame)
 </div>
 </div>
 
+## Bucketizer
+
+`Bucketizer` transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter:
+
+* `splits`: Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which also includes y. Splits should be strictly increasing. Values at -inf, inf must be explicitly provided to cover all Double values; Otherwise, values outside the splits specified will be treated as errors. Two examples of `splits` are `Array(Double.NegativeInfinity, 0.0, 1.0, Double.PositiveInfinity)` and `Array(0.0, 1.0, 2.0)`.
+
+Note that if you have no idea of the upper bound and lower bound of the targeted column, you would better add the `Double.NegativeInfinity` and `Double.PositiveInfinity` as the bounds of your splits to prevent a potenial out of Bucketizer bounds exception.
+
+Note also that the splits that you provided have to be in strictly increasing order, i.e. `s0 < s1 < s2 < ... < sn`.
+
+More details can be found in the API docs for [Bucketizer](api/scala/index.html#org.apache.spark.ml.feature.Bucketizer).
+
+The following example demonstrates how to bucketize a column of `Double`s into another index-wised column.
+
+<div class="codetabs">
+<div data-lang="scala">
+{% highlight scala %}
+import org.apache.spark.ml.feature.Bucketizer
+import org.apache.spark.sql.DataFrame
+
+val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
+
+val data = Array(-0.5, -0.3, 0.0, 0.2)
+val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
+val bucketizer = new Bucketizer()
+  .setInputCol("features")
+  .setOutputCol("bucketedFeatures")
+  .setSplits(splits)
+
+// Transform original data into its bucket index.
+val bucketedData = bucketizer.transform(dataFrame)
+{% endhighlight %}
+</div>
+
+<div data-lang="java">
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
+
+JavaRDD<Row> data = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create(-0.5),
+  RowFactory.create(-0.3),
+  RowFactory.create(0.0),
+  RowFactory.create(0.2)
+));
+StructType schema = new StructType(new StructField[] {
+  new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
+});
+DataFrame dataFrame = jsql.createDataFrame(data, schema);
+
+Bucketizer bucketizer = new Bucketizer()
+  .setInputCol("features")
+  .setOutputCol("bucketedFeatures")
+  .setSplits(splits);
+
+// Transform original data into its bucket index.
+DataFrame bucketedData = bucketizer.transform(dataFrame);
+{% endhighlight %}
+</div>
+
+<div data-lang="python">
+{% highlight python %}
+from pyspark.ml.feature import Bucketizer
+
+splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
+
+data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+dataFrame = sqlContext.createDataFrame(data, ["features"])
+
+bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
+
+# Transform original data into its bucket index.
+bucketedData = bucketizer.transform(dataFrame)
+{% endhighlight %}
+</div>
+</div>
 
 # Feature Selectors
 
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java
new file mode 100644
index 0000000000000..d5bd230a957a1
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+public class JavaBucketizerSuite {
+  private transient JavaSparkContext jsc;
+  private transient SQLContext jsql;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaBucketizerSuite");
+    jsql = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    jsc = null;
+  }
+
+  @Test
+  public void bucketizerTest() {
+    double[] splits = {-0.5, 0.0, 0.5};
+
+    JavaRDD<Row> data = jsc.parallelize(Lists.newArrayList(
+      RowFactory.create(-0.5),
+      RowFactory.create(-0.3),
+      RowFactory.create(0.0),
+      RowFactory.create(0.2)
+    ));
+    StructType schema = new StructType(new StructField[] {
+      new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
+    });
+    DataFrame dataset = jsql.createDataFrame(data, schema);
+
+    Bucketizer bucketizer = new Bucketizer()
+      .setInputCol("feature")
+      .setOutputCol("result")
+      .setSplits(splits);
+
+    Row[] result = bucketizer.transform(dataset).select("result").collect();
+
+    for (Row r : result) {
+      double index = r.getDouble(0);
+      Assert.assertTrue((index >= 0) && (index <= 1));
+    }
+  }
+}

From 3af0b3136e4b7dea52c413d640653ccddc638574 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 17:55:22 -0700
Subject: [PATCH 220/525] [SPARK-7927] whitespace fixes for streaming.

So we can enable a whitespace enforcement rule in the style checker to save code review time.

Author: Reynold Xin <rxin@databricks.com>

Closes #6475 from rxin/whitespace-streaming and squashes the following commits:

810dae4 [Reynold Xin] Fixed tests.
89068ad [Reynold Xin] [SPARK-7927] whitespace fixes for streaming.
---
 .../org/apache/spark/streaming/StreamingContext.scala     | 2 +-
 .../apache/spark/streaming/api/java/JavaPairDStream.scala | 8 ++++----
 .../org/apache/spark/streaming/dstream/DStream.scala      | 2 +-
 .../apache/spark/streaming/dstream/FileInputDStream.scala | 8 ++++----
 .../spark/streaming/dstream/PairDStreamFunctions.scala    | 2 +-
 .../spark/streaming/dstream/ReducedWindowedDStream.scala  | 8 ++++----
 .../apache/spark/streaming/dstream/ShuffledDStream.scala  | 6 +++---
 .../org/apache/spark/streaming/dstream/StateDStream.scala | 2 +-
 .../apache/spark/streaming/dstream/WindowedDStream.scala  | 4 ++--
 .../apache/spark/streaming/receiver/BlockGenerator.scala  | 2 +-
 .../org/apache/spark/streaming/receiver/RateLimiter.scala | 3 ++-
 .../spark/streaming/scheduler/ReceiverTracker.scala       | 2 +-
 .../spark/streaming/util/FileBasedWriteAheadLog.scala     | 2 +-
 .../org/apache/spark/streaming/util/RawTextHelper.scala   | 4 ++--
 .../org/apache/spark/streaming/BasicOperationsSuite.scala | 6 +++---
 .../org/apache/spark/streaming/InputStreamsSuite.scala    | 2 +-
 .../apache/spark/streaming/StreamingContextSuite.scala    | 4 +++-
 .../apache/spark/streaming/StreamingListenerSuite.scala   | 6 ++++--
 .../streaming/ui/StreamingJobProgressListenerSuite.scala  | 2 +-
 19 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 5e58ed714829e..25842d502543e 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -461,7 +461,7 @@ class StreamingContext private[streaming] (
     val conf = sc_.hadoopConfiguration
     conf.setInt(FixedLengthBinaryInputFormat.RECORD_LENGTH_PROPERTY, recordLength)
     val br = fileStream[LongWritable, BytesWritable, FixedLengthBinaryInputFormat](
-      directory, FileInputDStream.defaultFilter : Path => Boolean, newFilesOnly=true, conf)
+      directory, FileInputDStream.defaultFilter: Path => Boolean, newFilesOnly = true, conf)
     val data = br.map { case (k, v) =>
       val bytes = v.getBytes
       require(bytes.length == recordLength, "Byte array does not have correct length. " +
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
index 93baad19e3ee1..959ac9c177f81 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
@@ -227,7 +227,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])(
    * @param numPartitions  Number of partitions of each RDD in the new DStream.
    */
   def groupByKeyAndWindow(windowDuration: Duration, slideDuration: Duration, numPartitions: Int)
-  :JavaPairDStream[K, JIterable[V]] = {
+    : JavaPairDStream[K, JIterable[V]] = {
     dstream.groupByKeyAndWindow(windowDuration, slideDuration, numPartitions)
       .mapValues(asJavaIterable _)
   }
@@ -247,7 +247,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])(
       windowDuration: Duration,
       slideDuration: Duration,
       partitioner: Partitioner
-    ):JavaPairDStream[K, JIterable[V]] = {
+    ): JavaPairDStream[K, JIterable[V]] = {
     dstream.groupByKeyAndWindow(windowDuration, slideDuration, partitioner)
       .mapValues(asJavaIterable _)
   }
@@ -262,7 +262,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])(
    *                       batching interval
    */
   def reduceByKeyAndWindow(reduceFunc: JFunction2[V, V, V], windowDuration: Duration)
-  :JavaPairDStream[K, V] = {
+    : JavaPairDStream[K, V] = {
     dstream.reduceByKeyAndWindow(reduceFunc, windowDuration)
   }
 
@@ -281,7 +281,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])(
       reduceFunc: JFunction2[V, V, V],
       windowDuration: Duration,
       slideDuration: Duration
-    ):JavaPairDStream[K, V] = {
+    ): JavaPairDStream[K, V] = {
     dstream.reduceByKeyAndWindow(reduceFunc, windowDuration, slideDuration)
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
index c858647c6406d..6efcc193bfccc 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
@@ -659,7 +659,7 @@ abstract class DStream[T: ClassTag] (
     // DStreams can't be serialized with closures, we can't proactively check
     // it for serializability and so we pass the optional false to SparkContext.clean
     val cleanedF = context.sparkContext.clean(transformFunc, false)
-    val realTransformFunc =  (rdds: Seq[RDD[_]], time: Time) => {
+    val realTransformFunc = (rdds: Seq[RDD[_]], time: Time) => {
       assert(rdds.length == 1)
       cleanedF(rdds.head.asInstanceOf[RDD[T]], time)
     }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
index eca69f00188e4..6c1fab56740ee 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
@@ -69,7 +69,7 @@ import org.apache.spark.util.{TimeStampedHashMap, Utils}
  *   processing semantics are undefined.
  */
 private[streaming]
-class FileInputDStream[K, V, F <: NewInputFormat[K,V]](
+class FileInputDStream[K, V, F <: NewInputFormat[K, V]](
     @transient ssc_ : StreamingContext,
     directory: String,
     filter: Path => Boolean = FileInputDStream.defaultFilter,
@@ -251,7 +251,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K,V]](
 
   /** Generate one RDD from an array of files */
   private def filesToRDD(files: Seq[String]): RDD[(K, V)] = {
-    val fileRDDs = files.map(file =>{
+    val fileRDDs = files.map { file =>
       val rdd = serializableConfOpt.map(_.value) match {
         case Some(config) => context.sparkContext.newAPIHadoopFile(
           file,
@@ -267,7 +267,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K,V]](
           "Refer to the streaming programming guide for more details.")
       }
       rdd
-    })
+    }
     new UnionRDD(context.sparkContext, fileRDDs)
   }
 
@@ -294,7 +294,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K,V]](
   private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
     logDebug(this.getClass().getSimpleName + ".readObject used")
     ois.defaultReadObject()
-    generatedRDDs = new mutable.HashMap[Time, RDD[(K,V)]] ()
+    generatedRDDs = new mutable.HashMap[Time, RDD[(K, V)]]()
     batchTimeToSelectedFiles =
       new mutable.HashMap[Time, Array[String]] with mutable.SynchronizedMap[Time, Array[String]]
     recentlySelectedFiles = new mutable.HashSet[String]()
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
index fda22eb6ec42e..358e4c66df7ba 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
@@ -32,7 +32,7 @@ import org.apache.spark.streaming.StreamingContext.rddToFileName
 /**
  * Extra functions available on DStream of (key, value) pairs through an implicit conversion.
  */
-class PairDStreamFunctions[K, V](self: DStream[(K,V)])
+class PairDStreamFunctions[K, V](self: DStream[(K, V)])
     (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K])
   extends Serializable
 {
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReducedWindowedDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReducedWindowedDStream.scala
index df9f7f140eddc..6a583bf2a3626 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReducedWindowedDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReducedWindowedDStream.scala
@@ -38,7 +38,7 @@ class ReducedWindowedDStream[K: ClassTag, V: ClassTag](
     _windowDuration: Duration,
     _slideDuration: Duration,
     partitioner: Partitioner
-  ) extends DStream[(K,V)](parent.ssc) {
+  ) extends DStream[(K, V)](parent.ssc) {
 
   require(_windowDuration.isMultipleOf(parent.slideDuration),
     "The window duration of ReducedWindowedDStream (" + _windowDuration + ") " +
@@ -58,7 +58,7 @@ class ReducedWindowedDStream[K: ClassTag, V: ClassTag](
   super.persist(StorageLevel.MEMORY_ONLY_SER)
   reducedStream.persist(StorageLevel.MEMORY_ONLY_SER)
 
-  def windowDuration: Duration =  _windowDuration
+  def windowDuration: Duration = _windowDuration
 
   override def dependencies: List[DStream[_]] = List(reducedStream)
 
@@ -68,7 +68,7 @@ class ReducedWindowedDStream[K: ClassTag, V: ClassTag](
 
   override def parentRememberDuration: Duration = rememberDuration + windowDuration
 
-  override def persist(storageLevel: StorageLevel): DStream[(K,V)] = {
+  override def persist(storageLevel: StorageLevel): DStream[(K, V)] = {
     super.persist(storageLevel)
     reducedStream.persist(storageLevel)
     this
@@ -118,7 +118,7 @@ class ReducedWindowedDStream[K: ClassTag, V: ClassTag](
 
     // Get the RDD of the reduced value of the previous window
     val previousWindowRDD =
-      getOrCompute(previousWindow.endTime).getOrElse(ssc.sc.makeRDD(Seq[(K,V)]()))
+      getOrCompute(previousWindow.endTime).getOrElse(ssc.sc.makeRDD(Seq[(K, V)]()))
 
     // Make the list of RDDs that needs to cogrouped together for reducing their reduced values
     val allRDDs = new ArrayBuffer[RDD[(K, V)]]() += previousWindowRDD ++= oldRDDs ++= newRDDs
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ShuffledDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ShuffledDStream.scala
index 7757ccac09a58..e0ffd5d86b435 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ShuffledDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ShuffledDStream.scala
@@ -25,19 +25,19 @@ import scala.reflect.ClassTag
 
 private[streaming]
 class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag](
-    parent: DStream[(K,V)],
+    parent: DStream[(K, V)],
     createCombiner: V => C,
     mergeValue: (C, V) => C,
     mergeCombiner: (C, C) => C,
     partitioner: Partitioner,
     mapSideCombine: Boolean = true
-  ) extends DStream[(K,C)] (parent.ssc) {
+  ) extends DStream[(K, C)] (parent.ssc) {
 
   override def dependencies: List[DStream[_]] = List(parent)
 
   override def slideDuration: Duration = parent.slideDuration
 
-  override def compute(validTime: Time): Option[RDD[(K,C)]] = {
+  override def compute(validTime: Time): Option[RDD[(K, C)]] = {
     parent.getOrCompute(validTime) match {
       case Some(rdd) => Some(rdd.combineByKey[C](
           createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine))
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/StateDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/StateDStream.scala
index de8718d0a80fe..621d6dff788f4 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/StateDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/StateDStream.scala
@@ -51,7 +51,7 @@ class StateDStream[K: ClassTag, V: ClassTag, S: ClassTag](
     val finalFunc = (iterator: Iterator[(K, (Iterable[V], Iterable[S]))]) => {
       val i = iterator.map(t => {
         val itr = t._2._2.iterator
-        val headOption = if(itr.hasNext) Some(itr.next) else None
+        val headOption = if (itr.hasNext) Some(itr.next()) else None
         (t._1, t._2._1.toSeq, headOption)
       })
       updateFuncLocal(i)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/WindowedDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/WindowedDStream.scala
index 899865a906c27..4efba039f8959 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/WindowedDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/WindowedDStream.scala
@@ -44,7 +44,7 @@ class WindowedDStream[T: ClassTag](
   // Persist parent level by default, as those RDDs are going to be obviously reused.
   parent.persist(StorageLevel.MEMORY_ONLY_SER)
 
-  def windowDuration: Duration =  _windowDuration
+  def windowDuration: Duration = _windowDuration
 
   override def dependencies: List[DStream[_]] = List(parent)
 
@@ -68,7 +68,7 @@ class WindowedDStream[T: ClassTag](
       new PartitionerAwareUnionRDD(ssc.sc, rddsInWindow)
     } else {
       logDebug("Using normal union for windowing at " + validTime)
-      new UnionRDD(ssc.sc,rddsInWindow)
+      new UnionRDD(ssc.sc, rddsInWindow)
     }
     Some(windowRDD)
   }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
index 4bebcc5aa7ca0..0588517a2de39 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
@@ -164,7 +164,7 @@ private[streaming] class BlockGenerator(
   private def keepPushingBlocks() {
     logInfo("Started block pushing thread")
     try {
-      while(!stopped) {
+      while (!stopped) {
         Option(blocksForPushing.poll(100, TimeUnit.MILLISECONDS)) match {
           case Some(block) => pushBlock(block)
           case None =>
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala
index 97db9ded83367..8df542b367d27 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala
@@ -17,8 +17,9 @@
 
 package org.apache.spark.streaming.receiver
 
+import com.google.common.util.concurrent.{RateLimiter => GuavaRateLimiter}
+
 import org.apache.spark.{Logging, SparkConf}
-import com.google.common.util.concurrent.{RateLimiter=>GuavaRateLimiter}
 
 /** Provides waitToPush() method to limit the rate at which receivers consume data.
   *
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
index f73f7e705ee0d..f1504b09c9873 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
@@ -230,7 +230,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
   class ReceiverLauncher {
     @transient val env = ssc.env
     @volatile @transient private var running = false
-    @transient val thread  = new Thread() {
+    @transient val thread = new Thread() {
       override def run() {
         try {
           SparkEnv.set(env)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala
index 87ba4f84a9ceb..fe6328b1ce727 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala
@@ -200,7 +200,7 @@ private[streaming] class FileBasedWriteAheadLog(
   /** Initialize the log directory or recover existing logs inside the directory */
   private def initializeOrRecover(): Unit = synchronized {
     val logDirectoryPath = new Path(logDirectory)
-    val fileSystem =  HdfsUtils.getFileSystemForPath(logDirectoryPath, hadoopConf)
+    val fileSystem = HdfsUtils.getFileSystemForPath(logDirectoryPath, hadoopConf)
 
     if (fileSystem.exists(logDirectoryPath) && fileSystem.getFileStatus(logDirectoryPath).isDir) {
       val logFileInfo = logFilesTologInfo(fileSystem.listStatus(logDirectoryPath).map { _.getPath })
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala
index 4d968f8bfa7a8..408936653c790 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala
@@ -27,7 +27,7 @@ object RawTextHelper {
    * Splits lines and counts the words.
    */
   def splitAndCountPartitions(iter: Iterator[String]): Iterator[(String, Long)] = {
-    val map = new OpenHashMap[String,Long]
+    val map = new OpenHashMap[String, Long]
     var i = 0
     var j = 0
     while (iter.hasNext) {
@@ -98,7 +98,7 @@ object RawTextHelper {
    * before real workload starts.
    */
   def warmUp(sc: SparkContext) {
-    for(i <- 0 to 1) {
+    for (i <- 0 to 1) {
       sc.parallelize(1 to 200000, 1000)
         .map(_ % 1331).map(_.toString)
         .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
index f269cb74e0c2b..08faeaa58f419 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
@@ -255,7 +255,7 @@ class BasicOperationsSuite extends TestSuiteBase {
       Seq(  )
     )
     val operation = (s1: DStream[String], s2: DStream[String]) => {
-      s1.map(x => (x,1)).cogroup(s2.map(x => (x, "x"))).mapValues(x => (x._1.toSeq, x._2.toSeq))
+      s1.map(x => (x, 1)).cogroup(s2.map(x => (x, "x"))).mapValues(x => (x._1.toSeq, x._2.toSeq))
     }
     testOperation(inputData1, inputData2, operation, outputData, true)
   }
@@ -427,9 +427,9 @@ class BasicOperationsSuite extends TestSuiteBase {
   test("updateStateByKey - object lifecycle") {
     val inputData =
       Seq(
-        Seq("a","b"),
+        Seq("a", "b"),
         null,
-        Seq("a","c","a"),
+        Seq("a", "c", "a"),
         Seq("c"),
         null,
         null
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
index 0122514f9374c..b74d67c63a788 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
@@ -418,7 +418,7 @@ class TestServer(portToBind: Int = 0) extends Logging {
   val servingThread = new Thread() {
     override def run() {
       try {
-        while(true) {
+        while (true) {
           logInfo("Accepting connections on port " + port)
           val clientSocket = serverSocket.accept()
           if (startLatch.getCount == 1) {
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index f8e8030791df1..e36c7914b130e 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -732,7 +732,9 @@ class SlowTestReceiver(totalRecords: Int, recordsPerSecond: Int)
 
   def onStop() {
     // Simulate slow receiver by waiting for all records to be produced
-    while(!SlowTestReceiver.receivedAllRecords) Thread.sleep(100)
+    while (!SlowTestReceiver.receivedAllRecords) {
+      Thread.sleep(100)
+    }
     // no clean to be done, the receiving thread should stop on it own
   }
 }
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
index 312cce408cfe7..1dc8960d60528 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
@@ -133,8 +133,10 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers {
 
   /** Check if a sequence of numbers is in increasing order */
   def isInIncreasingOrder(seq: Seq[Long]): Boolean = {
-    for(i <- 1 until seq.size) {
-      if (seq(i - 1) > seq(i)) return false
+    for (i <- 1 until seq.size) {
+      if (seq(i - 1) > seq(i)) {
+        return false
+      }
     }
     true
   }
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
index 2a0f45830e03c..c9175d61b1f49 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
@@ -64,7 +64,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
     listener.numTotalReceivedRecords should be (0)
 
     // onBatchStarted
-    val batchInfoStarted = BatchInfo(Time(1000), streamIdToNumRecords,  1000, Some(2000), None)
+    val batchInfoStarted = BatchInfo(Time(1000), streamIdToNumRecords, 1000, Some(2000), None)
     listener.onBatchStarted(StreamingListenerBatchStarted(batchInfoStarted))
     listener.waitingBatches should be (Nil)
     listener.runningBatches should be (List(BatchUIData(batchInfoStarted)))

From ee6a0e12fb76e4d5c24175900e5bf6a8cb35e2b0 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 18:08:56 -0700
Subject: [PATCH 221/525] [SPARK-7927] whitespace fixes for Hive and
 ThriftServer.

So we can enable a whitespace enforcement rule in the style checker to save code review time.

Author: Reynold Xin <rxin@databricks.com>

Closes #6478 from rxin/whitespace-hive and squashes the following commits:

e01b0e0 [Reynold Xin] Fixed tests.
a3bba22 [Reynold Xin] [SPARK-7927] whitespace fixes for Hive and ThriftServer.
---
 .../sql/hive/thriftserver/SparkSQLCLIDriver.scala      |  8 ++++----
 .../sql/hive/thriftserver/ui/ThriftServerPage.scala    |  6 +++---
 .../hive/thriftserver/ui/ThriftServerSessionPage.scala |  2 +-
 .../spark/sql/hive/thriftserver/UISeleniumSuite.scala  |  4 ++--
 .../apache/spark/sql/hive/ExtendedHiveQlParser.scala   |  6 +++---
 .../scala/org/apache/spark/sql/hive/HiveContext.scala  |  4 ++--
 .../org/apache/spark/sql/hive/HiveInspectors.scala     | 10 +++++-----
 .../apache/spark/sql/hive/HiveMetastoreCatalog.scala   | 10 +++++++---
 .../main/scala/org/apache/spark/sql/hive/HiveQl.scala  |  9 +++++----
 .../spark/sql/hive/execution/InsertIntoHiveTable.scala |  7 +++----
 .../sql/hive/execution/ScriptTransformation.scala      |  2 +-
 .../scala/org/apache/spark/sql/hive/hiveUdfs.scala     |  6 +++---
 .../apache/spark/sql/hive/hiveWriterContainers.scala   |  2 +-
 .../org/apache/spark/sql/hive/test/TestHive.scala      |  6 +++---
 14 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
index deb1008c468bf..14f6f658d9b75 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
@@ -43,7 +43,7 @@ import org.apache.spark.util.Utils
 private[hive] object SparkSQLCLIDriver {
   private var prompt = "spark-sql"
   private var continuedPrompt = "".padTo(prompt.length, ' ')
-  private var transport:TSocket = _
+  private var transport: TSocket = _
 
   installSignalHandler()
 
@@ -276,13 +276,13 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging {
 
           driver.init()
           val out = sessionState.out
-          val start:Long = System.currentTimeMillis()
+          val start: Long = System.currentTimeMillis()
           if (sessionState.getIsVerbose) {
             out.println(cmd)
           }
           val rc = driver.run(cmd)
           val end = System.currentTimeMillis()
-          val timeTaken:Double = (end - start) / 1000.0
+          val timeTaken: Double = (end - start) / 1000.0
 
           ret = rc.getResponseCode
           if (ret != 0) {
@@ -310,7 +310,7 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging {
               res.clear()
             }
           } catch {
-            case e:IOException =>
+            case e: IOException =>
               console.printError(
                 s"""Failed with exception ${e.getClass.getName}: ${e.getMessage}
                    |${org.apache.hadoop.util.StringUtils.stringifyException(e)}
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala
index 7c48ff4b35df5..10c83d8b27a2a 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala
@@ -77,7 +77,7 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage(""
             [{id}]
           </a>
         }
-        val detail = if(info.state == ExecutionState.FAILED) info.detail else info.executePlan
+        val detail = if (info.state == ExecutionState.FAILED) info.detail else info.executePlan
         <tr>
           <td>{info.userName}</td>
           <td>
@@ -85,7 +85,7 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage(""
           </td>
           <td>{info.groupId}</td>
           <td>{formatDate(info.startTimestamp)}</td>
-          <td>{if(info.finishTimestamp > 0) formatDate(info.finishTimestamp)}</td>
+          <td>{if (info.finishTimestamp > 0) formatDate(info.finishTimestamp)}</td>
           <td>{formatDurationOption(Some(info.totalTime))}</td>
           <td>{info.statement}</td>
           <td>{info.state}</td>
@@ -150,7 +150,7 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage(""
           <td> {session.ip} </td>
           <td> <a href={sessionLink}> {session.sessionId} </a> </td>
           <td> {formatDate(session.startTimestamp)} </td>
-          <td> {if(session.finishTimestamp > 0) formatDate(session.finishTimestamp)} </td>
+          <td> {if (session.finishTimestamp > 0) formatDate(session.finishTimestamp)} </td>
           <td> {formatDurationOption(Some(session.totalTime))} </td>
           <td> {session.totalExecution.toString} </td>
         </tr>
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala
index d9d66dcd8517e..3b01afa603cea 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala
@@ -87,7 +87,7 @@ private[ui] class ThriftServerSessionPage(parent: ThriftServerTab)
             [{id}]
           </a>
         }
-        val detail = if(info.state == ExecutionState.FAILED) info.detail else info.executePlan
+        val detail = if (info.state == ExecutionState.FAILED) info.detail else info.executePlan
         <tr>
           <td>{info.userName}</td>
           <td>
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
index e1466e0423033..4c9fab7ef6136 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
@@ -73,7 +73,7 @@ class UISeleniumSuite
   }
 
   ignore("thrift server ui test") {
-    withJdbcStatement(statement =>{
+    withJdbcStatement { statement =>
       val baseURL = s"http://localhost:$uiPort"
 
       val queries = Seq(
@@ -97,6 +97,6 @@ class UISeleniumSuite
           findAll(cssSelector("""ul table tbody tr td""")).map(_.text).toList should contain (line)
         }
       }
-    })
+    }
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala
index 3f20c6142e59a..7f8449cdc282d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala
@@ -29,10 +29,10 @@ import org.apache.spark.sql.hive.execution.{AddJar, AddFile, HiveNativeCommand}
 private[hive] class ExtendedHiveQlParser extends AbstractSparkSQLParser {
   // Keyword is a convention with AbstractSparkSQLParser, which will scan all of the `Keyword`
   // properties via reflection the class in runtime for constructing the SqlLexical object
-  protected val ADD  = Keyword("ADD")
-  protected val DFS  = Keyword("DFS")
+  protected val ADD = Keyword("ADD")
+  protected val DFS = Keyword("DFS")
   protected val FILE = Keyword("FILE")
-  protected val JAR  = Keyword("JAR")
+  protected val JAR = Keyword("JAR")
 
   protected lazy val start: Parser[LogicalPlan] = dfs | addJar | addFile | hiveQl
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 2ed71d3d52880..fbf2c7d8cbc06 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -530,7 +530,7 @@ private[hive] object HiveContext {
     val propMap: HashMap[String, String] = HashMap()
     // We have to mask all properties in hive-site.xml that relates to metastore data source
     // as we used a local metastore here.
-    HiveConf.ConfVars.values().foreach { confvar  =>
+    HiveConf.ConfVars.values().foreach { confvar =>
       if (confvar.varname.contains("datanucleus") || confvar.varname.contains("jdo")) {
         propMap.put(confvar.varname, confvar.defaultVal)
       }
@@ -553,7 +553,7 @@ private[hive] object HiveContext {
       }.mkString("{", ",", "}")
     case (seq: Seq[_], ArrayType(typ, _)) =>
       seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
-    case (map: Map[_,_], MapType(kType, vType, _)) =>
+    case (map: Map[_, _], MapType(kType, vType, _)) =>
       map.map {
         case (key, value) =>
           toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 0a694c70e4e5c..24cd335082639 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -335,7 +335,7 @@ private[hive] trait HiveInspectors {
       val allRefs = si.getAllStructFieldRefs
       new GenericRow(
         allRefs.map(r =>
-          unwrap(si.getStructFieldData(data,r), r.getFieldObjectInspector)).toArray)
+          unwrap(si.getStructFieldData(data, r), r.getFieldObjectInspector)).toArray)
   }
 
 
@@ -561,8 +561,8 @@ private[hive] trait HiveInspectors {
     case DecimalType() => PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector
     case StructType(fields) =>
       ObjectInspectorFactory.getStandardStructObjectInspector(
-        java.util.Arrays.asList(fields.map(f => f.name) :_*),
-        java.util.Arrays.asList(fields.map(f => toInspector(f.dataType)) :_*))
+        java.util.Arrays.asList(fields.map(f => f.name) : _*),
+        java.util.Arrays.asList(fields.map(f => toInspector(f.dataType)) : _*))
   }
 
   /**
@@ -677,8 +677,8 @@ private[hive] trait HiveInspectors {
         getListTypeInfo(elemType.toTypeInfo)
       case StructType(fields) =>
         getStructTypeInfo(
-          java.util.Arrays.asList(fields.map(_.name) :_*),
-          java.util.Arrays.asList(fields.map(_.dataType.toTypeInfo) :_*))
+          java.util.Arrays.asList(fields.map(_.name) : _*),
+          java.util.Arrays.asList(fields.map(_.dataType.toTypeInfo) : _*))
       case MapType(keyType, valueType, _) =>
         getMapTypeInfo(keyType.toTypeInfo, valueType.toTypeInfo)
       case BinaryType => binaryTypeInfo
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 95117f7a6847e..47b85731587d5 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -546,13 +546,17 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
    * UNIMPLEMENTED: It needs to be decided how we will persist in-memory tables to the metastore.
    * For now, if this functionality is desired mix in the in-memory [[OverrideCatalog]].
    */
-  override def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit = ???
+  override def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit = {
+    throw new UnsupportedOperationException
+  }
 
   /**
    * UNIMPLEMENTED: It needs to be decided how we will persist in-memory tables to the metastore.
    * For now, if this functionality is desired mix in the in-memory [[OverrideCatalog]].
    */
-  override def unregisterTable(tableIdentifier: Seq[String]): Unit = ???
+  override def unregisterTable(tableIdentifier: Seq[String]): Unit = {
+    throw new UnsupportedOperationException
+  }
 
   override def unregisterAllTables(): Unit = {}
 }
@@ -725,7 +729,7 @@ private[hive] case class MetastoreRelation
   val output = attributes ++ partitionKeys
 
   /** An attribute map that can be used to lookup original attributes based on expression id. */
-  val attributeMap = AttributeMap(output.map(o => (o,o)))
+  val attributeMap = AttributeMap(output.map(o => (o, o)))
 
   /** An attribute map for determining the ordinal for non-partition columns. */
   val columnOrdinals = AttributeMap(attributes.zipWithIndex)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 2cbb5ca4d2e0c..3915ee835685f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -665,7 +665,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
                 HiveColumn(field.getName, field.getType, field.getComment)
               })
           }
-        case Token("TOK_TABLEROWFORMAT", Token("TOK_SERDEPROPS", child :: Nil) :: Nil)=>
+        case Token("TOK_TABLEROWFORMAT", Token("TOK_SERDEPROPS", child :: Nil) :: Nil) =>
           val serdeParams = new java.util.HashMap[String, String]()
           child match {
             case Token("TOK_TABLEROWFORMATFIELD", rowChild1 :: rowChild2) =>
@@ -775,7 +775,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
 
     // Support "TRUNCATE TABLE table_name [PARTITION partition_spec]"
     case Token("TOK_TRUNCATETABLE",
-          Token("TOK_TABLE_PARTITION",table)::Nil) =>  NativePlaceholder
+          Token("TOK_TABLE_PARTITION", table) :: Nil) => NativePlaceholder
 
     case Token("TOK_QUERY", queryArgs)
         if Seq("TOK_FROM", "TOK_INSERT").contains(queryArgs.head.getText) =>
@@ -1151,7 +1151,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
         case Seq(false, false) => Inner
       }.toBuffer
 
-      val joinedTables = tables.reduceLeft(Join(_,_, Inner, None))
+      val joinedTables = tables.reduceLeft(Join(_, _, Inner, None))
 
       // Must be transform down.
       val joinedResult = joinedTables transform {
@@ -1171,7 +1171,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
       // worth the number of hacks that will be required to implement it.  Namely, we need to add
       // some sort of mapped star expansion that would expand all child output row to be similarly
       // named output expressions where some aggregate expression has been applied (i.e. First).
-      ??? // Aggregate(groups, Star(None, First(_)) :: Nil, joinedResult)
+      // Aggregate(groups, Star(None, First(_)) :: Nil, joinedResult)
+      throw new UnsupportedOperationException
 
     case Token(allJoinTokens(joinToken),
            relation1 ::
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index 7a6ca48b54a24..8613332186f28 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -194,10 +194,9 @@ case class InsertIntoHiveTable(
     if (partition.nonEmpty) {
 
       // loadPartition call orders directories created on the iteration order of the this map
-      val orderedPartitionSpec = new util.LinkedHashMap[String,String]()
-      table.hiveQlTable.getPartCols().foreach{
-        entry=>
-          orderedPartitionSpec.put(entry.getName,partitionSpec.get(entry.getName).getOrElse(""))
+      val orderedPartitionSpec = new util.LinkedHashMap[String, String]()
+      table.hiveQlTable.getPartCols().foreach { entry =>
+        orderedPartitionSpec.put(entry.getName, partitionSpec.get(entry.getName).getOrElse(""))
       }
       val partVals = MetaStoreUtils.getPvals(table.hiveQlTable.getPartCols, partitionSpec)
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index bfd26e0170c70..6f27a8626fc1e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -216,7 +216,7 @@ case class HiveScriptIOSchema (
     val columnTypes = attrs.map {
       case aref: AttributeReference => aref.dataType
       case e: NamedExpression => e.dataType
-      case _ =>  null
+      case _ => null
     }
 
     (columns, columnTypes)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index 7ec4f7332502e..bb116e3ab7de7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -315,7 +315,7 @@ private[hive] case class HiveWindowFunction(
 
   // The object inspector of values returned from the Hive window function.
   @transient
-  protected lazy val returnInspector  = {
+  protected lazy val returnInspector = {
     evaluator.init(GenericUDAFEvaluator.Mode.COMPLETE, inputInspectors)
   }
 
@@ -410,7 +410,7 @@ private[hive] case class HiveGenericUdaf(
   protected lazy val resolver: AbstractGenericUDAFResolver = funcWrapper.createFunction()
 
   @transient
-  protected lazy val objectInspector  = {
+  protected lazy val objectInspector = {
     val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors.toArray, false, false)
     resolver.getEvaluator(parameterInfo)
       .init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors.toArray)
@@ -443,7 +443,7 @@ private[hive] case class HiveUdaf(
     new GenericUDAFBridge(funcWrapper.createFunction())
 
   @transient
-  protected lazy val objectInspector  = {
+  protected lazy val objectInspector = {
     val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors.toArray, false, false)
     resolver.getEvaluator(parameterInfo)
       .init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors.toArray)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
index 50b209f7ccbb8..2bb526b14be34 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
@@ -71,7 +71,7 @@ private[hive] class SparkHiveWriterContainer(
   @transient protected lazy val jobContext = newJobContext(conf.value, jID.value)
   @transient private lazy val taskContext = newTaskAttemptContext(conf.value, taID.value)
   @transient private lazy val outputFormat =
-    conf.value.getOutputFormat.asInstanceOf[HiveOutputFormat[AnyRef,Writable]]
+    conf.value.getOutputFormat.asInstanceOf[HiveOutputFormat[AnyRef, Writable]]
 
   def driverSideSetup() {
     setIDs(0, 0, 0)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 2e06cabfa80c9..7c7afc824d7a6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -189,7 +189,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
     }
   }
 
-  case class TestTable(name: String, commands: (()=>Unit)*)
+  case class TestTable(name: String, commands: (() => Unit)*)
 
   protected[hive] implicit class SqlCmd(sql: String) {
     def cmd: () => Unit = {
@@ -253,8 +253,8 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
          |  'serialization.format'='${classOf[TBinaryProtocol].getName}'
          |)
          |STORED AS
-         |INPUTFORMAT '${classOf[SequenceFileInputFormat[_,_]].getName}'
-         |OUTPUTFORMAT '${classOf[SequenceFileOutputFormat[_,_]].getName}'
+         |INPUTFORMAT '${classOf[SequenceFileInputFormat[_, _]].getName}'
+         |OUTPUTFORMAT '${classOf[SequenceFileOutputFormat[_, _]].getName}'
         """.stripMargin)
 
       runSqlHive(

From 66c49ed60dcef48a6b38ae2d2c4c479933f3aa19 Mon Sep 17 00:00:00 2001
From: Kay Ousterhout <kayousterhout@gmail.com>
Date: Thu, 28 May 2015 19:04:32 -0700
Subject: [PATCH 222/525] [SPARK-7933] Remove Patrick's username/pw from merge
 script

Looks like this was added by accident when pwendell merged a commit back in September: fe2b1d6a209db9fe96b1c6630677955b94bd48c9

Author: Kay Ousterhout <kayousterhout@gmail.com>

Closes #6485 from kayousterhout/SPARK-7933 and squashes the following commits:

7c6164a [Kay Ousterhout] [SPARK-7933] Remove Patrick's username/pw from merge script
---
 dev/merge_spark_pr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index 787c5cc8e892d..cd83b352c1bfb 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -44,9 +44,9 @@
 # Remote name which points to Apache git
 PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "apache")
 # ASF JIRA username
-JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "pwendell")
+JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "")
 # ASF JIRA password
-JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "35500")
+JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "")
 
 GITHUB_BASE = "https://github.com/apache/spark/pull"
 GITHUB_API_BASE = "https://api.github.com/repos/apache/spark"

From 9b692bfdfcc91b32498865d21138cf215a378665 Mon Sep 17 00:00:00 2001
From: Takuya UESHIN <ueshin@happy-camper.st>
Date: Thu, 28 May 2015 19:05:12 -0700
Subject: [PATCH 223/525] [SPARK-7826] [CORE] Suppress extra calling
 getCacheLocs.

There are too many extra call method `getCacheLocs` for `DAGScheduler`, which includes Akka communication.
To improve `DAGScheduler` performance, suppress extra calling the method.

In my application with over 1200 stages, the execution time became 3.8 min from 8.5 min with my patch.

Author: Takuya UESHIN <ueshin@happy-camper.st>

Closes #6352 from ueshin/issues/SPARK-7826 and squashes the following commits:

3d4d036 [Takuya UESHIN] Modify a test and the documentation.
10b1b22 [Takuya UESHIN] Simplify the unit test.
d858b59 [Takuya UESHIN] Move the storageLevel check inside the if (!cacheLocs.contains(rdd.id)) block.
6f3125c [Takuya UESHIN] Fix scalastyle.
b9c835c [Takuya UESHIN] Put the condition that checks if the RDD has uncached partition or not into variable for readability.
f87f2ec [Takuya UESHIN] Get cached locations from block manager only if the storage level of the RDD is not StorageLevel.NONE.
8248386 [Takuya UESHIN] Revert "Suppress extra calling getCacheLocs."
a4d944a [Takuya UESHIN] Add an unit test.
9a80fad [Takuya UESHIN] Suppress extra calling getCacheLocs.
---
 .../apache/spark/scheduler/DAGScheduler.scala | 15 +++++---
 .../spark/scheduler/DAGSchedulerSuite.scala   | 35 ++++++++++++++++---
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index a083be2448aa3..a2299e907c5ae 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -193,9 +193,15 @@ class DAGScheduler(
   def getCacheLocs(rdd: RDD[_]): Seq[Seq[TaskLocation]] = cacheLocs.synchronized {
     // Note: this doesn't use `getOrElse()` because this method is called O(num tasks) times
     if (!cacheLocs.contains(rdd.id)) {
-      val blockIds = rdd.partitions.indices.map(index => RDDBlockId(rdd.id, index)).toArray[BlockId]
-      val locs: Seq[Seq[TaskLocation]] = blockManagerMaster.getLocations(blockIds).map { bms =>
-        bms.map(bm => TaskLocation(bm.host, bm.executorId))
+      // Note: if the storage level is NONE, we don't need to get locations from block manager.
+      val locs: Seq[Seq[TaskLocation]] = if (rdd.getStorageLevel == StorageLevel.NONE) {
+        Seq.fill(rdd.partitions.size)(Nil)
+      } else {
+        val blockIds =
+          rdd.partitions.indices.map(index => RDDBlockId(rdd.id, index)).toArray[BlockId]
+        blockManagerMaster.getLocations(blockIds).map { bms =>
+          bms.map(bm => TaskLocation(bm.host, bm.executorId))
+        }
       }
       cacheLocs(rdd.id) = locs
     }
@@ -382,7 +388,8 @@ class DAGScheduler(
     def visit(rdd: RDD[_]) {
       if (!visited(rdd)) {
         visited += rdd
-        if (getCacheLocs(rdd).contains(Nil)) {
+        val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
+        if (rddHasUncachedPartitions) {
           for (dep <- rdd.dependencies) {
             dep match {
               case shufDep: ShuffleDependency[_, _, _] =>
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 6a8ae29aae675..46642236e454a 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -318,7 +318,7 @@ class DAGSchedulerSuite
   }
 
   test("cache location preferences w/ dependency") {
-    val baseRdd = new MyRDD(sc, 1, Nil)
+    val baseRdd = new MyRDD(sc, 1, Nil).cache()
     val finalRdd = new MyRDD(sc, 1, List(new OneToOneDependency(baseRdd)))
     cacheLocations(baseRdd.id -> 0) =
       Seq(makeBlockManagerId("hostA"), makeBlockManagerId("hostB"))
@@ -331,7 +331,7 @@ class DAGSchedulerSuite
   }
 
   test("regression test for getCacheLocs") {
-    val rdd = new MyRDD(sc, 3, Nil)
+    val rdd = new MyRDD(sc, 3, Nil).cache()
     cacheLocations(rdd.id -> 0) =
       Seq(makeBlockManagerId("hostA"), makeBlockManagerId("hostB"))
     cacheLocations(rdd.id -> 1) =
@@ -342,6 +342,33 @@ class DAGSchedulerSuite
     assert(locs === Seq(Seq("hostA", "hostB"), Seq("hostB", "hostC"), Seq("hostC", "hostD")))
   }
 
+  /**
+   * This test ensures that if a particular RDD is cached, RDDs earlier in the dependency chain
+   * are not computed. It constructs the following chain of dependencies:
+   * +---+ shuffle +---+    +---+    +---+
+   * | A |<--------| B |<---| C |<---| D |
+   * +---+         +---+    +---+    +---+
+   * Here, B is derived from A by performing a shuffle, C has a one-to-one dependency on B,
+   * and D similarly has a one-to-one dependency on C. If none of the RDDs were cached, this
+   * set of RDDs would result in a two stage job: one ShuffleMapStage, and a ResultStage that
+   * reads the shuffled data from RDD A. This test ensures that if C is cached, the scheduler
+   * doesn't perform a shuffle, and instead computes the result using a single ResultStage
+   * that reads C's cached data.
+   */
+  test("getMissingParentStages should consider all ancestor RDDs' cache statuses") {
+    val rddA = new MyRDD(sc, 1, Nil)
+    val rddB = new MyRDD(sc, 1, List(new ShuffleDependency(rddA, null)))
+    val rddC = new MyRDD(sc, 1, List(new OneToOneDependency(rddB))).cache()
+    val rddD = new MyRDD(sc, 1, List(new OneToOneDependency(rddC)))
+    cacheLocations(rddC.id -> 0) =
+      Seq(makeBlockManagerId("hostA"), makeBlockManagerId("hostB"))
+    submit(rddD, Array(0))
+    assert(scheduler.runningStages.size === 1)
+    // Make sure that the scheduler is running the final result stage.
+    // Because C is cached, the shuffle map stage to compute A does not need to be run.
+    assert(scheduler.runningStages.head.isInstanceOf[ResultStage])
+  }
+
   test("avoid exponential blowup when getting preferred locs list") {
     // Build up a complex dependency graph with repeated zip operations, without preferred locations
     var rdd: RDD[_] = new MyRDD(sc, 1, Nil)
@@ -678,9 +705,9 @@ class DAGSchedulerSuite
   }
 
   test("cached post-shuffle") {
-    val shuffleOneRdd = new MyRDD(sc, 2, Nil)
+    val shuffleOneRdd = new MyRDD(sc, 2, Nil).cache()
     val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null)
-    val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne))
+    val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)).cache()
     val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null)
     val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo))
     submit(finalRdd, Array(0))

From 04616b1a2f5244710b07ecbb404384ded893292c Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 28 May 2015 20:09:12 -0700
Subject: [PATCH 224/525] [SPARK-7927] [MLLIB] Enforce whitespace for more
 tokens in style checker

rxin

Author: Xiangrui Meng <meng@databricks.com>

Closes #6481 from mengxr/mllib-scalastyle and squashes the following commits:

3ca4d61 [Xiangrui Meng] revert scalastyle config
30961ba [Xiangrui Meng] adjust spaces in mllib/test
571b5c5 [Xiangrui Meng] fix spaces in mllib
---
 .../spark/ml/classification/OneVsRest.scala    | 10 +++++-----
 .../scala/org/apache/spark/ml/tree/Node.scala  |  4 ++--
 .../mllib/api/python/PythonMLLibAPI.scala      |  8 ++++----
 .../mllib/clustering/GaussianMixture.scala     |  2 +-
 .../spark/mllib/clustering/LDAOptimizer.scala  |  6 +++---
 .../org/apache/spark/mllib/feature/IDF.scala   |  2 +-
 .../spark/mllib/feature/StandardScaler.scala   |  2 +-
 .../apache/spark/mllib/feature/Word2Vec.scala  |  6 +++---
 .../mllib/linalg/distributed/RowMatrix.scala   |  2 +-
 .../apache/spark/mllib/random/RandomRDDs.scala |  2 +-
 .../mllib/regression/IsotonicRegression.scala  |  2 +-
 .../stat/MultivariateOnlineSummarizer.scala    |  2 +-
 .../spark/mllib/stat/test/ChiSqTest.scala      |  2 +-
 .../apache/spark/mllib/tree/DecisionTree.scala |  4 ++--
 .../mllib/tree/GradientBoostedTrees.scala      | 12 ++++++------
 .../apache/spark/mllib/tree/RandomForest.scala |  2 +-
 .../apache/spark/mllib/tree/model/Node.scala   |  8 ++++----
 .../spark/mllib/util/MFDataGenerator.scala     |  7 +++----
 .../spark/ml/feature/Word2VecSuite.scala       |  6 +++---
 .../spark/ml/tuning/CrossValidatorSuite.scala  | 12 +++++++++---
 .../mllib/api/python/PythonMLLibAPISuite.scala |  2 +-
 .../mllib/classification/NaiveBayesSuite.scala |  2 +-
 .../spark/mllib/classification/SVMSuite.scala  | 18 +++++++++---------
 .../spark/mllib/clustering/KMeansSuite.scala   |  4 ++--
 .../PowerIterationClusteringSuite.scala        |  2 ++
 .../evaluation/RegressionMetricsSuite.scala    |  4 ++--
 .../mllib/feature/StandardScalerSuite.scala    |  2 +-
 .../linalg/distributed/BlockMatrixSuite.scala  |  2 ++
 .../optimization/GradientDescentSuite.scala    |  2 +-
 .../spark/mllib/optimization/NNLSSuite.scala   |  2 ++
 .../spark/mllib/regression/LassoSuite.scala    | 10 ++++++----
 .../spark/mllib/stat/CorrelationSuite.scala    |  6 +++++-
 .../apache/spark/mllib/util/MLUtilsSuite.scala |  2 +-
 33 files changed, 88 insertions(+), 71 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index 36735cd834cd4..b8c7f3c5bc3b9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -70,7 +70,7 @@ private[ml] trait OneVsRestParams extends PredictorParams {
 final class OneVsRestModel private[ml] (
     override val uid: String,
     labelMetadata: Metadata,
-    val models: Array[_ <: ClassificationModel[_,_]])
+    val models: Array[_ <: ClassificationModel[_, _]])
   extends Model[OneVsRestModel] with OneVsRestParams {
 
   override def transformSchema(schema: StructType): StructType = {
@@ -104,17 +104,17 @@ final class OneVsRestModel private[ml] (
 
         // add temporary column to store intermediate scores and update
         val tmpColName = "mbc$tmp" + UUID.randomUUID().toString
-        val update: (Map[Int, Double], Vector) => Map[Int, Double]  =
+        val update: (Map[Int, Double], Vector) => Map[Int, Double] =
           (predictions: Map[Int, Double], prediction: Vector) => {
             predictions + ((index, prediction(1)))
           }
         val updateUdf = callUDF(update, mapType, col(accColName), col(rawPredictionCol))
-        val transformedDataset = model.transform(df).select(columns:_*)
+        val transformedDataset = model.transform(df).select(columns : _*)
         val updatedDataset = transformedDataset.withColumn(tmpColName, updateUdf)
         val newColumns = origCols ++ List(col(tmpColName))
 
         // switch out the intermediate column with the accumulator column
-        updatedDataset.select(newColumns:_*).withColumnRenamed(tmpColName, accColName)
+        updatedDataset.select(newColumns : _*).withColumnRenamed(tmpColName, accColName)
     }
 
     if (handlePersistence) {
@@ -190,7 +190,7 @@ final class OneVsRest(override val uid: String)
       val trainingDataset = multiclassLabeled.withColumn(labelColName, labelUDFWithNewMeta)
       val classifier = getClassifier
       classifier.fit(trainingDataset, classifier.labelCol -> labelColName)
-    }.toArray[ClassificationModel[_,_]]
+    }.toArray[ClassificationModel[_, _]]
 
     if (handlePersistence) {
       multiclassLabeled.unpersist()
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
index 6a84176efb086..4242154be14ce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
@@ -159,9 +159,9 @@ final class InternalNode private[ml] (
 
   override private[tree] def subtreeToString(indentFactor: Int = 0): String = {
     val prefix: String = " " * indentFactor
-    prefix + s"If (${InternalNode.splitToString(split, left=true)})\n" +
+    prefix + s"If (${InternalNode.splitToString(split, left = true)})\n" +
       leftChild.subtreeToString(indentFactor + 1) +
-      prefix + s"Else (${InternalNode.splitToString(split, left=false)})\n" +
+      prefix + s"Else (${InternalNode.splitToString(split, left = false)})\n" +
       rightChild.subtreeToString(indentFactor + 1)
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 2fa54df6fc2b2..65f30fdba7393 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -392,7 +392,7 @@ private[python] class PythonMLLibAPI extends Serializable {
       data: JavaRDD[Vector],
       wt: Vector,
       mu: Array[Object],
-      si: Array[Object]):  RDD[Vector]  = {
+      si: Array[Object]): RDD[Vector] = {
 
       val weight = wt.toArray
       val mean = mu.map(_.asInstanceOf[DenseVector])
@@ -428,7 +428,7 @@ private[python] class PythonMLLibAPI extends Serializable {
 
     if (seed != null) als.setSeed(seed)
 
-    val model =  als.run(ratingsJRDD.rdd)
+    val model = als.run(ratingsJRDD.rdd)
     new MatrixFactorizationModelWrapper(model)
   }
 
@@ -459,7 +459,7 @@ private[python] class PythonMLLibAPI extends Serializable {
 
     if (seed != null) als.setSeed(seed)
 
-    val model =  als.run(ratingsJRDD.rdd)
+    val model = als.run(ratingsJRDD.rdd)
     new MatrixFactorizationModelWrapper(model)
   }
 
@@ -1242,7 +1242,7 @@ private[spark] object SerDe extends Serializable {
   }
 
   /* convert RDD[Tuple2[,]] to RDD[Array[Any]] */
-  def fromTuple2RDD(rdd: RDD[(Any, Any)]): RDD[Array[Any]]  = {
+  def fromTuple2RDD(rdd: RDD[(Any, Any)]): RDD[Array[Any]] = {
     rdd.map(x => Array(x._1, x._2))
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index c88410ac0ff43..e9a23e40cc790 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -211,7 +211,7 @@ class GaussianMixture private (
 private object ExpectationSum {
   def zero(k: Int, d: Int): ExpectationSum = {
     new ExpectationSum(0.0, Array.fill(k)(0.0), 
-      Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d)))
+      Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d, d)))
   }
   
   // compute cluster contributions for each input point
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 6fa2fe053c6a4..8e5154b902d1d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -273,7 +273,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
    * Default: 1024, following the original Online LDA paper.
    */
   def setTau0(tau0: Double): this.type = {
-    require(tau0 > 0,  s"LDA tau0 must be positive, but was set to $tau0")
+    require(tau0 > 0, s"LDA tau0 must be positive, but was set to $tau0")
     this.tau0 = tau0
     this
   }
@@ -339,7 +339,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
 
   override private[clustering] def initialize(
       docs: RDD[(Long, Vector)],
-      lda: LDA):  OnlineLDAOptimizer = {
+      lda: LDA): OnlineLDAOptimizer = {
     this.k = lda.getK
     this.corpusSize = docs.count()
     this.vocabSize = docs.first()._2.size
@@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
    * uses digamma which is accurate but expensive.
    */
   private def dirichletExpectation(alpha: BDM[Double]): BDM[Double] = {
-    val rowSum =  sum(alpha(breeze.linalg.*, ::))
+    val rowSum = sum(alpha(breeze.linalg.*, ::))
     val digAlpha = digamma(alpha)
     val digRowSum = digamma(rowSum)
     val result = digAlpha(::, breeze.linalg.*) - digRowSum
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index a89eea0e21be2..efbfeb4059f5a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -144,7 +144,7 @@ private object IDF {
          * Since arrays are initialized to 0 by default,
          * we just omit changing those entries.
          */
-        if(df(j) >= minDocFreq) {
+        if (df(j) >= minDocFreq) {
           inv(j) = math.log((m + 1.0) / (df(j) + 1.0))
         }
         j += 1
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
index 6ae6917eae595..c73b8f258060d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
@@ -90,7 +90,7 @@ class StandardScalerModel (
 
   @DeveloperApi
   def setWithMean(withMean: Boolean): this.type = {
-    require(!(withMean && this.mean == null),"cannot set withMean to true while mean is null")
+    require(!(withMean && this.mean == null), "cannot set withMean to true while mean is null")
     this.withMean = withMean
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 9106b73dfcd76..466ae95859b82 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -49,7 +49,7 @@ private case class VocabWord(
   var cn: Int,
   var point: Array[Int],
   var code: Array[Int],
-  var codeLen:Int
+  var codeLen: Int
 )
 
 /**
@@ -469,7 +469,7 @@ class Word2VecModel private[mllib] (
     val norm1 = blas.snrm2(n, v1, 1)
     val norm2 = blas.snrm2(n, v2, 1)
     if (norm1 == 0 || norm2 == 0) return 0.0
-    blas.sdot(n, v1, 1, v2,1) / norm1 / norm2
+    blas.sdot(n, v1, 1, v2, 1) / norm1 / norm2
   }
 
   override protected def formatVersion = "1.0"
@@ -500,7 +500,7 @@ class Word2VecModel private[mllib] (
    */
   def findSynonyms(word: String, num: Int): Array[(String, Double)] = {
     val vector = transform(word)
-    findSynonyms(vector,num)
+    findSynonyms(vector, num)
   }
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index 9a89a6f3a515f..1626da9c3d2ee 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -219,7 +219,7 @@ class RowMatrix(
 
     val computeMode = mode match {
       case "auto" =>
-        if(k > 5000) {
+        if (k > 5000) {
           logWarning(s"computing svd with k=$k and n=$n, please check necessity")
         }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
index 8341bb86afd71..7db5a14fd45a5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
@@ -52,7 +52,7 @@ object RandomRDDs {
       numPartitions: Int = 0,
       seed: Long = Utils.random.nextLong()): RDD[Double] = {
     val uniform = new UniformGenerator()
-    randomRDD(sc, uniform,  size, numPartitionsOrDefault(sc, numPartitions), seed)
+    randomRDD(sc, uniform, size, numPartitionsOrDefault(sc, numPartitions), seed)
   }
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 3ea63dd8c0acd..96e50faca2b19 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -203,7 +203,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
   override def load(sc: SparkContext, path: String): IsotonicRegressionModel = {
     implicit val formats = DefaultFormats
     val (loadedClassName, version, metadata) = loadMetadata(sc, path)
-    val isotonic =  (metadata \ "isotonic").extract[Boolean]
+    val isotonic = (metadata \ "isotonic").extract[Boolean]
     val classNameV1_0 = SaveLoadV1_0.thisClassName
     (loadedClassName, version) match {
       case (className, "1.0") if className == classNameV1_0 =>
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
index 0b1755613aac4..d321cc554c1cc 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -70,7 +70,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     require(n == sample.size, s"Dimensions mismatch when adding new sample." +
       s" Expecting $n but got ${sample.size}.")
 
-    val localCurrMean= currMean
+    val localCurrMean = currMean
     val localCurrM2n = currM2n
     val localCurrM2 = currM2
     val localCurrL1 = currL1
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
index e597fce2babd1..23c8d7c7c8075 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
@@ -196,7 +196,7 @@ private[stat] object ChiSqTest extends Logging {
    * Pearson's independence test on the input contingency matrix.
    * TODO: optimize for SparseMatrix when it becomes supported.
    */
-  def chiSquaredMatrix(counts: Matrix, methodName:String = PEARSON.name): ChiSqTestResult = {
+  def chiSquaredMatrix(counts: Matrix, methodName: String = PEARSON.name): ChiSqTestResult = {
     val method = methodFromString(methodName)
     val numRows = counts.numRows
     val numCols = counts.numCols
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index dfe3a0b6913ef..cecd1fed896d5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -169,7 +169,7 @@ object DecisionTree extends Serializable with Logging {
       numClasses: Int,
       maxBins: Int,
       quantileCalculationStrategy: QuantileStrategy,
-      categoricalFeaturesInfo: Map[Int,Int]): DecisionTreeModel = {
+      categoricalFeaturesInfo: Map[Int, Int]): DecisionTreeModel = {
     val strategy = new Strategy(algo, impurity, maxDepth, numClasses, maxBins,
       quantileCalculationStrategy, categoricalFeaturesInfo)
     new DecisionTree(strategy).run(input)
@@ -768,7 +768,7 @@ object DecisionTree extends Serializable with Logging {
    */
   private def calculatePredictImpurity(
       leftImpurityCalculator: ImpurityCalculator,
-      rightImpurityCalculator: ImpurityCalculator): (Predict, Double) =  {
+      rightImpurityCalculator: ImpurityCalculator): (Predict, Double) = {
     val parentNodeAgg = leftImpurityCalculator.copy
     parentNodeAgg.add(rightImpurityCalculator)
     val predict = calculatePredict(parentNodeAgg)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index 1f779584dcffd..e3ddc7053693c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -60,12 +60,12 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
   def run(input: RDD[LabeledPoint]): GradientBoostedTreesModel = {
     val algo = boostingStrategy.treeStrategy.algo
     algo match {
-      case Regression => GradientBoostedTrees.boost(input, input, boostingStrategy, validate=false)
+      case Regression =>
+        GradientBoostedTrees.boost(input, input, boostingStrategy, validate = false)
       case Classification =>
         // Map labels to -1, +1 so binary classification can be treated as regression.
         val remappedInput = input.map(x => new LabeledPoint((x.label * 2) - 1, x.features))
-        GradientBoostedTrees.boost(remappedInput,
-          remappedInput, boostingStrategy, validate=false)
+        GradientBoostedTrees.boost(remappedInput, remappedInput, boostingStrategy, validate = false)
       case _ =>
         throw new IllegalArgumentException(s"$algo is not supported by the gradient boosting.")
     }
@@ -93,8 +93,8 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
       validationInput: RDD[LabeledPoint]): GradientBoostedTreesModel = {
     val algo = boostingStrategy.treeStrategy.algo
     algo match {
-      case Regression => GradientBoostedTrees.boost(
-        input, validationInput, boostingStrategy, validate=true)
+      case Regression =>
+        GradientBoostedTrees.boost(input, validationInput, boostingStrategy, validate = true)
       case Classification =>
         // Map labels to -1, +1 so binary classification can be treated as regression.
         val remappedInput = input.map(
@@ -102,7 +102,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
         val remappedValidationInput = validationInput.map(
           x => new LabeledPoint((x.label * 2) - 1, x.features))
         GradientBoostedTrees.boost(remappedInput, remappedValidationInput, boostingStrategy,
-          validate=true)
+          validate = true)
       case _ =>
         throw new IllegalArgumentException(s"$algo is not supported by the gradient boosting.")
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index b347c450c1aa8..99d0e3cf2fd6d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -249,7 +249,7 @@ private class RandomForest (
       try {
         nodeIdCache.get.deleteAllCheckpoints()
       } catch {
-        case e:IOException =>
+        case e: IOException =>
           logWarning(s"delete all checkpoints failed. Error reason: ${e.getMessage}")
       }
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
index 431a839817eac..ee710fc1ed299 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
@@ -151,9 +151,9 @@ class Node (
           s"(feature ${split.feature} > ${split.threshold})"
         }
         case Categorical => if (left) {
-          s"(feature ${split.feature} in ${split.categories.mkString("{",",","}")})"
+          s"(feature ${split.feature} in ${split.categories.mkString("{", ",", "}")})"
         } else {
-          s"(feature ${split.feature} not in ${split.categories.mkString("{",",","}")})"
+          s"(feature ${split.feature} not in ${split.categories.mkString("{", ",", "}")})"
         }
       }
     }
@@ -161,9 +161,9 @@ class Node (
     if (isLeaf) {
       prefix + s"Predict: ${predict.predict}\n"
     } else {
-      prefix + s"If ${splitToString(split.get, left=true)}\n" +
+      prefix + s"If ${splitToString(split.get, left = true)}\n" +
         leftNode.get.subtreeToString(indentFactor + 1) +
-        prefix + s"Else ${splitToString(split.get, left=false)}\n" +
+        prefix + s"Else ${splitToString(split.get, left = false)}\n" +
         rightNode.get.subtreeToString(indentFactor + 1)
     }
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
index 0c5b4f9d04a74..bd73a866c8a82 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
@@ -82,8 +82,7 @@ object MFDataGenerator {
     BLAS.gemm(z, A, B, 1.0, fullData)
 
     val df = rank * (m + n - rank)
-    val sampSize = scala.math.min(scala.math.round(trainSampFact * df),
-      scala.math.round(.99 * m * n)).toInt
+    val sampSize = math.min(math.round(trainSampFact * df), math.round(.99 * m * n)).toInt
     val rand = new Random()
     val mn = m * n
     val shuffled = rand.shuffle((0 until mn).toList)
@@ -102,8 +101,8 @@ object MFDataGenerator {
 
     // optionally generate testing data
     if (test) {
-      val testSampSize = scala.math
-        .min(scala.math.round(sampSize * testSampFact),scala.math.round(mn - sampSize)).toInt
+      val testSampSize = math.min(
+        math.round(sampSize * testSampFact), math.round(mn - sampSize)).toInt
       val testOmega = shuffled.slice(sampSize, sampSize + testSampSize)
       val testOrdered = testOmega.sortWith(_ < _).toArray
       val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 43a09cc418703..df446d0c22015 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -35,9 +35,9 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
     val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" "))
 
     val codes = Map(
-      "a" -> Array(-0.2811822295188904,-0.6356269121170044,-0.3020961284637451),
-      "b" -> Array(1.0309048891067505,-1.29472815990448,0.22276712954044342),
-      "c" -> Array(-0.08456747233867645,0.5137411952018738,0.11731560528278351)
+      "a" -> Array(-0.2811822295188904, -0.6356269121170044, -0.3020961284637451),
+      "b" -> Array(1.0309048891067505, -1.29472815990448, 0.22276712954044342),
+      "c" -> Array(-0.08456747233867645, 0.5137411952018738, 0.11731560528278351)
     )
 
     val expected = doc.map { sentence =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 65972ec79b9a5..60d8bfe38fb13 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -90,14 +90,20 @@ object CrossValidatorSuite {
 
     override def validateParams(): Unit = require($(inputCol).nonEmpty)
 
-    override def fit(dataset: DataFrame): MyModel = ???
+    override def fit(dataset: DataFrame): MyModel = {
+      throw new UnsupportedOperationException
+    }
 
-    override def transformSchema(schema: StructType): StructType = ???
+    override def transformSchema(schema: StructType): StructType = {
+      throw new UnsupportedOperationException
+    }
   }
 
   class MyEvaluator extends Evaluator {
 
-    override def evaluate(dataset: DataFrame): Double = ???
+    override def evaluate(dataset: DataFrame): Double = {
+      throw new UnsupportedOperationException
+    }
 
     override val uid: String = "eval"
   }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala
index a629dba8a426f..3d362b5ee53ea 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala
@@ -84,7 +84,7 @@ class PythonMLLibAPISuite extends FunSuite {
 
     val smt = new SparseMatrix(
       3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
-      isTransposed=true)
+      isTransposed = true)
     val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
     assert(smt.toArray === nsmt.toArray)
   }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index c111a78a55806..ea40b41bbbe5e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -163,7 +163,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
     val theta = Array(
       Array(0.50, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.40), // label 0
       Array(0.02, 0.70, 0.10, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02), // label 1
-      Array(0.02, 0.02, 0.60, 0.02,  0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30)  // label 2
+      Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30)  // label 2
     ).map(_.map(math.log))
 
     val testData = NaiveBayesSuite.generateNaiveBayesInput(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
index 6de098b383ba3..90f9cec6855bf 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
@@ -46,7 +46,7 @@ object SVMSuite {
     nPoints: Int,
     seed: Int): Seq[LabeledPoint] = {
     val rnd = new Random(seed)
-    val weightsMat = new DoubleMatrix(1, weights.length, weights:_*)
+    val weightsMat = new DoubleMatrix(1, weights.length, weights : _*)
     val x = Array.fill[Array[Double]](nPoints)(
         Array.fill[Double](weights.length)(rnd.nextDouble() * 2.0 - 1.0))
     val y = x.map { xi =>
@@ -91,7 +91,7 @@ class SVMSuite extends FunSuite with MLlibTestSparkContext {
     val model = svm.run(testRDD)
 
     val validationData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 17)
-    val validationRDD  = sc.parallelize(validationData, 2)
+    val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
 
@@ -117,7 +117,7 @@ class SVMSuite extends FunSuite with MLlibTestSparkContext {
     val B = -1.5
     val C = 1.0
 
-    val testData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 42)
+    val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42)
 
     val testRDD = sc.parallelize(testData, 2)
     testRDD.cache()
@@ -127,8 +127,8 @@ class SVMSuite extends FunSuite with MLlibTestSparkContext {
 
     val model = svm.run(testRDD)
 
-    val validationData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 17)
-    val validationRDD  = sc.parallelize(validationData, 2)
+    val validationData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 17)
+    val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
     validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData)
@@ -145,7 +145,7 @@ class SVMSuite extends FunSuite with MLlibTestSparkContext {
     val B = -1.5
     val C = 1.0
 
-    val testData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 42)
+    val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42)
 
     val initialB = -1.0
     val initialC = -1.0
@@ -159,8 +159,8 @@ class SVMSuite extends FunSuite with MLlibTestSparkContext {
 
     val model = svm.run(testRDD, initialWeights)
 
-    val validationData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 17)
-    val validationRDD  = sc.parallelize(validationData,2)
+    val validationData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 17)
+    val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
     validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData)
@@ -177,7 +177,7 @@ class SVMSuite extends FunSuite with MLlibTestSparkContext {
     val B = -1.5
     val C = 1.0
 
-    val testData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 42)
+    val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42)
     val testRDD = sc.parallelize(testData, 2)
 
     val testRDDInvalid = testRDD.map { lp =>
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
index 0f2b26d462ad2..877e6dc699523 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
@@ -75,7 +75,7 @@ class KMeansSuite extends FunSuite with MLlibTestSparkContext {
     val center = Vectors.dense(1.0, 2.0, 3.0)
 
     // Make sure code runs.
-    var model = KMeans.train(data, k=2, maxIterations=1)
+    var model = KMeans.train(data, k = 2, maxIterations = 1)
     assert(model.clusterCenters.size === 2)
   }
 
@@ -87,7 +87,7 @@ class KMeansSuite extends FunSuite with MLlibTestSparkContext {
       2)
 
     // Make sure code runs.
-    var model = KMeans.train(data, k=3, maxIterations=1)
+    var model = KMeans.train(data, k = 3, maxIterations = 1)
     assert(model.clusterCenters.size === 3)
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
index 6d6fe6fe46bab..556842f3129a3 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
@@ -94,11 +94,13 @@ class PowerIterationClusteringSuite extends FunSuite with MLlibTestSparkContext
      */
     val similarities = Seq[(Long, Long, Double)](
       (0, 1, 1.0), (0, 2, 1.0), (0, 3, 1.0), (1, 2, 1.0), (2, 3, 1.0))
+    // scalastyle:off
     val expected = Array(
       Array(0.0,     1.0/3.0, 1.0/3.0, 1.0/3.0),
       Array(1.0/2.0,     0.0, 1.0/2.0,     0.0),
       Array(1.0/3.0, 1.0/3.0,     0.0, 1.0/3.0),
       Array(1.0/2.0,     0.0, 1.0/2.0,     0.0))
+    // scalastyle:on
     val w = normalize(sc.parallelize(similarities, 2))
     w.edges.collect().foreach { case Edge(i, j, x) =>
       assert(x ~== expected(i.toInt)(j.toInt) absTol 1e-14)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
index 670b4c34e6095..3aa732474ec2e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
@@ -26,7 +26,7 @@ class RegressionMetricsSuite extends FunSuite with MLlibTestSparkContext {
 
   test("regression metrics") {
     val predictionAndObservations = sc.parallelize(
-      Seq((2.5,3.0),(0.0,-0.5),(2.0,2.0),(8.0,7.0)), 2)
+      Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2)
     val metrics = new RegressionMetrics(predictionAndObservations)
     assert(metrics.explainedVariance ~== 0.95717 absTol 1E-5,
       "explained variance regression score mismatch")
@@ -39,7 +39,7 @@ class RegressionMetricsSuite extends FunSuite with MLlibTestSparkContext {
 
   test("regression metrics with complete fitting") {
     val predictionAndObservations = sc.parallelize(
-      Seq((3.0,3.0),(0.0,0.0),(2.0,2.0),(8.0,8.0)), 2)
+      Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0)), 2)
     val metrics = new RegressionMetrics(predictionAndObservations)
     assert(metrics.explainedVariance ~== 1.0 absTol 1E-5,
       "explained variance regression score mismatch")
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala
index 7f94564b2a3ae..1eb991869de40 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala
@@ -360,7 +360,7 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
     }
     withClue("model needs std and mean vectors to be equal size when both are provided") {
       intercept[IllegalArgumentException] {
-        val model = new StandardScalerModel(Vectors.dense(0.0), Vectors.dense(0.0,1.0))
+        val model = new StandardScalerModel(Vectors.dense(0.0), Vectors.dense(0.0, 1.0))
       }
     }
   }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
index 949d1c9939570..a58336175899c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
@@ -57,11 +57,13 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
     val random = new ju.Random()
     // This should generate a 4x4 grid of 1x2 blocks.
     val part0 = GridPartitioner(4, 7, suggestedNumPartitions = 12)
+    // scalastyle:off
     val expected0 = Array(
       Array(0, 0, 4, 4,  8,  8, 12),
       Array(1, 1, 5, 5,  9,  9, 13),
       Array(2, 2, 6, 6, 10, 10, 14),
       Array(3, 3, 7, 7, 11, 11, 15))
+    // scalastyle:on
     for (i <- 0 until 4; j <- 0 until 7) {
       assert(part0.getPartition((i, j)) === expected0(i)(j))
       assert(part0.getPartition((i, j, random.nextInt())) === expected0(i)(j))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
index 86481c6e66200..e110506d579b0 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
@@ -42,7 +42,7 @@ object GradientDescentSuite {
       offset: Double,
       scale: Double,
       nPoints: Int,
-      seed: Int): Seq[LabeledPoint]  = {
+      seed: Int): Seq[LabeledPoint] = {
     val rnd = new Random(seed)
     val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian())
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/NNLSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/NNLSSuite.scala
index 22855e4e8f247..bb723fc471181 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/NNLSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/NNLSSuite.scala
@@ -68,12 +68,14 @@ class NNLSSuite extends FunSuite {
 
   test("NNLS: nonnegativity constraint active") {
     val n = 5
+    // scalastyle:off
     val ata = new DoubleMatrix(Array(
       Array( 4.377, -3.531, -1.306, -0.139,  3.418),
       Array(-3.531,  4.344,  0.934,  0.305, -2.140),
       Array(-1.306,  0.934,  2.644, -0.203, -0.170),
       Array(-0.139,  0.305, -0.203,  5.883,  1.428),
       Array( 3.418, -2.140, -0.170,  1.428,  4.684)))
+    // scalastyle:on
     val atb = new DoubleMatrix(Array(-1.632, 2.115, 1.094, -1.025, -0.636))
 
     val goodx = Array(0.13025, 0.54506, 0.2874, 0.0, 0.028628)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
index c9f5dc069ef2e..71dce50922991 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
@@ -67,11 +67,12 @@ class LassoSuite extends FunSuite with MLlibTestSparkContext {
     assert(weight1 >= -1.60 && weight1 <= -1.40, weight1 + " not in [-1.6, -1.4]")
     assert(weight2 >= -1.0e-3 && weight2 <= 1.0e-3, weight2 + " not in [-0.001, 0.001]")
 
-    val validationData = LinearDataGenerator.generateLinearInput(A, Array[Double](B,C), nPoints, 17)
+    val validationData = LinearDataGenerator
+      .generateLinearInput(A, Array[Double](B, C), nPoints, 17)
       .map { case LabeledPoint(label, features) =>
       LabeledPoint(label, Vectors.dense(1.0 +: features.toArray))
     }
-    val validationRDD  = sc.parallelize(validationData, 2)
+    val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
     validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData)
@@ -110,11 +111,12 @@ class LassoSuite extends FunSuite with MLlibTestSparkContext {
     assert(weight1 >= -1.60 && weight1 <= -1.40, weight1 + " not in [-1.6, -1.4]")
     assert(weight2 >= -1.0e-3 && weight2 <= 1.0e-3, weight2 + " not in [-0.001, 0.001]")
 
-    val validationData = LinearDataGenerator.generateLinearInput(A, Array[Double](B,C), nPoints, 17)
+    val validationData = LinearDataGenerator
+      .generateLinearInput(A, Array[Double](B, C), nPoints, 17)
       .map { case LabeledPoint(label, features) =>
       LabeledPoint(label, Vectors.dense(1.0 +: features.toArray))
     }
-    val validationRDD  = sc.parallelize(validationData,2)
+    val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
     validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
index d20a09b4b4925..a7e6fce31ff7e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
@@ -96,11 +96,13 @@ class CorrelationSuite extends FunSuite with MLlibTestSparkContext {
     val X = sc.parallelize(data)
     val defaultMat = Statistics.corr(X)
     val pearsonMat = Statistics.corr(X, "pearson")
+    // scalastyle:off
     val expected = BDM(
       (1.00000000, 0.05564149, Double.NaN, 0.4004714),
       (0.05564149, 1.00000000, Double.NaN, 0.9135959),
       (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
-      (0.40047142, 0.91359586, Double.NaN,1.0000000))
+      (0.40047142, 0.91359586, Double.NaN, 1.0000000))
+    // scalastyle:on
     assert(matrixApproxEqual(defaultMat.toBreeze, expected))
     assert(matrixApproxEqual(pearsonMat.toBreeze, expected))
   }
@@ -108,11 +110,13 @@ class CorrelationSuite extends FunSuite with MLlibTestSparkContext {
   test("corr(X) spearman") {
     val X = sc.parallelize(data)
     val spearmanMat = Statistics.corr(X, "spearman")
+    // scalastyle:off
     val expected = BDM(
       (1.0000000,  0.1054093,  Double.NaN, 0.4000000),
       (0.1054093,  1.0000000,  Double.NaN, 0.9486833),
       (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
       (0.4000000,  0.9486833,  Double.NaN, 1.0000000))
+    // scalastyle:on
     assert(matrixApproxEqual(spearmanMat.toBreeze, expected))
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
index 668fc1d43c5d6..cdece2c174be4 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -168,7 +168,7 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext {
             "Each training+validation set combined should contain all of the data.")
         }
         // K fold cross validation should only have each element in the validation set exactly once
-        assert(foldedRdds.map(_._2).reduce((x,y) => x.union(y)).collect().sorted ===
+        assert(foldedRdds.map(_._2).reduce((x, y) => x.union(y)).collect().sorted ===
           data.collect().sorted)
       }
     }

From ff44c711abc7ca545dfa1e836279c00fe7539c18 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 20:10:21 -0700
Subject: [PATCH 225/525] [SPARK-7927] whitespace fixes for SQL core.

So we can enable a whitespace enforcement rule in the style checker to save code review time.

Author: Reynold Xin <rxin@databricks.com>

Closes #6477 from rxin/whitespace-sql-core and squashes the following commits:

ce6e369 [Reynold Xin] Fixed tests.
6095fed [Reynold Xin] [SPARK-7927] whitespace fixes for SQL core.
---
 .../scala/org/apache/spark/sql/Column.scala   |  4 +-
 .../org/apache/spark/sql/DataFrame.scala      | 18 ++++----
 .../apache/spark/sql/DataFrameHolder.scala    |  2 +-
 .../org/apache/spark/sql/GroupedData.scala    | 10 ++---
 .../org/apache/spark/sql/SQLContext.scala     |  2 +-
 .../org/apache/spark/sql/SparkSQLParser.scala | 18 ++++----
 .../columnar/InMemoryColumnarTableScan.scala  |  2 +-
 .../apache/spark/sql/execution/Exchange.scala |  2 +-
 .../spark/sql/execution/SparkStrategies.scala |  7 +--
 .../joins/BroadcastLeftSemiJoinHash.scala     |  2 +-
 .../sql/execution/stat/FrequentItems.scala    |  4 +-
 .../org/apache/spark/sql/functions.scala      |  2 +-
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   | 44 ++++++++++---------
 .../apache/spark/sql/json/InferSchema.scala   |  2 +-
 .../spark/sql/json/JacksonGenerator.scala     | 10 ++---
 .../org/apache/spark/sql/json/JsonRDD.scala   |  8 ++--
 .../spark/sql/parquet/ParquetConverter.scala  | 12 ++---
 .../sql/parquet/ParquetTableOperations.scala  |  4 +-
 .../spark/sql/parquet/ParquetTypes.scala      |  2 +-
 .../org/apache/spark/sql/sources/ddl.scala    |  8 ++--
 .../spark/sql/ColumnExpressionSuite.scala     | 14 +++---
 .../spark/sql/DataFrameAggregateSuite.scala   |  4 +-
 .../org/apache/spark/sql/DataFrameSuite.scala | 38 ++++++++--------
 .../org/apache/spark/sql/JoinSuite.scala      |  8 ++--
 .../apache/spark/sql/ListTablesSuite.scala    |  2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 42 +++++++++---------
 .../sql/ScalaReflectionRelationSuite.scala    |  4 +-
 .../scala/org/apache/spark/sql/TestData.scala |  4 +-
 .../scala/org/apache/spark/sql/UDFSuite.scala |  2 +-
 .../spark/sql/columnar/ColumnTypeSuite.scala  |  6 +--
 .../compression/DictionaryEncodingSuite.scala |  4 +-
 .../compression/IntegralDeltaSuite.scala      |  4 +-
 .../compression/RunLengthEncodingSuite.scala  | 10 ++---
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala |  4 +-
 .../org/apache/spark/sql/json/JsonSuite.scala |  2 +-
 .../spark/sql/sources/DDLTestSuite.scala      |  5 +--
 .../spark/sql/sources/FilteredScanSuite.scala |  2 +-
 37 files changed, 160 insertions(+), 158 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index 6895aa1010956..b49b1d327289f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -349,7 +349,7 @@ class Column(protected[sql] val expr: Expression) extends Logging {
    * @group expr_ops
    * @since 1.4.0
    */
-  def when(condition: Column, value: Any):Column = this.expr match {
+  def when(condition: Column, value: Any): Column = this.expr match {
     case CaseWhen(branches: Seq[Expression]) =>
       CaseWhen(branches ++ Seq(lit(condition).expr, lit(value).expr))
     case _ =>
@@ -378,7 +378,7 @@ class Column(protected[sql] val expr: Expression) extends Logging {
    * @group expr_ops
    * @since 1.4.0
    */
-  def otherwise(value: Any):Column = this.expr match {
+  def otherwise(value: Any): Column = this.expr match {
     case CaseWhen(branches: Seq[Expression]) =>
       if (branches.size % 2 == 0) {
         CaseWhen(branches :+ lit(value).expr)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index f968577bc5848..e90109446b642 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -255,7 +255,7 @@ class DataFrame private[sql](
     val newCols = logicalPlan.output.zip(colNames).map { case (oldAttribute, newName) =>
       Column(oldAttribute).as(newName)
     }
-    select(newCols :_*)
+    select(newCols : _*)
   }
 
   /**
@@ -500,7 +500,7 @@ class DataFrame private[sql](
    */
   @scala.annotation.varargs
   def sort(sortCol: String, sortCols: String*): DataFrame = {
-    sort((sortCol +: sortCols).map(apply) :_*)
+    sort((sortCol +: sortCols).map(apply) : _*)
   }
 
   /**
@@ -531,7 +531,7 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def orderBy(sortCol: String, sortCols: String*): DataFrame = sort(sortCol, sortCols :_*)
+  def orderBy(sortCol: String, sortCols: String*): DataFrame = sort(sortCol, sortCols : _*)
 
   /**
    * Returns a new [[DataFrame]] sorted by the given expressions.
@@ -540,7 +540,7 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def orderBy(sortExprs: Column*): DataFrame = sort(sortExprs :_*)
+  def orderBy(sortExprs: Column*): DataFrame = sort(sortExprs : _*)
 
   /**
    * Selects column based on the column name and return it as a [[Column]].
@@ -611,7 +611,7 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def select(col: String, cols: String*): DataFrame = select((col +: cols).map(Column(_)) :_*)
+  def select(col: String, cols: String*): DataFrame = select((col +: cols).map(Column(_)) : _*)
 
   /**
    * Selects a set of SQL expressions. This is a variant of `select` that accepts
@@ -825,7 +825,7 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   def agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame = {
-    groupBy().agg(aggExpr, aggExprs :_*)
+    groupBy().agg(aggExpr, aggExprs : _*)
   }
 
   /**
@@ -863,7 +863,7 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def agg(expr: Column, exprs: Column*): DataFrame = groupBy().agg(expr, exprs :_*)
+  def agg(expr: Column, exprs: Column*): DataFrame = groupBy().agg(expr, exprs : _*)
 
   /**
    * Returns a new [[DataFrame]] by taking the first `n` rows. The difference between this function
@@ -1039,7 +1039,7 @@ class DataFrame private[sql](
         val name = field.name
         if (resolver(name, colName)) col.as(colName) else Column(name)
       }
-      select(colNames :_*)
+      select(colNames : _*)
     } else {
       select(Column("*"), col.as(colName))
     }
@@ -1262,7 +1262,7 @@ class DataFrame private[sql](
    * @group action
    * @since 1.3.0
    */
-  override def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() :_*)
+  override def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() : _*)
 
   /**
    * Returns the number of rows in the [[DataFrame]].
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameHolder.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameHolder.scala
index b87efb58d51e5..2f19ec0403017 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameHolder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameHolder.scala
@@ -28,5 +28,5 @@ private[sql] case class DataFrameHolder(df: DataFrame) {
   // `rdd.toDF("1")` as invoking this toDF and then apply on the returned DataFrame.
   def toDF(): DataFrame = df
 
-  def toDF(colNames: String*): DataFrame = df.toDF(colNames :_*)
+  def toDF(colNames: String*): DataFrame = df.toDF(colNames : _*)
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index f730e4ae00e2b..516ba2ac23371 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -247,7 +247,7 @@ class GroupedData protected[sql](
    */
   @scala.annotation.varargs
   def mean(colNames: String*): DataFrame = {
-    aggregateNumericColumns(colNames:_*)(Average)
+    aggregateNumericColumns(colNames : _*)(Average)
   }
  
   /**
@@ -259,7 +259,7 @@ class GroupedData protected[sql](
    */
   @scala.annotation.varargs
   def max(colNames: String*): DataFrame = {
-    aggregateNumericColumns(colNames:_*)(Max)
+    aggregateNumericColumns(colNames : _*)(Max)
   }
 
   /**
@@ -271,7 +271,7 @@ class GroupedData protected[sql](
    */
   @scala.annotation.varargs
   def avg(colNames: String*): DataFrame = {
-    aggregateNumericColumns(colNames:_*)(Average)
+    aggregateNumericColumns(colNames : _*)(Average)
   }
 
   /**
@@ -283,7 +283,7 @@ class GroupedData protected[sql](
    */
   @scala.annotation.varargs
   def min(colNames: String*): DataFrame = {
-    aggregateNumericColumns(colNames:_*)(Min)
+    aggregateNumericColumns(colNames : _*)(Min)
   }
 
   /**
@@ -295,6 +295,6 @@ class GroupedData protected[sql](
    */
   @scala.annotation.varargs
   def sum(colNames: String*): DataFrame = {
-    aggregateNumericColumns(colNames:_*)(Sum)
+    aggregateNumericColumns(colNames : _*)(Sum)
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 15c30352bee69..a32897c20b474 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -298,7 +298,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
      */
     implicit class StringToColumn(val sc: StringContext) {
       def $(args: Any*): ColumnName = {
-        new ColumnName(sc.s(args :_*))
+        new ColumnName(sc.s(args : _*))
       }
     }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
index 6b1ae81972e4e..305b306a79871 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
@@ -54,15 +54,15 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr
     }
   }
 
-  protected val AS      = Keyword("AS")
-  protected val CACHE   = Keyword("CACHE")
-  protected val CLEAR   = Keyword("CLEAR")
-  protected val IN      = Keyword("IN")
-  protected val LAZY    = Keyword("LAZY")
-  protected val SET     = Keyword("SET")
-  protected val SHOW    = Keyword("SHOW")
-  protected val TABLE   = Keyword("TABLE")
-  protected val TABLES  = Keyword("TABLES")
+  protected val AS = Keyword("AS")
+  protected val CACHE = Keyword("CACHE")
+  protected val CLEAR = Keyword("CLEAR")
+  protected val IN = Keyword("IN")
+  protected val LAZY = Keyword("LAZY")
+  protected val SET = Keyword("SET")
+  protected val SHOW = Keyword("SHOW")
+  protected val TABLE = Keyword("TABLE")
+  protected val TABLES = Keyword("TABLES")
   protected val UNCACHE = Keyword("UNCACHE")
 
   override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | show | others
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
index a59d42cdd6028..3db26fad2b92f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
@@ -236,7 +236,7 @@ private[sql] case class InMemoryColumnarTableScan(
     case GreaterThanOrEqual(a: AttributeReference, l: Literal) => l <= statsFor(a).upperBound
     case GreaterThanOrEqual(l: Literal, a: AttributeReference) => statsFor(a).lowerBound <= l
 
-    case IsNull(a: Attribute)    => statsFor(a).nullCount > 0
+    case IsNull(a: Attribute) => statsFor(a).nullCount > 0
     case IsNotNull(a: Attribute) => statsFor(a).count - statsFor(a).nullCount > 0
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index 3e46596ecf6ac..f25d10fec0411 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -296,7 +296,7 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
           .sliding(2)
           .map {
             case Seq(a) => true
-            case Seq(a,b) => a compatibleWith b
+            case Seq(a, b) => a.compatibleWith(b)
           }.exists(!_)
 
       // Adds Exchange or Sort operators as required
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 3f6a0345bc17d..d0a1ad00560d3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -243,8 +243,9 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
                 case (predicate, None) => predicate
                 // Filter needs to be applied above when it contains partitioning
                 // columns
-                case (predicate, _) if(!predicate.references.map(_.name).toSet
-                  .intersect (partitionColNames).isEmpty) => predicate
+                case (predicate, _)
+                  if !predicate.references.map(_.name).toSet.intersect(partitionColNames).isEmpty =>
+                  predicate
               }
             }
           } else {
@@ -270,7 +271,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
           projectList,
           filters,
           identity[Seq[Expression]], // All filters still need to be evaluated.
-          InMemoryColumnarTableScan(_,  filters, mem)) :: Nil
+          InMemoryColumnarTableScan(_, filters, mem)) :: Nil
       case _ => Nil
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
index 640fc26ba3baa..a32e5fc4f7ea4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
@@ -39,7 +39,7 @@ case class BroadcastLeftSemiJoinHash(
   override def output: Seq[Attribute] = left.output
 
   protected override def doExecute(): RDD[Row] = {
-    val buildIter= buildPlan.execute().map(_.copy()).collect().toIterator
+    val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator
     val hashSet = new java.util.HashSet[Row]()
     var currentRow: Row = null
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
index 5ae7e107544f8..fe8a81e3d0434 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
@@ -89,7 +89,7 @@ private[sql] object FrequentItems extends Logging {
       (name, originalSchema.fields(index).dataType)
     }
     
-    val freqItems = df.select(cols.map(Column(_)):_*).rdd.aggregate(countMaps)(
+    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
       seqOp = (counts, row) => {
         var i = 0
         while (i < numCols) {
@@ -110,7 +110,7 @@ private[sql] object FrequentItems extends Logging {
       }
     )
     val justItems = freqItems.map(m => m.baseMap.keys.toSeq)
-    val resultRow = Row(justItems:_*)
+    val resultRow = Row(justItems : _*)
     // append frequent Items to the column name for easy debugging
     val outputCols = colInfo.map { v =>
       StructField(v._1 + "_freqItems", ArrayType(v._2, false))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 9a23cfb89ca12..6dc17bbb2e768 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -187,7 +187,7 @@ object functions {
    */
   @scala.annotation.varargs
   def countDistinct(columnName: String, columnNames: String*): Column =
-    countDistinct(Column(columnName), columnNames.map(Column.apply) :_*)
+    countDistinct(Column(columnName), columnNames.map(Column.apply) : _*)
 
   /**
    * Aggregate function: returns the approximate number of distinct items in a group.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 88f1b02549e21..0bdb68e8ac845 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -52,6 +52,7 @@ private[sql] object JDBCRDD extends Logging {
       scale: Int,
       signed: Boolean): DataType = {
     val answer = sqlType match {
+      // scalastyle:off
       case java.sql.Types.ARRAY         => null
       case java.sql.Types.BIGINT        => if (signed) { LongType } else { DecimalType.Unlimited }
       case java.sql.Types.BINARY        => BinaryType
@@ -92,7 +93,8 @@ private[sql] object JDBCRDD extends Logging {
       case java.sql.Types.TINYINT       => IntegerType
       case java.sql.Types.VARBINARY     => BinaryType
       case java.sql.Types.VARCHAR       => StringType
-      case _ => null
+      case _                            => null
+      // scalastyle:on
     }
 
     if (answer == null) throw new SQLException("Unsupported type " + sqlType)
@@ -323,19 +325,19 @@ private[sql] class JDBCRDD(
    */
   def getConversions(schema: StructType): Array[JDBCConversion] = {
     schema.fields.map(sf => sf.dataType match {
-      case BooleanType           => BooleanConversion
-      case DateType              => DateConversion
+      case BooleanType => BooleanConversion
+      case DateType => DateConversion
       case DecimalType.Unlimited => DecimalConversion(None)
-      case DecimalType.Fixed(d)  => DecimalConversion(Some(d))
-      case DoubleType            => DoubleConversion
-      case FloatType             => FloatConversion
-      case IntegerType           => IntegerConversion
-      case LongType              =>
+      case DecimalType.Fixed(d) => DecimalConversion(Some(d))
+      case DoubleType => DoubleConversion
+      case FloatType => FloatConversion
+      case IntegerType => IntegerConversion
+      case LongType =>
         if (sf.metadata.contains("binarylong")) BinaryLongConversion else LongConversion
-      case StringType            => StringConversion
-      case TimestampType         => TimestampConversion
-      case BinaryType            => BinaryConversion
-      case _                     => throw new IllegalArgumentException(s"Unsupported field $sf")
+      case StringType => StringConversion
+      case TimestampType => TimestampConversion
+      case BinaryType => BinaryConversion
+      case _ => throw new IllegalArgumentException(s"Unsupported field $sf")
     }).toArray
   }
 
@@ -376,8 +378,8 @@ private[sql] class JDBCRDD(
         while (i < conversions.length) {
           val pos = i + 1
           conversions(i) match {
-            case BooleanConversion    => mutableRow.setBoolean(i, rs.getBoolean(pos))
-            case DateConversion       =>
+            case BooleanConversion => mutableRow.setBoolean(i, rs.getBoolean(pos))
+            case DateConversion =>
               // DateUtils.fromJavaDate does not handle null value, so we need to check it.
               val dateVal = rs.getDate(pos)
               if (dateVal != null) {
@@ -407,14 +409,14 @@ private[sql] class JDBCRDD(
               } else {
                 mutableRow.update(i, Decimal(decimalVal))
               }
-            case DoubleConversion     => mutableRow.setDouble(i, rs.getDouble(pos))
-            case FloatConversion      => mutableRow.setFloat(i, rs.getFloat(pos))
-            case IntegerConversion    => mutableRow.setInt(i, rs.getInt(pos))
-            case LongConversion       => mutableRow.setLong(i, rs.getLong(pos))
+            case DoubleConversion => mutableRow.setDouble(i, rs.getDouble(pos))
+            case FloatConversion => mutableRow.setFloat(i, rs.getFloat(pos))
+            case IntegerConversion => mutableRow.setInt(i, rs.getInt(pos))
+            case LongConversion => mutableRow.setLong(i, rs.getLong(pos))
             // TODO(davies): use getBytes for better performance, if the encoding is UTF-8
-            case StringConversion     => mutableRow.setString(i, rs.getString(pos))
-            case TimestampConversion  => mutableRow.update(i, rs.getTimestamp(pos))
-            case BinaryConversion     => mutableRow.update(i, rs.getBytes(pos))
+            case StringConversion => mutableRow.setString(i, rs.getString(pos))
+            case TimestampConversion => mutableRow.update(i, rs.getTimestamp(pos))
+            case BinaryConversion => mutableRow.update(i, rs.getBytes(pos))
             case BinaryLongConversion => {
               val bytes = rs.getBytes(pos)
               var ans = 0L
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
index 9c58b8e4bb16a..06aa19ef09bd2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
@@ -124,7 +124,7 @@ private[sql] object InferSchema {
           case ArrayType(NullType, containsNull) => ArrayType(StringType, containsNull)
           case ArrayType(struct: StructType, containsNull) =>
             ArrayType(nullTypeToStringType(struct), containsNull)
-          case struct: StructType =>nullTypeToStringType(struct)
+          case struct: StructType => nullTypeToStringType(struct)
           case other: DataType => other
         }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala
index 80bf74aa02602..325f54b6808a8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala
@@ -33,7 +33,7 @@ private[sql] object JacksonGenerator {
     */
   def apply(rowSchema: StructType, gen: JsonGenerator)(row: Row): Unit = {
     def valWriter: (DataType, Any) => Unit = {
-      case (_, null) | (NullType, _)  => gen.writeNull()
+      case (_, null) | (NullType, _) => gen.writeNull()
       case (StringType, v: String) => gen.writeString(v)
       case (TimestampType, v: java.sql.Timestamp) => gen.writeString(v.toString)
       case (IntegerType, v: Int) => gen.writeNumber(v)
@@ -48,16 +48,16 @@ private[sql] object JacksonGenerator {
       case (DateType, v) => gen.writeString(v.toString)
       case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, udt.serialize(v))
 
-      case (ArrayType(ty, _), v: Seq[_] ) =>
+      case (ArrayType(ty, _), v: Seq[_]) =>
         gen.writeStartArray()
-        v.foreach(valWriter(ty,_))
+        v.foreach(valWriter(ty, _))
         gen.writeEndArray()
 
-      case (MapType(kv,vv, _), v: Map[_,_]) =>
+      case (MapType(kv, vv, _), v: Map[_, _]) =>
         gen.writeStartObject()
         v.foreach { p =>
           gen.writeFieldName(p._1.toString)
-          valWriter(vv,p._2)
+          valWriter(vv, p._2)
         }
         gen.writeEndObject()
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index 037a6d60a2ed6..95eb1174b1dd6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -141,7 +141,7 @@ private[sql] object JsonRDD extends Logging {
           case ArrayType(NullType, containsNull) => ArrayType(StringType, containsNull)
           case ArrayType(struct: StructType, containsNull) =>
             ArrayType(nullTypeToStringType(struct), containsNull)
-          case struct: StructType =>nullTypeToStringType(struct)
+          case struct: StructType => nullTypeToStringType(struct)
           case other: DataType => other
         }
         StructField(fieldName, newType, nullable)
@@ -216,7 +216,7 @@ private[sql] object JsonRDD extends Logging {
           case map: Map[_, _] => StructType(Nil)
           // We have an array of arrays. If those element arrays do not have the same
           // element types, we will return ArrayType[StringType].
-          case seq: Seq[_] =>  typeOfArray(seq)
+          case seq: Seq[_] => typeOfArray(seq)
           case value => typeOfPrimitiveValue(value)
         }
       }.reduce((type1: DataType, type2: DataType) => compatibleType(type1, type2))
@@ -406,7 +406,7 @@ private[sql] object JsonRDD extends Logging {
     }
   }
 
-  private[json] def enforceCorrectType(value: Any, desiredType: DataType): Any ={
+  private[json] def enforceCorrectType(value: Any, desiredType: DataType): Any = {
     if (value == null) {
       null
     } else {
@@ -434,7 +434,7 @@ private[sql] object JsonRDD extends Logging {
     }
   }
 
-  private def asRow(json: Map[String,Any], schema: StructType): Row = {
+  private def asRow(json: Map[String, Any], schema: StructType): Row = {
     // TODO: Reuse the row instead of creating a new one for every record.
     val row = new GenericMutableRow(schema.fields.length)
     schema.fields.zipWithIndex.foreach {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index 36cb5e03bbca7..1b4196ab0be35 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -480,7 +480,7 @@ private[parquet] class CatalystPrimitiveStringConverter(parent: CatalystConverte
 
   override def hasDictionarySupport: Boolean = true
 
-  override def setDictionary(dictionary: Dictionary):Unit =
+  override def setDictionary(dictionary: Dictionary): Unit =
     dict = Array.tabulate(dictionary.getMaxId + 1) { dictionary.decodeToBinary(_).getBytes }
 
   override def addValueFromDictionary(dictionaryId: Int): Unit =
@@ -591,8 +591,8 @@ private[parquet] class CatalystArrayConverter(
       CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
       elementType,
       false),
-    fieldIndex=0,
-    parent=this)
+    fieldIndex = 0,
+    parent = this)
 
   override def getConverter(fieldIndex: Int): Converter = converter
 
@@ -601,7 +601,7 @@ private[parquet] class CatalystArrayConverter(
 
   override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
     // fieldIndex is ignored (assumed to be zero but not checked)
-    if(value == null) {
+    if (value == null) {
       throw new IllegalArgumentException("Null values inside Parquet arrays are not supported!")
     }
     buffer += value
@@ -654,8 +654,8 @@ private[parquet] class CatalystNativeArrayConverter(
       CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
       elementType,
       false),
-    fieldIndex=0,
-    parent=this)
+    fieldIndex = 0,
+    parent = this)
 
   override def getConverter(fieldIndex: Int): Converter = converter
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 90950f924a054..cb7ae246d0d75 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -541,7 +541,7 @@ private[parquet] class FilteringParquetRowInputFormat
     val splits = mutable.ArrayBuffer.empty[ParquetInputSplit]
     val filter: Filter = ParquetInputFormat.getFilter(configuration)
     var rowGroupsDropped: Long = 0
-    var totalRowGroups: Long  = 0
+    var totalRowGroups: Long = 0
 
     // Ugly hack, stuck with it until PR:
     // https://github.com/apache/incubator-parquet-mr/pull/17
@@ -664,7 +664,7 @@ private[parquet] object FileSystemHelper {
         s"ParquetTableOperations: path $path does not exist or is not a directory")
     }
     fs.globStatus(path)
-      .flatMap { status => if(status.isDir) fs.listStatus(status.getPath) else List(status) }
+      .flatMap { status => if (status.isDir) fs.listStatus(status.getPath) else List(status) }
       .map(_.getPath)
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
index 1dc819b5d7b9b..6698b19c7477d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
@@ -489,7 +489,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
     val children =
       fs
         .globStatus(path)
-        .flatMap { status => if(status.isDir) fs.listStatus(status.getPath) else List(status) }
+        .flatMap { status => if (status.isDir) fs.listStatus(status.getPath) else List(status) }
         .filterNot { status =>
           val name = status.getPath.getName
           (name(0) == '.' || name(0) == '_') && name != ParquetFileWriter.PARQUET_METADATA_FILE
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index ca30b8e74626f..22587f5a1c6f1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -130,7 +130,7 @@ private[sql] class DDLParser(
         }
     }
 
-  protected lazy val tableCols: Parser[Seq[StructField]] =  "(" ~> repsep(column, ",") <~ ")"
+  protected lazy val tableCols: Parser[Seq[StructField]] = "(" ~> repsep(column, ",") <~ ")"
 
   /*
    * describe [extended] table avroTable
@@ -138,7 +138,7 @@ private[sql] class DDLParser(
    */
   protected lazy val describeTable: Parser[LogicalPlan] =
     (DESCRIBE ~> opt(EXTENDED)) ~ (ident <~ ".").? ~ ident  ^^ {
-      case e ~ db ~ tbl  =>
+      case e ~ db ~ tbl =>
         val tblIdentifier = db match {
           case Some(dbName) =>
             Seq(dbName, tbl)
@@ -171,7 +171,7 @@ private[sql] class DDLParser(
   }
 
   protected lazy val pair: Parser[(String, String)] =
-    optionName ~ stringLit ^^ { case k ~ v => (k,v) }
+    optionName ~ stringLit ^^ { case k ~ v => (k, v) }
 
   protected lazy val column: Parser[StructField] =
     ident ~ dataType ~ (COMMENT ~> stringLit).?  ^^ { case columnName ~ typ ~ cm =>
@@ -239,7 +239,7 @@ private[sql] object ResolvedDataSource {
             Some(partitionColumnsSchema(schema, partitionColumns))
           }
 
-          val caseInsensitiveOptions= new CaseInsensitiveMap(options)
+          val caseInsensitiveOptions = new CaseInsensitiveMap(options)
           val paths = {
             val patternPath = new Path(caseInsensitiveOptions("path"))
             SparkHadoopUtil.get.globPath(patternPath).map(_.toString).toArray
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 9bdf201b3be7c..d006b83fc075a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -28,14 +28,14 @@ class ColumnExpressionSuite extends QueryTest {
   import org.apache.spark.sql.TestData._
 
   test("single explode") {
-    val df = Seq((1, Seq(1,2,3))).toDF("a", "intList")
+    val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
     checkAnswer(
       df.select(explode('intList)),
       Row(1) :: Row(2) :: Row(3) :: Nil)
   }
 
   test("explode and other columns") {
-    val df = Seq((1, Seq(1,2,3))).toDF("a", "intList")
+    val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
 
     checkAnswer(
       df.select($"a", explode('intList)),
@@ -45,13 +45,13 @@ class ColumnExpressionSuite extends QueryTest {
 
     checkAnswer(
       df.select($"*", explode('intList)),
-      Row(1, Seq(1,2,3), 1) ::
-      Row(1, Seq(1,2,3), 2) ::
-      Row(1, Seq(1,2,3), 3) :: Nil)
+      Row(1, Seq(1, 2, 3), 1) ::
+      Row(1, Seq(1, 2, 3), 2) ::
+      Row(1, Seq(1, 2, 3), 3) :: Nil)
   }
 
   test("aliased explode") {
-    val df = Seq((1, Seq(1,2,3))).toDF("a", "intList")
+    val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
 
     checkAnswer(
       df.select(explode('intList).as('int)).select('int),
@@ -79,7 +79,7 @@ class ColumnExpressionSuite extends QueryTest {
   }
 
   test("self join explode") {
-    val df = Seq((1, Seq(1,2,3))).toDF("a", "intList")
+    val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
     val exploded = df.select(explode('intList).as('i))
 
     checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 35a574f354741..232f05c00918f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -148,12 +148,12 @@ class DataFrameAggregateSuite extends QueryTest {
   test("null count") {
     checkAnswer(
       testData3.groupBy('a).agg(count('b)),
-      Seq(Row(1,0), Row(2, 1))
+      Seq(Row(1, 0), Row(2, 1))
     )
 
     checkAnswer(
       testData3.groupBy('a).agg(count('a + 'b)),
-      Seq(Row(1,0), Row(2, 1))
+      Seq(Row(1, 0), Row(2, 1))
     )
 
     checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 0dcba80ef2a20..a4fd1058afce5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -59,7 +59,7 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("rename nested groupby") {
-    val df = Seq((1,(1,1))).toDF()
+    val df = Seq((1, (1, 1))).toDF()
 
     checkAnswer(
       df.groupBy("_1").agg(sum("_2._1")).toDF("key", "total"),
@@ -211,23 +211,23 @@ class DataFrameSuite extends QueryTest {
   test("global sorting") {
     checkAnswer(
       testData2.orderBy('a.asc, 'b.asc),
-      Seq(Row(1,1), Row(1,2), Row(2,1), Row(2,2), Row(3,1), Row(3,2)))
+      Seq(Row(1, 1), Row(1, 2), Row(2, 1), Row(2, 2), Row(3, 1), Row(3, 2)))
 
     checkAnswer(
       testData2.orderBy(asc("a"), desc("b")),
-      Seq(Row(1,2), Row(1,1), Row(2,2), Row(2,1), Row(3,2), Row(3,1)))
+      Seq(Row(1, 2), Row(1, 1), Row(2, 2), Row(2, 1), Row(3, 2), Row(3, 1)))
 
     checkAnswer(
       testData2.orderBy('a.asc, 'b.desc),
-      Seq(Row(1,2), Row(1,1), Row(2,2), Row(2,1), Row(3,2), Row(3,1)))
+      Seq(Row(1, 2), Row(1, 1), Row(2, 2), Row(2, 1), Row(3, 2), Row(3, 1)))
 
     checkAnswer(
       testData2.orderBy('a.desc, 'b.desc),
-      Seq(Row(3,2), Row(3,1), Row(2,2), Row(2,1), Row(1,2), Row(1,1)))
+      Seq(Row(3, 2), Row(3, 1), Row(2, 2), Row(2, 1), Row(1, 2), Row(1, 1)))
 
     checkAnswer(
       testData2.orderBy('a.desc, 'b.asc),
-      Seq(Row(3,1), Row(3,2), Row(2,1), Row(2,2), Row(1,1), Row(1,2)))
+      Seq(Row(3, 1), Row(3, 2), Row(2, 1), Row(2, 2), Row(1, 1), Row(1, 2)))
 
     checkAnswer(
       arrayData.toDF().orderBy('data.getItem(0).asc),
@@ -331,7 +331,7 @@ class DataFrameSuite extends QueryTest {
     checkAnswer(
       df,
       testData.collect().toSeq)
-    assert(df.schema.map(_.name) === Seq("key","value"))
+    assert(df.schema.map(_.name) === Seq("key", "value"))
   }
 
   test("withColumnRenamed") {
@@ -364,24 +364,24 @@ class DataFrameSuite extends QueryTest {
 
   test("describe") {
     val describeTestData = Seq(
-      ("Bob",   16, 176),
+      ("Bob", 16, 176),
       ("Alice", 32, 164),
       ("David", 60, 192),
-      ("Amy",   24, 180)).toDF("name", "age", "height")
+      ("Amy", 24, 180)).toDF("name", "age", "height")
 
     val describeResult = Seq(
-      Row("count",   "4",               "4"),
-      Row("mean",    "33.0",            "178.0"),
-      Row("stddev",  "16.583123951777", "10.0"),
-      Row("min",     "16",              "164"),
-      Row("max",     "60",              "192"))
+      Row("count", "4", "4"),
+      Row("mean", "33.0", "178.0"),
+      Row("stddev", "16.583123951777", "10.0"),
+      Row("min", "16", "164"),
+      Row("max", "60", "192"))
 
     val emptyDescribeResult = Seq(
-      Row("count",   "0",  "0"),
-      Row("mean",    null, null),
-      Row("stddev",  null, null),
-      Row("min",     null, null),
-      Row("max",     null, null))
+      Row("count", "0", "0"),
+      Row("mean", null, null),
+      Row("stddev", null, null),
+      Row("min", null, null),
+      Row("max", null, null))
 
     def getSchemaAsSeq(df: DataFrame): Seq[String] = df.schema.map(_.name)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index 037d392c1f929..407c789657834 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -167,10 +167,10 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
     val y = testData2.where($"a" === 1).as("y")
     checkAnswer(
       x.join(y).where($"x.a" === $"y.a"),
-      Row(1,1,1,1) ::
-      Row(1,1,1,2) ::
-      Row(1,2,1,1) ::
-      Row(1,2,1,2) :: Nil
+      Row(1, 1, 1, 1) ::
+      Row(1, 1, 1, 2) ::
+      Row(1, 2, 1, 1) ::
+      Row(1, 2, 1, 2) :: Nil
     )
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala
index f9f41eb358bd5..3ce97c3fffdb4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala
@@ -28,7 +28,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter {
   import org.apache.spark.sql.test.TestSQLContext.implicits._
 
   val df =
-    sparkContext.parallelize((1 to 10).map(i => (i,s"str$i"))).toDF("key", "value")
+    sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")
 
   before {
     df.registerTempTable("ListTablesSuiteTable")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 7c47fe454b6dc..bf18bf854aa4a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -53,7 +53,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("self join with aliases") {
-    Seq(1,2,3).map(i => (i, i.toString)).toDF("int", "str").registerTempTable("df")
+    Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str").registerTempTable("df")
 
     checkAnswer(
       sql(
@@ -76,7 +76,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("self join with alias in agg") {
-      Seq(1,2,3)
+      Seq(1, 2, 3)
         .map(i => (i, i.toString))
         .toDF("int", "str")
         .groupBy("str")
@@ -113,7 +113,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   test("SPARK-4625 support SORT BY in SimpleSQLParser & DSL") {
     checkAnswer(
       sql("SELECT a FROM testData2 SORT BY a"),
-      Seq(1, 1, 2 ,2 ,3 ,3).map(Row(_))
+      Seq(1, 1, 2, 2, 3, 3).map(Row(_))
     )
   }
 
@@ -354,7 +354,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   test("left semi greater than predicate") {
     checkAnswer(
       sql("SELECT * FROM testData2 x LEFT SEMI JOIN testData2 y ON x.a >= y.a + 2"),
-      Seq(Row(3,1), Row(3,2))
+      Seq(Row(3, 1), Row(3, 2))
     )
   }
 
@@ -371,16 +371,16 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   test("agg") {
     checkAnswer(
       sql("SELECT a, SUM(b) FROM testData2 GROUP BY a"),
-      Seq(Row(1,3), Row(2,3), Row(3,3)))
+      Seq(Row(1, 3), Row(2, 3), Row(3, 3)))
   }
 
   test("literal in agg grouping expressions") {
     checkAnswer(
       sql("SELECT a, count(1) FROM testData2 GROUP BY a, 1"),
-      Seq(Row(1,2), Row(2,2), Row(3,2)))
+      Seq(Row(1, 2), Row(2, 2), Row(3, 2)))
     checkAnswer(
       sql("SELECT a, count(2) FROM testData2 GROUP BY a, 2"),
-      Seq(Row(1,2), Row(2,2), Row(3,2)))
+      Seq(Row(1, 2), Row(2, 2), Row(3, 2)))
   }
 
   test("aggregates with nulls") {
@@ -405,19 +405,19 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   def sortTest(): Unit = {
     checkAnswer(
       sql("SELECT * FROM testData2 ORDER BY a ASC, b ASC"),
-      Seq(Row(1,1), Row(1,2), Row(2,1), Row(2,2), Row(3,1), Row(3,2)))
+      Seq(Row(1, 1), Row(1, 2), Row(2, 1), Row(2, 2), Row(3, 1), Row(3, 2)))
 
     checkAnswer(
       sql("SELECT * FROM testData2 ORDER BY a ASC, b DESC"),
-      Seq(Row(1,2), Row(1,1), Row(2,2), Row(2,1), Row(3,2), Row(3,1)))
+      Seq(Row(1, 2), Row(1, 1), Row(2, 2), Row(2, 1), Row(3, 2), Row(3, 1)))
 
     checkAnswer(
       sql("SELECT * FROM testData2 ORDER BY a DESC, b DESC"),
-      Seq(Row(3,2), Row(3,1), Row(2,2), Row(2,1), Row(1,2), Row(1,1)))
+      Seq(Row(3, 2), Row(3, 1), Row(2, 2), Row(2, 1), Row(1, 2), Row(1, 1)))
 
     checkAnswer(
       sql("SELECT * FROM testData2 ORDER BY a DESC, b ASC"),
-      Seq(Row(3,1), Row(3,2), Row(2,1), Row(2,2), Row(1,1), Row(1,2)))
+      Seq(Row(3, 1), Row(3, 2), Row(2, 1), Row(2, 2), Row(1, 1), Row(1, 2)))
 
     checkAnswer(
       sql("SELECT b FROM binaryData ORDER BY a ASC"),
@@ -552,7 +552,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   test("average overflow") {
     checkAnswer(
       sql("SELECT AVG(a),b FROM largeAndSmallInts group by b"),
-      Seq(Row(2147483645.0,1), Row(2.0,2)))
+      Seq(Row(2147483645.0, 1), Row(2.0, 2)))
   }
 
   test("count") {
@@ -619,10 +619,10 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
         |  (SELECT * FROM testData2 WHERE a = 1) x JOIN
         |  (SELECT * FROM testData2 WHERE a = 1) y
         |WHERE x.a = y.a""".stripMargin),
-      Row(1,1,1,1) ::
-      Row(1,1,1,2) ::
-      Row(1,2,1,1) ::
-      Row(1,2,1,2) :: Nil)
+      Row(1, 1, 1, 1) ::
+      Row(1, 1, 1, 2) ::
+      Row(1, 2, 1, 1) ::
+      Row(1, 2, 1, 2) :: Nil)
   }
 
   test("inner join, no matches") {
@@ -1266,22 +1266,22 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   test("SPARK-4432 Fix attribute reference resolution error when using ORDER BY") {
     checkAnswer(
       sql("SELECT a + b FROM testData2 ORDER BY a"),
-      Seq(2, 3, 3 ,4 ,4 ,5).map(Row(_))
+      Seq(2, 3, 3, 4, 4, 5).map(Row(_))
     )
   }
 
   test("oder by asc by default when not specify ascending and descending") {
     checkAnswer(
       sql("SELECT a, b FROM testData2 ORDER BY a desc, b"),
-      Seq(Row(3, 1), Row(3, 2), Row(2, 1), Row(2,2), Row(1, 1), Row(1, 2))
+      Seq(Row(3, 1), Row(3, 2), Row(2, 1), Row(2, 2), Row(1, 1), Row(1, 2))
     )
   }
 
   test("Supporting relational operator '<=>' in Spark SQL") {
-    val nullCheckData1 = TestData(1,"1") :: TestData(2,null) :: Nil
+    val nullCheckData1 = TestData(1, "1") :: TestData(2, null) :: Nil
     val rdd1 = sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i)))
     rdd1.toDF().registerTempTable("nulldata1")
-    val nullCheckData2 = TestData(1,"1") :: TestData(2,null) :: Nil
+    val nullCheckData2 = TestData(1, "1") :: TestData(2, null) :: Nil
     val rdd2 = sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i)))
     rdd2.toDF().registerTempTable("nulldata2")
     checkAnswer(sql("SELECT nulldata1.key FROM nulldata1 join " +
@@ -1290,7 +1290,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("Multi-column COUNT(DISTINCT ...)") {
-    val data = TestData(1,"val_1") :: TestData(2,"val_2") :: Nil
+    val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil
     val rdd = sparkContext.parallelize((0 to 1).map(i => data(i)))
     rdd.toDF().registerTempTable("distinctData")
     checkAnswer(sql("SELECT COUNT(DISTINCT key,value) FROM distinctData"), Row(2))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
index 3fa00fd9d0ccb..52d265b445e14 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
@@ -80,14 +80,14 @@ class ScalaReflectionRelationSuite extends FunSuite {
 
   test("query case class RDD") {
     val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true,
-      new java.math.BigDecimal(1), new Date(12345), new Timestamp(12345), Seq(1,2,3))
+      new java.math.BigDecimal(1), new Date(12345), new Timestamp(12345), Seq(1, 2, 3))
     val rdd = sparkContext.parallelize(data :: Nil)
     rdd.toDF().registerTempTable("reflectData")
 
     assert(sql("SELECT * FROM reflectData").collect().head ===
       Row("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true,
         new java.math.BigDecimal(1), Date.valueOf("1970-01-01"),
-        new Timestamp(12345), Seq(1,2,3)))
+        new Timestamp(12345), Seq(1, 2, 3)))
   }
 
   test("query case class RDD with nulls") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
index 8fbc2d23d47e6..725a18bfae3a7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
@@ -109,8 +109,8 @@ object TestData {
   case class ArrayData(data: Seq[Int], nestedData: Seq[Seq[Int]])
   val arrayData =
     TestSQLContext.sparkContext.parallelize(
-      ArrayData(Seq(1,2,3), Seq(Seq(1,2,3))) ::
-      ArrayData(Seq(2,3,4), Seq(Seq(2,3,4))) :: Nil)
+      ArrayData(Seq(1, 2, 3), Seq(Seq(1, 2, 3))) ::
+      ArrayData(Seq(2, 3, 4), Seq(Seq(2, 3, 4))) :: Nil)
   arrayData.toDF().registerTempTable("arrayData")
 
   case class MapData(data: scala.collection.Map[Int, String])
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index d615542ab50a7..1a9ba66416b21 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -38,7 +38,7 @@ class UDFSuite extends QueryTest {
   }
 
   test("TwoArgument UDF") {
-    udf.register("strLenScala", (_: String).length + (_:Int))
+    udf.register("strLenScala", (_: String).length + (_: Int))
     assert(sql("SELECT strLenScala('test', 1)").head().getInt(0) === 5)
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
index 1e105e259dce7..061efb37a0ac3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
@@ -73,7 +73,7 @@ class ColumnTypeSuite extends FunSuite with Logging {
     checkActualSize(TIMESTAMP, new Timestamp(0L), 12)
 
     val binary = Array.fill[Byte](4)(0: Byte)
-    checkActualSize(BINARY,  binary, 4 + 4)
+    checkActualSize(BINARY, binary, 4 + 4)
 
     val generic = Map(1 -> "a")
     checkActualSize(GENERIC, SparkSqlSerializer.serialize(generic), 4 + 8)
@@ -167,7 +167,7 @@ class ColumnTypeSuite extends FunSuite with Logging {
     val serializer = new SparkSqlSerializer(conf).newInstance()
 
     val buffer = ByteBuffer.allocate(512)
-    val obj = CustomClass(Int.MaxValue,Long.MaxValue)
+    val obj = CustomClass(Int.MaxValue, Long.MaxValue)
     val serializedObj = serializer.serialize(obj).array()
 
     GENERIC.append(serializer.serialize(obj).array(), buffer)
@@ -278,7 +278,7 @@ private[columnar] object CustomerSerializer extends Serializer[CustomClass] {
   override def read(kryo: Kryo, input: Input, aClass: Class[CustomClass]): CustomClass = {
     val a = input.readInt()
     val b = input.readLong()
-    CustomClass(a,b)
+    CustomClass(a, b)
   }
 }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/DictionaryEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/DictionaryEncodingSuite.scala
index 64b70552eb047..cef60ec204faa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/DictionaryEncodingSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/DictionaryEncodingSuite.scala
@@ -27,8 +27,8 @@ import org.apache.spark.sql.columnar.ColumnarTestUtils._
 import org.apache.spark.sql.types.AtomicType
 
 class DictionaryEncodingSuite extends FunSuite {
-  testDictionaryEncoding(new IntColumnStats,    INT)
-  testDictionaryEncoding(new LongColumnStats,   LONG)
+  testDictionaryEncoding(new IntColumnStats, INT)
+  testDictionaryEncoding(new LongColumnStats, LONG)
   testDictionaryEncoding(new StringColumnStats, STRING)
 
   def testDictionaryEncoding[T <: AtomicType](
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/IntegralDeltaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/IntegralDeltaSuite.scala
index bfd99f143bedc..5514590541dd6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/IntegralDeltaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/IntegralDeltaSuite.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.columnar.ColumnarTestUtils._
 import org.apache.spark.sql.types.IntegralType
 
 class IntegralDeltaSuite extends FunSuite {
-  testIntegralDelta(new IntColumnStats,  INT,  IntDelta)
+  testIntegralDelta(new IntColumnStats, INT, IntDelta)
   testIntegralDelta(new LongColumnStats, LONG, LongDelta)
 
   def testIntegralDelta[I <: IntegralType](
@@ -116,7 +116,7 @@ class IntegralDeltaSuite extends FunSuite {
 
     test(s"$scheme: simple case") {
       val input = columnType match {
-        case INT  => Seq(2: Int,  1: Int,  2: Int,  130: Int)
+        case INT => Seq(2: Int, 1: Int, 2: Int, 130: Int)
         case LONG => Seq(2: Long, 1: Long, 2: Long, 130: Long)
       }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/RunLengthEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/RunLengthEncodingSuite.scala
index fde7a4595be0e..6ee48f6291914 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/RunLengthEncodingSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/RunLengthEncodingSuite.scala
@@ -26,11 +26,11 @@ import org.apache.spark.sql.types.AtomicType
 
 class RunLengthEncodingSuite extends FunSuite {
   testRunLengthEncoding(new NoopColumnStats, BOOLEAN)
-  testRunLengthEncoding(new ByteColumnStats,    BYTE)
-  testRunLengthEncoding(new ShortColumnStats,   SHORT)
-  testRunLengthEncoding(new IntColumnStats,     INT)
-  testRunLengthEncoding(new LongColumnStats,    LONG)
-  testRunLengthEncoding(new StringColumnStats,  STRING)
+  testRunLengthEncoding(new ByteColumnStats, BYTE)
+  testRunLengthEncoding(new ShortColumnStats, SHORT)
+  testRunLengthEncoding(new IntColumnStats, INT)
+  testRunLengthEncoding(new LongColumnStats, LONG)
+  testRunLengthEncoding(new StringColumnStats, STRING)
 
   def testRunLengthEncoding[T <: AtomicType](
       columnStats: ColumnStats,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 347f28351fd72..30279f528944b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -429,8 +429,8 @@ class JDBCSuite extends FunSuite with BeforeAndAfter {
     }, testH2Dialect))
     assert(agg.canHandle("jdbc:h2:xxx"))
     assert(!agg.canHandle("jdbc:h2"))
-    assert(agg.getCatalystType(0,"",1,null) == Some(LongType))
-    assert(agg.getCatalystType(1,"",1,null) == Some(StringType))
+    assert(agg.getCatalystType(0, "", 1, null) == Some(LongType))
+    assert(agg.getCatalystType(1, "", 1, null) == Some(StringType))
   }
 
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 7e6eeba17752a..f8d62f9e7e02b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -522,7 +522,7 @@ class JsonSuite extends QueryTest {
       Row(Seq(), "11", "[1,2,3]", Row(null), "[]") ::
         Row(null, """{"field":false}""", null, null, "{}") ::
         Row(Seq(4, 5, 6), null, "str", Row(null), "[7,8,9]") ::
-        Row(Seq(7), "{}","""["str1","str2",33]""", Row("str"), """{"field":true}""") :: Nil
+        Row(Seq(7), "{}", """["str1","str2",33]""", Row("str"), """{"field":true}""") :: Nil
     )
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
index f5106f67a08df..5c3467158a01b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
@@ -43,7 +43,7 @@ case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlCo
       StructField("bigintType", LongType, nullable = false),
       StructField("tinyintType", ByteType, nullable = false),
       StructField("decimalType", DecimalType.Unlimited, nullable = false),
-      StructField("fixedDecimalType", DecimalType(5,1), nullable = false),
+      StructField("fixedDecimalType", DecimalType(5, 1), nullable = false),
       StructField("binaryType", BinaryType, nullable = false),
       StructField("booleanType", BooleanType, nullable = false),
       StructField("smallIntType", ShortType, nullable = false),
@@ -51,8 +51,7 @@ case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlCo
       StructField("mapType", MapType(StringType, StringType)),
       StructField("arrayType", ArrayType(StringType)),
       StructField("structType",
-        StructType(StructField("f1",StringType) ::
-          (StructField("f2",IntegerType)) :: Nil
+        StructType(StructField("f1", StringType) :: StructField("f2", IntegerType) :: Nil
         )
       )
     ))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala
index cce747e7dbf64..db94b1f3e8926 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala
@@ -154,7 +154,7 @@ class FilteredScanSuite extends DataSourceTest {
 
   sqlTest(
     "SELECT a, b FROM oneToTenFiltered WHERE a IN (1,3,5)",
-    Seq(1,3,5).map(i => Row(i, i * 2)))
+    Seq(1, 3, 5).map(i => Row(i, i * 2)))
 
   sqlTest(
     "SELECT a, b FROM oneToTenFiltered WHERE A = 1",

From 2881d14cbedc14f1cd8ae5078446dba1a8d39086 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 20:11:04 -0700
Subject: [PATCH 226/525] [SPARK-7929] Remove Bagel examples & whitespace fix
 for examples.

Author: Reynold Xin <rxin@databricks.com>

Closes #6480 from rxin/whitespace-example and squashes the following commits:

8a4a3d4 [Reynold Xin] [SPARK-7929] Remove Bagel examples & whitespace fix for examples.
---
 .../spark/examples/CassandraCQLTest.scala     |   6 +-
 .../org/apache/spark/examples/LocalLR.scala   |   2 +-
 .../org/apache/spark/examples/SparkALS.scala  |   2 +-
 .../org/apache/spark/examples/SparkLR.scala   |   2 +-
 .../spark/examples/bagel/PageRankUtils.scala  | 112 ---------
 .../examples/bagel/WikipediaPageRank.scala    | 106 --------
 .../bagel/WikipediaPageRankStandalone.scala   | 232 ------------------
 .../spark/examples/ml/OneVsRestExample.scala  |   6 +-
 .../examples/mllib/DenseGaussianMixture.scala |   2 +-
 .../pythonconverters/AvroConverters.scala     |  29 ++-
 .../examples/streaming/ActorWordCount.scala   |   6 +-
 .../streaming/DirectKafkaWordCount.scala      |   2 +-
 .../examples/streaming/KafkaWordCount.scala   |   4 +-
 .../examples/streaming/MQTTWordCount.scala    |   2 +-
 .../clickstream/PageViewGenerator.scala       |   3 +-
 15 files changed, 31 insertions(+), 485 deletions(-)
 delete mode 100644 examples/src/main/scala/org/apache/spark/examples/bagel/PageRankUtils.scala
 delete mode 100644 examples/src/main/scala/org/apache/spark/examples/bagel/WikipediaPageRank.scala
 delete mode 100644 examples/src/main/scala/org/apache/spark/examples/bagel/WikipediaPageRankStandalone.scala

diff --git a/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala b/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala
index 11d5c92c5952d..023bb3ee2d108 100644
--- a/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala
@@ -104,8 +104,8 @@ object CassandraCQLTest {
 
     val casRdd = sc.newAPIHadoopRDD(job.getConfiguration(),
       classOf[CqlPagingInputFormat],
-      classOf[java.util.Map[String,ByteBuffer]],
-      classOf[java.util.Map[String,ByteBuffer]])
+      classOf[java.util.Map[String, ByteBuffer]],
+      classOf[java.util.Map[String, ByteBuffer]])
 
     println("Count: " + casRdd.count)
     val productSaleRDD = casRdd.map {
@@ -118,7 +118,7 @@ object CassandraCQLTest {
       case (productId, saleCount) => println(productId + ":" + saleCount)
     }
 
-    val casoutputCF  = aggregatedRDD.map {
+    val casoutputCF = aggregatedRDD.map {
       case (productId, saleCount) => {
         val outColFamKey = Map("prod_id" -> ByteBufferUtil.bytes(productId))
         val outKey: java.util.Map[String, ByteBuffer] = outColFamKey
diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala b/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala
index a55e0dc8d36c2..c3fc74a116c0a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala
@@ -39,7 +39,7 @@ object LocalLR {
 
   def generateData: Array[DataPoint] = {
     def generatePoint(i: Int): DataPoint = {
-      val y = if(i % 2 == 0) -1 else 1
+      val y = if (i % 2 == 0) -1 else 1
       val x = DenseVector.fill(D){rand.nextGaussian + y * R}
       DataPoint(x, y)
     }
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala b/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala
index 6c0ac8013ce34..30c4261551837 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala
@@ -117,7 +117,7 @@ object SparkALS {
     var us = Array.fill(U)(randomVector(F))
 
     // Iteratively update movies then users
-    val Rc  = sc.broadcast(R)
+    val Rc = sc.broadcast(R)
     var msb = sc.broadcast(ms)
     var usb = sc.broadcast(us)
     for (iter <- 1 to ITERATIONS) {
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala b/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala
index 8c01a60844620..1e6b4fb0c7514 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala
@@ -44,7 +44,7 @@ object SparkLR {
 
   def generateData: Array[DataPoint] = {
     def generatePoint(i: Int): DataPoint = {
-      val y = if(i % 2 == 0) -1 else 1
+      val y = if (i % 2 == 0) -1 else 1
       val x = DenseVector.fill(D){rand.nextGaussian + y * R}
       DataPoint(x, y)
     }
diff --git a/examples/src/main/scala/org/apache/spark/examples/bagel/PageRankUtils.scala b/examples/src/main/scala/org/apache/spark/examples/bagel/PageRankUtils.scala
deleted file mode 100644
index ab6e63deb3c95..0000000000000
--- a/examples/src/main/scala/org/apache/spark/examples/bagel/PageRankUtils.scala
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.bagel
-
-import org.apache.spark._
-import org.apache.spark.bagel._
-
-class PageRankUtils extends Serializable {
-  def computeWithCombiner(numVertices: Long, epsilon: Double)(
-    self: PRVertex, messageSum: Option[Double], superstep: Int
-  ): (PRVertex, Array[PRMessage]) = {
-    val newValue = messageSum match {
-      case Some(msgSum) if msgSum != 0 =>
-        0.15 / numVertices + 0.85 * msgSum
-      case _ => self.value
-    }
-
-    val terminate = superstep >= 10
-
-    val outbox: Array[PRMessage] =
-      if (!terminate) {
-        self.outEdges.map(targetId => new PRMessage(targetId, newValue / self.outEdges.size))
-      } else {
-        Array[PRMessage]()
-      }
-
-    (new PRVertex(newValue, self.outEdges, !terminate), outbox)
-  }
-
-  def computeNoCombiner(numVertices: Long, epsilon: Double)
-    (self: PRVertex, messages: Option[Array[PRMessage]], superstep: Int)
-  : (PRVertex, Array[PRMessage]) =
-    computeWithCombiner(numVertices, epsilon)(self, messages match {
-      case Some(msgs) => Some(msgs.map(_.value).sum)
-      case None => None
-    }, superstep)
-}
-
-class PRCombiner extends Combiner[PRMessage, Double] with Serializable {
-  def createCombiner(msg: PRMessage): Double =
-    msg.value
-  def mergeMsg(combiner: Double, msg: PRMessage): Double =
-    combiner + msg.value
-  def mergeCombiners(a: Double, b: Double): Double =
-    a + b
-}
-
-class PRVertex() extends Vertex with Serializable {
-  var value: Double = _
-  var outEdges: Array[String] = _
-  var active: Boolean = _
-
-  def this(value: Double, outEdges: Array[String], active: Boolean = true) {
-    this()
-    this.value = value
-    this.outEdges = outEdges
-    this.active = active
-  }
-
-  override def toString(): String = {
-    "PRVertex(value=%f, outEdges.length=%d, active=%s)"
-      .format(value, outEdges.length, active.toString)
-  }
-}
-
-class PRMessage() extends Message[String] with Serializable {
-  var targetId: String = _
-  var value: Double = _
-
-  def this(targetId: String, value: Double) {
-    this()
-    this.targetId = targetId
-    this.value = value
-  }
-}
-
-class CustomPartitioner(partitions: Int) extends Partitioner {
-  def numPartitions: Int = partitions
-
-  def getPartition(key: Any): Int = {
-    val hash = key match {
-      case k: Long => (k & 0x00000000FFFFFFFFL).toInt
-      case _ => key.hashCode
-    }
-
-    val mod = key.hashCode % partitions
-    if (mod < 0) mod + partitions else mod
-  }
-
-  override def equals(other: Any): Boolean = other match {
-    case c: CustomPartitioner =>
-      c.numPartitions == numPartitions
-    case _ => false
-  }
-
-  override def hashCode: Int = numPartitions
-}
diff --git a/examples/src/main/scala/org/apache/spark/examples/bagel/WikipediaPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/bagel/WikipediaPageRank.scala
deleted file mode 100644
index 859abedf2a55e..0000000000000
--- a/examples/src/main/scala/org/apache/spark/examples/bagel/WikipediaPageRank.scala
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.bagel
-
-import org.apache.spark._
-import org.apache.spark.SparkContext._
-
-import org.apache.spark.bagel._
-
-import scala.xml.{XML,NodeSeq}
-
-/**
- * Run PageRank on XML Wikipedia dumps from http://wiki.freebase.com/wiki/WEX. Uses the "articles"
- * files from there, which contains one line per wiki article in a tab-separated format
- * (http://wiki.freebase.com/wiki/WEX/Documentation#articles).
- */
-object WikipediaPageRank {
-  def main(args: Array[String]) {
-    if (args.length < 4) {
-      System.err.println(
-        "Usage: WikipediaPageRank <inputFile> <threshold> <numPartitions> <usePartitioner>")
-      System.exit(-1)
-    }
-    val sparkConf = new SparkConf()
-    sparkConf.setAppName("WikipediaPageRank")
-    sparkConf.registerKryoClasses(Array(classOf[PRVertex], classOf[PRMessage]))
-
-    val inputFile = args(0)
-    val threshold = args(1).toDouble
-    val numPartitions = args(2).toInt
-    val usePartitioner = args(3).toBoolean
-
-    sparkConf.setAppName("WikipediaPageRank")
-    val sc = new SparkContext(sparkConf)
-
-    // Parse the Wikipedia page data into a graph
-    val input = sc.textFile(inputFile)
-
-    println("Counting vertices...")
-    val numVertices = input.count()
-    println("Done counting vertices.")
-
-    println("Parsing input file...")
-    var vertices = input.map(line => {
-      val fields = line.split("\t")
-      val (title, body) = (fields(1), fields(3).replace("\\n", "\n"))
-      val links =
-        if (body == "\\N") {
-          NodeSeq.Empty
-        } else {
-          try {
-            XML.loadString(body) \\ "link" \ "target"
-          } catch {
-            case e: org.xml.sax.SAXParseException =>
-              System.err.println("Article \"" + title + "\" has malformed XML in body:\n" + body)
-            NodeSeq.Empty
-          }
-        }
-      val outEdges = links.map(link => new String(link.text)).toArray
-      val id = new String(title)
-      (id, new PRVertex(1.0 / numVertices, outEdges))
-    })
-    if (usePartitioner) {
-      vertices = vertices.partitionBy(new HashPartitioner(sc.defaultParallelism)).cache()
-    } else {
-      vertices = vertices.cache()
-    }
-    println("Done parsing input file.")
-
-    // Do the computation
-    val epsilon = 0.01 / numVertices
-    val messages = sc.parallelize(Array[(String, PRMessage)]())
-    val utils = new PageRankUtils
-    val result =
-        Bagel.run(
-          sc, vertices, messages, combiner = new PRCombiner(),
-          numPartitions = numPartitions)(
-          utils.computeWithCombiner(numVertices, epsilon))
-
-    // Print the result
-    System.err.println("Articles with PageRank >= " + threshold + ":")
-    val top =
-      (result
-       .filter { case (id, vertex) => vertex.value >= threshold }
-       .map { case (id, vertex) => "%s\t%s\n".format(id, vertex.value) }
-       .collect().mkString)
-    println(top)
-
-    sc.stop()
-  }
-}
diff --git a/examples/src/main/scala/org/apache/spark/examples/bagel/WikipediaPageRankStandalone.scala b/examples/src/main/scala/org/apache/spark/examples/bagel/WikipediaPageRankStandalone.scala
deleted file mode 100644
index 576a3e371b993..0000000000000
--- a/examples/src/main/scala/org/apache/spark/examples/bagel/WikipediaPageRankStandalone.scala
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.bagel
-
-import java.io.{InputStream, OutputStream, DataInputStream, DataOutputStream}
-import java.nio.ByteBuffer
-
-import scala.collection.mutable.ArrayBuffer
-import scala.xml.{XML, NodeSeq}
-
-import org.apache.spark._
-import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance}
-import org.apache.spark.SparkContext._
-import org.apache.spark.rdd.RDD
-
-import scala.reflect.ClassTag
-
-object WikipediaPageRankStandalone {
-  def main(args: Array[String]) {
-    if (args.length < 4) {
-      System.err.println("Usage: WikipediaPageRankStandalone <inputFile> <threshold> " +
-        "<numIterations> <usePartitioner>")
-      System.exit(-1)
-    }
-    val sparkConf = new SparkConf()
-    sparkConf.set("spark.serializer", "spark.bagel.examples.WPRSerializer")
-
-    val inputFile = args(0)
-    val threshold = args(1).toDouble
-    val numIterations = args(2).toInt
-    val usePartitioner = args(3).toBoolean
-
-    sparkConf.setAppName("WikipediaPageRankStandalone")
-
-    val sc = new SparkContext(sparkConf)
-
-    val input = sc.textFile(inputFile)
-    val partitioner = new HashPartitioner(sc.defaultParallelism)
-    val links =
-      if (usePartitioner) {
-        input.map(parseArticle _).partitionBy(partitioner).cache()
-      } else {
-        input.map(parseArticle _).cache()
-      }
-    val n = links.count()
-    val defaultRank = 1.0 / n
-    val a = 0.15
-
-    // Do the computation
-    val startTime = System.currentTimeMillis
-    val ranks =
-      pageRank(links, numIterations, defaultRank, a, n, partitioner, usePartitioner,
-        sc.defaultParallelism)
-
-    // Print the result
-    System.err.println("Articles with PageRank >= " + threshold + ":")
-    val top =
-      (ranks
-       .filter { case (id, rank) => rank >= threshold }
-       .map { case (id, rank) => "%s\t%s\n".format(id, rank) }
-       .collect().mkString)
-    println(top)
-
-    val time = (System.currentTimeMillis - startTime) / 1000.0
-    println("Completed %d iterations in %f seconds: %f seconds per iteration"
-      .format(numIterations, time, time / numIterations))
-    sc.stop()
-  }
-
-  def parseArticle(line: String): (String, Array[String]) = {
-    val fields = line.split("\t")
-    val (title, body) = (fields(1), fields(3).replace("\\n", "\n"))
-    val id = new String(title)
-    val links =
-      if (body == "\\N") {
-        NodeSeq.Empty
-      } else {
-        try {
-          XML.loadString(body) \\ "link" \ "target"
-        } catch {
-          case e: org.xml.sax.SAXParseException =>
-            System.err.println("Article \"" + title + "\" has malformed XML in body:\n" + body)
-          NodeSeq.Empty
-        }
-      }
-    val outEdges = links.map(link => new String(link.text)).toArray
-    (id, outEdges)
-  }
-
-  def pageRank(
-    links: RDD[(String, Array[String])],
-    numIterations: Int,
-    defaultRank: Double,
-    a: Double,
-    n: Long,
-    partitioner: Partitioner,
-    usePartitioner: Boolean,
-    numPartitions: Int
-  ): RDD[(String, Double)] = {
-    var ranks = links.mapValues { edges => defaultRank }
-    for (i <- 1 to numIterations) {
-      val contribs = links.groupWith(ranks).flatMap {
-        case (id, (linksWrapperIterable, rankWrapperIterable)) =>
-          val linksWrapper = linksWrapperIterable.iterator
-          val rankWrapper = rankWrapperIterable.iterator
-          if (linksWrapper.hasNext) {
-            val linksWrapperHead = linksWrapper.next
-            if (rankWrapper.hasNext) {
-              val rankWrapperHead = rankWrapper.next
-              linksWrapperHead.map(dest => (dest, rankWrapperHead / linksWrapperHead.size))
-            } else {
-              linksWrapperHead.map(dest => (dest, defaultRank / linksWrapperHead.size))
-            }
-          } else {
-            Array[(String, Double)]()
-          }
-      }
-      ranks = (contribs.combineByKey((x: Double) => x,
-                                     (x: Double, y: Double) => x + y,
-                                     (x: Double, y: Double) => x + y,
-                                     partitioner)
-               .mapValues(sum => a/n + (1-a)*sum))
-    }
-    ranks
-  }
-}
-
-class WPRSerializer extends org.apache.spark.serializer.Serializer {
-  def newInstance(): SerializerInstance = new WPRSerializerInstance()
-}
-
-class WPRSerializerInstance extends SerializerInstance {
-  def serialize[T: ClassTag](t: T): ByteBuffer = {
-    throw new UnsupportedOperationException()
-  }
-
-  def deserialize[T: ClassTag](bytes: ByteBuffer): T = {
-    throw new UnsupportedOperationException()
-  }
-
-  def deserialize[T: ClassTag](bytes: ByteBuffer, loader: ClassLoader): T = {
-    throw new UnsupportedOperationException()
-  }
-
-  def serializeStream(s: OutputStream): SerializationStream = {
-    new WPRSerializationStream(s)
-  }
-
-  def deserializeStream(s: InputStream): DeserializationStream = {
-    new WPRDeserializationStream(s)
-  }
-}
-
-class WPRSerializationStream(os: OutputStream) extends SerializationStream {
-  val dos = new DataOutputStream(os)
-
-  def writeObject[T: ClassTag](t: T): SerializationStream = t match {
-    case (id: String, wrapper: ArrayBuffer[_]) => wrapper(0) match {
-      case links: Array[String] => {
-        dos.writeInt(0) // links
-        dos.writeUTF(id)
-        dos.writeInt(links.length)
-        for (link <- links) {
-          dos.writeUTF(link)
-        }
-        this
-      }
-      case rank: Double => {
-        dos.writeInt(1) // rank
-        dos.writeUTF(id)
-        dos.writeDouble(rank)
-        this
-      }
-    }
-    case (id: String, rank: Double) => {
-      dos.writeInt(2) // rank without wrapper
-      dos.writeUTF(id)
-      dos.writeDouble(rank)
-      this
-    }
-  }
-
-  def flush() { dos.flush() }
-  def close() { dos.close() }
-}
-
-class WPRDeserializationStream(is: InputStream) extends DeserializationStream {
-  val dis = new DataInputStream(is)
-
-  def readObject[T: ClassTag](): T = {
-    val typeId = dis.readInt()
-    typeId match {
-      case 0 => {
-        val id = dis.readUTF()
-        val numLinks = dis.readInt()
-        val links = new Array[String](numLinks)
-        for (i <- 0 until numLinks) {
-          val link = dis.readUTF()
-          links(i) = link
-        }
-        (id, ArrayBuffer(links)).asInstanceOf[T]
-      }
-      case 1 => {
-        val id = dis.readUTF()
-        val rank = dis.readDouble()
-        (id, ArrayBuffer(rank)).asInstanceOf[T]
-      }
-      case 2 => {
-        val id = dis.readUTF()
-        val rank = dis.readDouble()
-        (id, rank).asInstanceOf[T]
-     }
-    }
-  }
-
-  def close() { dis.close() }
-}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
index b99d0a1246011..6927eb8f275cf 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -73,7 +73,7 @@ object OneVsRestExample {
         .action((x, c) => c.copy(fracTest = x))
       opt[String]("testInput")
         .text("input path to test dataset.  If given, option fracTest is ignored")
-        .action((x,c) => c.copy(testInput = Some(x)))
+        .action((x, c) => c.copy(testInput = Some(x)))
       opt[Int]("maxIter")
         .text(s"maximum number of iterations for Logistic Regression." +
           s" default: ${defaultParams.maxIter}")
@@ -88,10 +88,10 @@ object OneVsRestExample {
         .action((x, c) => c.copy(fitIntercept = x))
       opt[Double]("regParam")
         .text(s"the regularization parameter for Logistic Regression.")
-        .action((x,c) => c.copy(regParam = Some(x)))
+        .action((x, c) => c.copy(regParam = Some(x)))
       opt[Double]("elasticNetParam")
         .text(s"the ElasticNet mixing parameter for Logistic Regression.")
-        .action((x,c) => c.copy(elasticNetParam = Some(x)))
+        .action((x, c) => c.copy(elasticNetParam = Some(x)))
       checkConfig { params =>
         if (params.fracTest < 0 || params.fracTest >= 1) {
           failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1).")
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
index df76b45e50810..9a1aab036aa0f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
@@ -40,7 +40,7 @@ object DenseGaussianMixture {
 
   private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) {
     val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example")
-    val ctx  = new SparkContext(conf)
+    val ctx = new SparkContext(conf)
     
     val data = ctx.textFile(inputFile).map { line =>
       Vectors.dense(line.trim.split(' ').map(_.toDouble))
diff --git a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/AvroConverters.scala b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/AvroConverters.scala
index a11890d6f2b1c..3ebb112fc069e 100644
--- a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/AvroConverters.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/AvroConverters.scala
@@ -36,22 +36,21 @@ object AvroConversionUtil extends Serializable {
       return null
     }
     schema.getType match {
-      case UNION   => unpackUnion(obj, schema)
-      case ARRAY   => unpackArray(obj, schema)
-      case FIXED   => unpackFixed(obj, schema)
-      case MAP     => unpackMap(obj, schema)
-      case BYTES   => unpackBytes(obj)
-      case RECORD  => unpackRecord(obj)
-      case STRING  => obj.toString
-      case ENUM    => obj.toString
-      case NULL    => obj
+      case UNION => unpackUnion(obj, schema)
+      case ARRAY => unpackArray(obj, schema)
+      case FIXED => unpackFixed(obj, schema)
+      case MAP => unpackMap(obj, schema)
+      case BYTES => unpackBytes(obj)
+      case RECORD => unpackRecord(obj)
+      case STRING => obj.toString
+      case ENUM => obj.toString
+      case NULL => obj
       case BOOLEAN => obj
-      case DOUBLE  => obj
-      case FLOAT   => obj
-      case INT     => obj
-      case LONG    => obj
-      case other   => throw new SparkException(
-        s"Unknown Avro schema type ${other.getName}")
+      case DOUBLE => obj
+      case FLOAT => obj
+      case INT => obj
+      case LONG => obj
+      case other => throw new SparkException(s"Unknown Avro schema type ${other.getName}")
     }
   }
 
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/ActorWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/ActorWordCount.scala
index 92867b44be138..016de4c63d1d2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/ActorWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/ActorWordCount.scala
@@ -104,10 +104,8 @@ extends Actor with ActorHelper {
 object FeederActor {
 
   def main(args: Array[String]) {
-    if(args.length < 2){
-      System.err.println(
-        "Usage: FeederActor <hostname> <port>\n"
-      )
+    if (args.length < 2){
+      System.err.println("Usage: FeederActor <hostname> <port>\n")
       System.exit(1)
     }
     val Seq(host, port) = args.toSeq
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
index 11a8cf09533ce..fbe394de4a179 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
@@ -51,7 +51,7 @@ object DirectKafkaWordCount {
 
     // Create context with 2 second batch interval
     val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
-    val ssc =  new StreamingContext(sparkConf, Seconds(2))
+    val ssc = new StreamingContext(sparkConf, Seconds(2))
 
     // Create direct kafka stream with brokers and topics
     val topicsSet = topics.split(",").toSet
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
index 9ae1b045c2c76..60416ee343544 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
@@ -49,10 +49,10 @@ object KafkaWordCount {
 
     val Array(zkQuorum, group, topics, numThreads) = args
     val sparkConf = new SparkConf().setAppName("KafkaWordCount")
-    val ssc =  new StreamingContext(sparkConf, Seconds(2))
+    val ssc = new StreamingContext(sparkConf, Seconds(2))
     ssc.checkpoint("checkpoint")
 
-    val topicMap = topics.split(",").map((_,numThreads.toInt)).toMap
+    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
     val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)
     val words = lines.flatMap(_.split(" "))
     val wordCounts = words.map(x => (x, 1L))
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
index 85b9a54b40baf..b336751d81616 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
@@ -49,7 +49,7 @@ object MQTTPublisher {
 
       client.connect()
 
-      val msgtopic  = client.getTopic(topic)
+      val msgtopic = client.getTopic(topic)
       val msgContent = "hello mqtt demo for spark streaming"
       val message = new MqttMessage(msgContent.getBytes("utf-8"))
 
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala
index 54d996b8ac990..889f052c70263 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala
@@ -57,8 +57,7 @@ object PageViewGenerator {
                        404 -> .05)
   val userZipCode = Map(94709 -> .5,
                         94117 -> .5)
-  val userID = Map((1 to 100).map(_ -> .01):_*)
-
+  val userID = Map((1 to 100).map(_ -> .01) : _*)
 
   def pickFromDistribution[T](inputMap : Map[T, Double]) : T = {
     val rand = new Random().nextDouble()

From 8da560d7de9b3c9a3e3ff197eeb10a3d7023f10d Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 20:11:57 -0700
Subject: [PATCH 227/525] [SPARK-7927] whitespace fixes for Catalyst module.

So we can enable a whitespace enforcement rule in the style checker to save code review time.

Author: Reynold Xin <rxin@databricks.com>

Closes #6476 from rxin/whitespace-catalyst and squashes the following commits:

650409d [Reynold Xin] Fixed tests.
51a9e5d [Reynold Xin] [SPARK-7927] whitespace fixes for Catalyst module.
---
 .../sql/catalyst/AbstractSparkSQLParser.scala |  2 +-
 .../apache/spark/sql/catalyst/SqlParser.scala |  8 +-
 .../sql/catalyst/analysis/Analyzer.scala      |  9 +-
 .../spark/sql/catalyst/analysis/Catalog.scala |  2 +-
 .../catalyst/analysis/HiveTypeCoercion.scala  |  5 +-
 .../spark/sql/catalyst/dsl/package.scala      |  2 +-
 .../spark/sql/catalyst/errors/package.scala   |  7 --
 .../spark/sql/catalyst/expressions/Cast.scala | 84 +++++++++----------
 .../catalyst/expressions/ExtractValue.scala   |  2 +-
 .../sql/catalyst/expressions/aggregates.scala |  4 +-
 .../expressions/codegen/CodeGenerator.scala   |  4 +-
 .../codegen/GenerateProjection.scala          |  2 +-
 .../sql/catalyst/expressions/generators.scala |  4 +-
 .../expressions/stringOperations.scala        |  4 +-
 .../expressions/windowExpressions.scala       |  2 +-
 .../spark/sql/catalyst/plans/QueryPlan.scala  |  4 +-
 .../plans/logical/basicOperators.scala        |  2 +-
 .../spark/sql/catalyst/trees/TreeNode.scala   |  4 +-
 .../spark/sql/catalyst/util/package.scala     |  2 +-
 .../org/apache/spark/sql/types/DataType.scala |  2 +-
 .../sql/catalyst/ScalaReflectionSuite.scala   |  2 +-
 .../spark/sql/catalyst/SqlParserSuite.scala   |  4 +-
 .../sql/catalyst/analysis/AnalysisSuite.scala |  2 +-
 .../ExpressionEvaluationSuite.scala           | 56 ++++++-------
 .../GeneratedEvaluationSuite.scala            |  6 +-
 .../GeneratedMutableEvaluationSuite.scala     |  2 +-
 .../BooleanSimplificationSuite.scala          |  4 +-
 .../optimizer/FilterPushdownSuite.scala       |  2 +-
 .../catalyst/optimizer/OptimizeInSuite.scala  |  6 +-
 .../optimizer/UnionPushdownSuite.scala        |  4 +-
 .../sql/catalyst/trees/TreeNodeSuite.scala    |  4 +-
 .../spark/sql/types/DataTypeSuite.scala       |  4 +-
 32 files changed, 121 insertions(+), 130 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala
index 2eb3e167baad5..ef7b3ad9432cf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala
@@ -103,7 +103,7 @@ class SqlLexical extends StdLexical {
     ( identChar ~ (identChar | digit).* ^^
       { case first ~ rest => processIdent((first :: rest).mkString) }
     | rep1(digit) ~ ('.' ~> digit.*).? ^^ {
-        case i ~ None    => NumericLit(i.mkString)
+        case i ~ None => NumericLit(i.mkString)
         case i ~ Some(d) => FloatLit(i.mkString + "." + d.mkString)
       }
     | '\'' ~> chrExcept('\'', '\n', EofCh).* <~ '\'' ^^
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index fc36b9f1f20d2..e85312aee7d16 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -140,7 +140,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
       (HAVING ~> expression).? ~
       sortType.? ~
       (LIMIT  ~> expression).? ^^ {
-        case d ~ p ~ r ~ f ~ g ~ h ~ o ~ l  =>
+        case d ~ p ~ r ~ f ~ g ~ h ~ o ~ l =>
           val base = r.getOrElse(OneRowRelation)
           val withFilter = f.map(Filter(_, base)).getOrElse(base)
           val withProjection = g
@@ -212,7 +212,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
 
   protected lazy val ordering: Parser[Seq[SortOrder]] =
     ( rep1sep(expression ~ direction.? , ",") ^^ {
-        case exps  => exps.map(pair => SortOrder(pair._1, pair._2.getOrElse(Ascending)))
+        case exps => exps.map(pair => SortOrder(pair._1, pair._2.getOrElse(Ascending)))
       }
     )
 
@@ -242,7 +242,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     | termExpression ~ NOT.? ~ (BETWEEN ~> termExpression) ~ (AND ~> termExpression) ^^ {
         case e ~ not ~ el ~ eu =>
           val betweenExpr: Expression = And(GreaterThanOrEqual(e, el), LessThanOrEqual(e, eu))
-          not.fold(betweenExpr)(f=> Not(betweenExpr))
+          not.fold(betweenExpr)(f => Not(betweenExpr))
       }
     | termExpression ~ (RLIKE  ~> termExpression) ^^ { case e1 ~ e2 => RLike(e1, e2) }
     | termExpression ~ (REGEXP ~> termExpression) ^^ { case e1 ~ e2 => RLike(e1, e2) }
@@ -365,7 +365,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
 
   protected lazy val baseExpression: Parser[Expression] =
     ( "*" ^^^ UnresolvedStar(None)
-    | ident <~ "." ~ "*" ^^ { case tableName  => UnresolvedStar(Option(tableName)) }
+    | ident <~ "." ~ "*" ^^ { case tableName => UnresolvedStar(Option(tableName)) }
     | primary
     )
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index c239e83271615..df37889eedcf0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -494,7 +494,7 @@ class Analyzer(
     def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
       case filter @ Filter(havingCondition, aggregate @ Aggregate(_, originalAggExprs, _))
           if aggregate.resolved && containsAggregate(havingCondition) => {
-        val evaluatedCondition = Alias(havingCondition,  "havingCondition")()
+        val evaluatedCondition = Alias(havingCondition, "havingCondition")()
         val aggExprsWithHaving = evaluatedCondition +: originalAggExprs
 
         Project(aggregate.output,
@@ -515,16 +515,15 @@ class Analyzer(
    *  - concrete attribute references for their output.
    *  - to be relocated from a SELECT clause (i.e. from  a [[Project]]) into a [[Generate]]).
    *
-   * Names for the output [[Attributes]] are extracted from [[Alias]] or [[MultiAlias]] expressions
+   * Names for the output [[Attribute]]s are extracted from [[Alias]] or [[MultiAlias]] expressions
    * that wrap the [[Generator]]. If more than one [[Generator]] is found in a Project, an
    * [[AnalysisException]] is throw.
    */
   object ResolveGenerate extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
       case p: Generate if !p.child.resolved || !p.generator.resolved => p
-      case g: Generate if g.resolved == false =>
-          g.copy(
-            generatorOutput = makeGeneratorOutput(g.generator, g.generatorOutput.map(_.name)))
+      case g: Generate if !g.resolved =>
+        g.copy(generatorOutput = makeGeneratorOutput(g.generator, g.generatorOutput.map(_.name)))
 
       case p @ Project(projectList, child) =>
         // Holds the resolved generator, if one exists in the project list.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
index 208021c421326..3e240fd55e250 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
@@ -140,7 +140,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog {
 trait OverrideCatalog extends Catalog {
 
   // TODO: This doesn't work when the database changes...
-  val overrides = new mutable.HashMap[(Option[String],String), LogicalPlan]()
+  val overrides = new mutable.HashMap[(Option[String], String), LogicalPlan]()
 
   abstract override def tableExists(tableIdentifier: Seq[String]): Boolean = {
     val tableIdent = processTableIdentifier(tableIdentifier)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index b45b17d856fac..44664f898f762 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -561,8 +561,7 @@ trait HiveTypeCoercion {
 
       case a @ CreateArray(children) if !a.resolved =>
         val commonType = a.childTypes.reduce(
-          (a,b) =>
-            findTightestCommonType(a,b).getOrElse(StringType))
+          (a, b) => findTightestCommonType(a, b).getOrElse(StringType))
         CreateArray(
           children.map(c => if (c.dataType == commonType) c else Cast(c, commonType)))
 
@@ -634,7 +633,7 @@ trait HiveTypeCoercion {
     import HiveTypeCoercion._
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
-      case cw: CaseWhenLike if !cw.resolved && cw.childrenResolved && !cw.valueTypesEqual  =>
+      case cw: CaseWhenLike if !cw.resolved && cw.childrenResolved && !cw.valueTypesEqual =>
         logDebug(s"Input values for null casting ${cw.valueTypes.mkString(",")}")
         val commonType = cw.valueTypes.reduce { (v1, v2) =>
           findTightestCommonType(v1, v2).getOrElse(sys.error(
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
index 60ab9fba4885a..51821757967d2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
@@ -140,7 +140,7 @@ package object dsl {
       // Note that if we make ExpressionConversions an object rather than a trait, we can
       // then make this a value class to avoid the small penalty of runtime instantiation.
       def $(args: Any*): analysis.UnresolvedAttribute = {
-        analysis.UnresolvedAttribute(sc.s(args :_*))
+        analysis.UnresolvedAttribute(sc.s(args : _*))
       }
     }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/errors/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/errors/package.scala
index 0fd4f9b374ee0..d2a90a50c89f4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/errors/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/errors/package.scala
@@ -49,11 +49,4 @@ package object errors {
       case e: Exception => throw new TreeNodeException(tree, msg, e)
     }
   }
-
-  /**
-   * Executes `f` which is expected to throw a
-   * [[catalyst.errors.TreeNodeException TreeNodeException]]. The first tree encountered in
-   * the stack of exceptions of type `TreeType` is returned.
-   */
-  def getTree[TreeType <: TreeNode[_]](f: => Unit): TreeType = ??? // TODO: Implement
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index df3cdf2cdf992..21adac144112e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -35,48 +35,48 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
   private[this] def forceNullable(from: DataType, to: DataType) = (from, to) match {
     case (StringType, _: NumericType) => true
-    case (StringType, TimestampType)  => true
-    case (DoubleType, TimestampType)  => true
-    case (FloatType, TimestampType)   => true
-    case (StringType, DateType)       => true
-    case (_: NumericType, DateType)   => true
-    case (BooleanType, DateType)      => true
-    case (DateType, _: NumericType)   => true
-    case (DateType, BooleanType)      => true
+    case (StringType, TimestampType) => true
+    case (DoubleType, TimestampType) => true
+    case (FloatType, TimestampType) => true
+    case (StringType, DateType) => true
+    case (_: NumericType, DateType) => true
+    case (BooleanType, DateType) => true
+    case (DateType, _: NumericType) => true
+    case (DateType, BooleanType) => true
     case (DoubleType, _: DecimalType) => true
-    case (FloatType, _: DecimalType)  => true
+    case (FloatType, _: DecimalType) => true
     case (_, DecimalType.Fixed(_, _)) => true // TODO: not all upcasts here can really give null
-    case _                            => false
+    case _ => false
   }
 
   private[this] def resolvableNullability(from: Boolean, to: Boolean) = !from || to
 
   private[this] def resolve(from: DataType, to: DataType): Boolean = {
     (from, to) match {
-      case (from, to) if from == to         => true
+      case (from, to) if from == to => true
 
-      case (NullType, _)                    => true
+      case (NullType, _) => true
 
-      case (_, StringType)                  => true
+      case (_, StringType) => true
 
-      case (StringType, BinaryType)         => true
+      case (StringType, BinaryType) => true
 
-      case (StringType, BooleanType)        => true
-      case (DateType, BooleanType)          => true
-      case (TimestampType, BooleanType)     => true
-      case (_: NumericType, BooleanType)    => true
+      case (StringType, BooleanType) => true
+      case (DateType, BooleanType) => true
+      case (TimestampType, BooleanType) => true
+      case (_: NumericType, BooleanType) => true
 
-      case (StringType, TimestampType)      => true
-      case (BooleanType, TimestampType)     => true
-      case (DateType, TimestampType)        => true
-      case (_: NumericType, TimestampType)  => true
+      case (StringType, TimestampType) => true
+      case (BooleanType, TimestampType) => true
+      case (DateType, TimestampType) => true
+      case (_: NumericType, TimestampType) => true
 
-      case (_, DateType)                    => true
+      case (_, DateType) => true
 
-      case (StringType, _: NumericType)     => true
-      case (BooleanType, _: NumericType)    => true
-      case (DateType, _: NumericType)       => true
-      case (TimestampType, _: NumericType)  => true
+      case (StringType, _: NumericType) => true
+      case (BooleanType, _: NumericType) => true
+      case (DateType, _: NumericType) => true
+      case (TimestampType, _: NumericType) => true
       case (_: NumericType, _: NumericType) => true
 
       case (ArrayType(from, fn), ArrayType(to, tn)) =>
@@ -410,21 +410,21 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
   private[this] def cast(from: DataType, to: DataType): Any => Any = to match {
     case dt if dt == child.dataType => identity[Any]
-    case StringType                 => castToString(from)
-    case BinaryType                 => castToBinary(from)
-    case DateType                   => castToDate(from)
-    case decimal: DecimalType       => castToDecimal(from, decimal)
-    case TimestampType              => castToTimestamp(from)
-    case BooleanType                => castToBoolean(from)
-    case ByteType                   => castToByte(from)
-    case ShortType                  => castToShort(from)
-    case IntegerType                => castToInt(from)
-    case FloatType                  => castToFloat(from)
-    case LongType                   => castToLong(from)
-    case DoubleType                 => castToDouble(from)
-    case array: ArrayType           => castArray(from.asInstanceOf[ArrayType], array)
-    case map: MapType               => castMap(from.asInstanceOf[MapType], map)
-    case struct: StructType         => castStruct(from.asInstanceOf[StructType], struct)
+    case StringType => castToString(from)
+    case BinaryType => castToBinary(from)
+    case DateType => castToDate(from)
+    case decimal: DecimalType => castToDecimal(from, decimal)
+    case TimestampType => castToTimestamp(from)
+    case BooleanType => castToBoolean(from)
+    case ByteType => castToByte(from)
+    case ShortType => castToShort(from)
+    case IntegerType => castToInt(from)
+    case FloatType => castToFloat(from)
+    case LongType => castToLong(from)
+    case DoubleType => castToDouble(from)
+    case array: ArrayType => castArray(from.asInstanceOf[ArrayType], array)
+    case map: MapType => castMap(from.asInstanceOf[MapType], map)
+    case struct: StructType => castStruct(from.asInstanceOf[StructType], struct)
   }
 
   private[this] lazy val cast: Any => Any = cast(child.dataType, dataType)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
index b5f4e16745c1b..a1e0819e8a433 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
@@ -47,7 +47,7 @@ object ExtractValue {
       case (ArrayType(StructType(fields), containsNull), Literal(fieldName, StringType)) =>
         val ordinal = findField(fields, fieldName.toString, resolver)
         GetArrayStructFields(child, fields(ordinal), ordinal, containsNull)
-      case (_: ArrayType, _) if extraction.dataType.isInstanceOf[IntegralType]  =>
+      case (_: ArrayType, _) if extraction.dataType.isInstanceOf[IntegralType] =>
         GetArrayItem(child, extraction)
       case (_: MapType, _) =>
         GetMapValue(child, extraction)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index 72eff5fe961f0..6c380d3084652 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -111,7 +111,7 @@ case class MinFunction(expr: Expression, base: AggregateExpression) extends Aggr
   override def update(input: Row): Unit = {
     if (currentMin.value == null) {
       currentMin.value = expr.eval(input)
-    } else if(cmp.eval(input) == true) {
+    } else if (cmp.eval(input) == true) {
       currentMin.value = expr.eval(input)
     }
   }
@@ -142,7 +142,7 @@ case class MaxFunction(expr: Expression, base: AggregateExpression) extends Aggr
   override def update(input: Row): Unit = {
     if (currentMax.value == null) {
       currentMax.value = expr.eval(input)
-    } else if(cmp.eval(input) == true) {
+    } else if (cmp.eval(input) == true) {
       currentMax.value = expr.eval(input)
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index ecb4c4b68f904..36964af68dd8d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -373,7 +373,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
         // Uh, bad function name...
         child.castOrNull(c => q"!$c", BooleanType)
 
-      case Add(e1, e2) =>      (e1, e2) evaluate { case (eval1, eval2) => q"$eval1 + $eval2" }
+      case Add(e1, e2) => (e1, e2) evaluate { case (eval1, eval2) => q"$eval1 + $eval2" }
       case Subtract(e1, e2) => (e1, e2) evaluate { case (eval1, eval2) => q"$eval1 - $eval2" }
       case Multiply(e1, e2) => (e1, e2) evaluate { case (eval1, eval2) => q"$eval1 * $eval2" }
       case Divide(e1, e2) =>
@@ -665,7 +665,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
   protected def defaultPrimitive(dt: DataType) = dt match {
     case BooleanType => ru.Literal(Constant(false))
     case FloatType => ru.Literal(Constant(-1.0.toFloat))
-    case StringType =>  q"""org.apache.spark.sql.types.UTF8String("<uninit>")"""
+    case StringType => q"""org.apache.spark.sql.types.UTF8String("<uninit>")"""
     case ShortType => ru.Literal(Constant(-1.toShort))
     case LongType => ru.Literal(Constant(-1L))
     case ByteType => ru.Literal(Constant(-1.toByte))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 584f938445c8c..31c63a79ebc8c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -161,7 +161,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
       }
     }
 
-    val hashValues = expressions.zipWithIndex.map { case (e,i) =>
+    val hashValues = expressions.zipWithIndex.map { case (e, i) =>
       val elementName = newTermName(s"c$i")
       val nonNull = e.dataType match {
         case BooleanType => q"if ($elementName) 0 else 1"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index cab40feb72d47..634138010fd21 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -103,8 +103,8 @@ case class Explode(child: Expression)
         val inputArray = child.eval(input).asInstanceOf[Seq[Any]]
         if (inputArray == null) Nil else inputArray.map(v => new GenericRow(Array(v)))
       case MapType(_, _, _) =>
-        val inputMap = child.eval(input).asInstanceOf[Map[Any,Any]]
-        if (inputMap == null) Nil else inputMap.map { case (k,v) => new GenericRow(Array(k,v)) }
+        val inputMap = child.eval(input).asInstanceOf[Map[Any, Any]]
+        if (inputMap == null) Nil else inputMap.map { case (k, v) => new GenericRow(Array(k, v)) }
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 5da93fe9c6cf9..83a44a12f0682 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -38,14 +38,14 @@ trait StringRegexExpression extends ExpectsInputTypes {
     case _ => null
   }
 
-  protected def compile(str: String): Pattern = if(str == null) {
+  protected def compile(str: String): Pattern = if (str == null) {
     null
   } else {
     // Let it raise exception if couldn't compile the regex string
     Pattern.compile(escape(str))
   }
 
-  protected def pattern(str: String) = if(cache == null) compile(str) else cache
+  protected def pattern(str: String) = if (cache == null) compile(str) else cache
 
   override def eval(input: Row): Any = {
     val l = left.eval(input)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index 2729b34a0833f..82c4d462cc322 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -66,7 +66,7 @@ case class WindowSpecDefinition(
     }
   }
 
-  override def children: Seq[Expression]  = partitionSpec ++ orderSpec
+  override def children: Seq[Expression] = partitionSpec ++ orderSpec
 
   override lazy val resolved: Boolean =
     childrenResolved && frameSpecification.isInstanceOf[SpecifiedWindowFrame]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index 7967189cacb24..eff5c61644944 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -84,7 +84,7 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy
     val newArgs = productIterator.map {
       case e: Expression => transformExpressionDown(e)
       case Some(e: Expression) => Some(transformExpressionDown(e))
-      case m: Map[_,_] => m
+      case m: Map[_, _] => m
       case d: DataType => d // Avoid unpacking Structs
       case seq: Traversable[_] => seq.map {
         case e: Expression => transformExpressionDown(e)
@@ -117,7 +117,7 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy
     val newArgs = productIterator.map {
       case e: Expression => transformExpressionUp(e)
       case Some(e: Expression) => Some(transformExpressionUp(e))
-      case m: Map[_,_] => m
+      case m: Map[_, _] => m
       case d: DataType => d // Avoid unpacking Structs
       case seq: Traversable[_] => seq.map {
         case e: Expression => transformExpressionUp(e)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 01f4b6e9bb77d..33a9e55a47dee 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -93,7 +93,7 @@ case class Union(left: LogicalPlan, right: LogicalPlan) extends BinaryNode {
 
   override lazy val resolved: Boolean =
     childrenResolved &&
-    left.output.zip(right.output).forall { case (l,r) => l.dataType == r.dataType }
+    left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType }
 
   override def statistics: Statistics = {
     val sizeInBytes = left.statistics.sizeInBytes + right.statistics.sizeInBytes
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 28e15566f0961..36d005d0e1684 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -254,7 +254,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
         } else {
           Some(arg)
         }
-      case m: Map[_,_] => m
+      case m: Map[_, _] => m
       case d: DataType => d // Avoid unpacking Structs
       case args: Traversable[_] => args.map {
         case arg: TreeNode[_] if children contains arg =>
@@ -311,7 +311,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
         } else {
           Some(arg)
         }
-      case m: Map[_,_] => m
+      case m: Map[_, _] => m
       case d: DataType => d // Avoid unpacking Structs
       case args: Traversable[_] => args.map {
         case arg: TreeNode[_] if children contains arg =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala
index 9d613a940ee86..07054166a5e88 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala
@@ -83,7 +83,7 @@ package object util {
   }
 
   def resourceToString(
-      resource:String,
+      resource: String,
       encoding: String = "UTF-8",
       classLoader: ClassLoader = Utils.getSparkClassLoader): String = {
     new String(resourceToBytes(resource, classLoader), encoding)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index a0b261649f66f..54604808e133e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -271,7 +271,7 @@ object DataType {
 
     protected lazy val structField: Parser[StructField] =
       ("StructField(" ~> "[a-zA-Z0-9_]*".r) ~ ("," ~> dataType) ~ ("," ~> boolVal <~ ")") ^^ {
-        case name ~ tpe ~ nullable  =>
+        case name ~ tpe ~ nullable =>
           StructField(name, tpe, nullable = nullable)
       }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
index bbc0b661a0c0c..7ff51db76b6bb 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
@@ -253,7 +253,7 @@ class ScalaReflectionSuite extends FunSuite {
     }
 
     assert(ArrayType(IntegerType) === typeOfObject3(Seq(1, 2, 3)))
-    assert(ArrayType(ArrayType(IntegerType)) === typeOfObject3(Seq(Seq(1,2,3))))
+    assert(ArrayType(ArrayType(IntegerType)) === typeOfObject3(Seq(Seq(1, 2, 3))))
   }
 
   test("convert PrimitiveData to catalyst") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala
index 890ea2a84b82e..9eed15952d82b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala
@@ -28,7 +28,7 @@ private[sql] case class TestCommand(cmd: String) extends LogicalPlan with Comman
 }
 
 private[sql] class SuperLongKeywordTestParser extends AbstractSparkSQLParser {
-  protected val EXECUTE   = Keyword("THISISASUPERLONGKEYWORDTEST")
+  protected val EXECUTE = Keyword("THISISASUPERLONGKEYWORDTEST")
 
   override protected lazy val start: Parser[LogicalPlan] = set
 
@@ -39,7 +39,7 @@ private[sql] class SuperLongKeywordTestParser extends AbstractSparkSQLParser {
 }
 
 private[sql] class CaseInsensitiveTestParser extends AbstractSparkSQLParser {
-  protected val EXECUTE   = Keyword("EXECUTE")
+  protected val EXECUTE = Keyword("EXECUTE")
 
   override protected lazy val start: Parser[LogicalPlan] = set
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 939cefb71b817..fcff24ca31486 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -155,7 +155,7 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter {
       caseSensitive: Boolean = true): Unit = {
     test(name) {
       val error = intercept[AnalysisException] {
-        if(caseSensitive) {
+        if (caseSensitive) {
           caseSensitiveAnalyze(plan)
         } else {
           caseInsensitiveAnalyze(plan)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index 5c4a1527c27c9..a14f776b1eaee 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -43,8 +43,8 @@ class ExpressionEvaluationBaseSuite extends FunSuite {
     val actual = try evaluate(expression, inputRow) catch {
       case e: Exception => fail(s"Exception evaluating $expression", e)
     }
-    if(actual != expected) {
-      val input = if(inputRow == EmptyRow) "" else s", input: $inputRow"
+    if (actual != expected) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
     }
   }
@@ -126,37 +126,37 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
   }
 
   booleanLogicTest("AND", _ && _,
-    (true,  true,  true) ::
-    (true,  false, false) ::
-    (true,  null,  null) ::
-    (false, true,  false) ::
+    (true, true, true) ::
+    (true, false, false) ::
+    (true, null, null) ::
+    (false, true, false) ::
     (false, false, false) ::
-    (false, null,  false) ::
-    (null,  true,  null) ::
-    (null,  false, false) ::
-    (null,  null,  null) :: Nil)
+    (false, null, false) ::
+    (null, true, null) ::
+    (null, false, false) ::
+    (null, null, null) :: Nil)
 
   booleanLogicTest("OR", _ || _,
-    (true,  true,  true) ::
-    (true,  false, true) ::
-    (true,  null,  true) ::
-    (false, true,  true) ::
+    (true, true, true) ::
+    (true, false, true) ::
+    (true, null, true) ::
+    (false, true, true) ::
     (false, false, false) ::
-    (false, null,  null) ::
-    (null,  true,  true) ::
-    (null,  false, null) ::
-    (null,  null,  null) :: Nil)
+    (false, null, null) ::
+    (null, true, true) ::
+    (null, false, null) ::
+    (null, null, null) :: Nil)
 
   booleanLogicTest("=", _ === _,
-    (true,  true,  true) ::
-    (true,  false, false) ::
-    (true,  null,  null) ::
-    (false, true,  false) ::
+    (true, true, true) ::
+    (true, false, false) ::
+    (true, null, null) ::
+    (false, true, false) ::
     (false, false, true) ::
-    (false, null,  null) ::
-    (null,  true,  null) ::
-    (null,  false, null) ::
-    (null,  null,  null) :: Nil)
+    (false, null, null) ::
+    (null, true, null) ::
+    (null, false, null) ::
+    (null, null, null) :: Nil)
 
   def booleanLogicTest(
       name: String,
@@ -164,7 +164,7 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
       truthTable: Seq[(Any, Any, Any)]) {
     test(s"3VL $name") {
       truthTable.foreach {
-        case (l,r,answer) =>
+        case (l, r, answer) =>
           val expr = op(Literal.create(l, BooleanType), Literal.create(r, BooleanType))
           checkEvaluation(expr, answer)
       }
@@ -928,7 +928,7 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
         :: StructField("b", StringType, nullable = false) :: Nil
     )
 
-    assert(getStructField(BoundReference(2,typeS, nullable = true), "a").nullable === true)
+    assert(getStructField(BoundReference(2, typeS, nullable = true), "a").nullable === true)
     assert(getStructField(BoundReference(2, typeS_notNullable, nullable = false), "a").nullable
       === false)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
index b5ebe4b38e337..d7c437095e395 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
@@ -41,9 +41,9 @@ class GeneratedEvaluationSuite extends ExpressionEvaluationSuite {
           """.stripMargin)
     }
 
-    val actual  = plan(inputRow).apply(0)
-    if(actual != expected) {
-      val input = if(inputRow == EmptyRow) "" else s", input: $inputRow"
+    val actual = plan(inputRow).apply(0)
+    if (actual != expected) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
     }
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala
index 97af2e0fd0502..a40324b008e16 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala
@@ -53,7 +53,7 @@ class GeneratedMutableEvaluationSuite extends ExpressionEvaluationSuite {
         """.stripMargin)
     }
     if (actual != expectedRow) {
-      val input = if(inputRow == EmptyRow) "" else s", input: $inputRow"
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
     }
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala
index 6255578d7fa57..465a5e6914204 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala
@@ -78,9 +78,9 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper {
   test("(a && b && c && ...) || (a && b && d && ...) || (a && b && e && ...) ...") {
     checkCondition('b > 3 || 'c > 5, 'b > 3 || 'c > 5)
 
-    checkCondition(('a < 2 && 'a > 3 && 'b > 5) || 'a < 2,  'a < 2)
+    checkCondition(('a < 2 && 'a > 3 && 'b > 5) || 'a < 2, 'a < 2)
 
-    checkCondition('a < 2 || ('a < 2 && 'a > 3 && 'b > 5),  'a < 2)
+    checkCondition('a < 2 || ('a < 2 && 'a > 3 && 'b > 5), 'a < 2)
 
     val input = ('a === 'b && 'b > 3 && 'c > 2) ||
       ('a === 'b && 'c < 1 && 'a === 5) ||
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
index be33cb9bb8eaa..ff25470bf0946 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -97,7 +97,7 @@ class FilterPushdownSuite extends PlanTest {
   test("column pruning for Project(ne, Limit)") {
     val originalQuery =
       testRelation
-        .select('a,'b)
+        .select('a, 'b)
         .limit(2)
         .select('a)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala
index 3eb399e68e70c..11b0859d3f066 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala
@@ -46,7 +46,7 @@ class OptimizeInSuite extends PlanTest {
   test("OptimizedIn test: In clause optimized to InSet") {
     val originalQuery =
       testRelation
-        .where(In(UnresolvedAttribute("a"), Seq(Literal(1),Literal(2))))
+        .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2))))
         .analyze
 
     val optimized = Optimize.execute(originalQuery.analyze)
@@ -61,13 +61,13 @@ class OptimizeInSuite extends PlanTest {
   test("OptimizedIn test: In clause not optimized in case filter has attributes") {
     val originalQuery =
       testRelation
-        .where(In(UnresolvedAttribute("a"), Seq(Literal(1),Literal(2), UnresolvedAttribute("b"))))
+        .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2), UnresolvedAttribute("b"))))
         .analyze
 
     val optimized = Optimize.execute(originalQuery.analyze)
     val correctAnswer =
       testRelation
-        .where(In(UnresolvedAttribute("a"), Seq(Literal(1),Literal(2), UnresolvedAttribute("b"))))
+        .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2), UnresolvedAttribute("b"))))
         .analyze
 
     comparePlans(optimized, correctAnswer)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala
index a3ad200800b02..35f50be46b76f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala
@@ -33,8 +33,8 @@ class UnionPushdownSuite extends PlanTest {
         UnionPushdown) :: Nil
   }
 
-  val testRelation =  LocalRelation('a.int, 'b.int, 'c.int)
-  val testRelation2 =  LocalRelation('d.int, 'e.int, 'f.int)
+  val testRelation = LocalRelation('a.int, 'b.int, 'c.int)
+  val testRelation2 = LocalRelation('d.int, 'e.int, 'f.int)
   val testUnion = Union(testRelation, testRelation2)
 
   test("union: filter to each side") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
index e5f77dcd962a4..9fcfc51c96139 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
@@ -91,7 +91,7 @@ class TreeNodeSuite extends FunSuite {
   test("transform works on nodes with Option children") {
     val dummy1 = Dummy(Some(Literal.create("1", StringType)))
     val dummy2 = Dummy(None)
-    val toZero: PartialFunction[Expression, Expression] =  { case Literal(_, _) => Literal(0) }
+    val toZero: PartialFunction[Expression, Expression] = { case Literal(_, _) => Literal(0) }
 
     var actual = dummy1 transformDown toZero
     assert(actual === Dummy(Some(Literal(0))))
@@ -104,7 +104,7 @@ class TreeNodeSuite extends FunSuite {
   }
 
   test("preserves origin") {
-    CurrentOrigin.setPosition(1,1)
+    CurrentOrigin.setPosition(1, 1)
     val add = Add(Literal(1), Literal(1))
     CurrentOrigin.reset()
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index a73317c86916b..df119827812f9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -250,11 +250,11 @@ class DataTypeSuite extends FunSuite {
     expected = false)
   checkEqualsIgnoreCompatibleNullability(
     from = MapType(StringType, ArrayType(IntegerType, true), valueContainsNull = true),
-    to = MapType(StringType,  ArrayType(IntegerType, false), valueContainsNull = true),
+    to = MapType(StringType, ArrayType(IntegerType, false), valueContainsNull = true),
     expected = false)
   checkEqualsIgnoreCompatibleNullability(
     from = MapType(StringType, ArrayType(IntegerType, false), valueContainsNull = true),
-    to = MapType(StringType,  ArrayType(IntegerType, true), valueContainsNull = true),
+    to = MapType(StringType, ArrayType(IntegerType, true), valueContainsNull = true),
     expected = true)
 
 

From 7f7505d8db7759ea46e904f767c23130eff1104a Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 20:15:52 -0700
Subject: [PATCH 228/525] [SPARK-7927] whitespace fixes for core.

So we can enable a whitespace enforcement rule in the style checker to save code review time.

Author: Reynold Xin <rxin@databricks.com>

Closes #6473 from rxin/whitespace-core and squashes the following commits:

058195d [Reynold Xin] Fixed tests.
fce11e9 [Reynold Xin] [SPARK-7927] whitespace fixes for core.
---
 .../scala/org/apache/spark/Accumulators.scala |  2 +-
 .../scala/org/apache/spark/Aggregator.scala   |  4 +--
 .../scala/org/apache/spark/Partitioner.scala  |  8 ++---
 .../scala/org/apache/spark/SparkConf.scala    |  2 +-
 .../scala/org/apache/spark/SparkContext.scala | 10 +++---
 .../scala/org/apache/spark/SparkEnv.scala     |  4 +--
 .../org/apache/spark/SparkHadoopWriter.scala  | 10 +++---
 .../apache/spark/api/java/JavaRDDLike.scala   |  2 +-
 .../apache/spark/api/python/PythonRDD.scala   |  2 +-
 .../apache/spark/api/r/RBackendHandler.scala  |  4 +--
 .../scala/org/apache/spark/api/r/RRDD.scala   |  2 +-
 .../spark/broadcast/HttpBroadcast.scala       |  4 +--
 .../spark/deploy/FaultToleranceTest.scala     |  2 +-
 .../org/apache/spark/deploy/SparkSubmit.scala |  2 +-
 .../apache/spark/deploy/worker/Worker.scala   |  2 +-
 .../apache/spark/executor/TaskMetrics.scala   |  2 +-
 .../mapreduce/SparkHadoopMapReduceUtil.scala  |  2 +-
 .../spark/network/nio/BlockMessage.scala      |  2 +-
 .../spark/network/nio/BlockMessageArray.scala |  4 +--
 .../spark/network/nio/SecurityMessage.scala   |  2 +-
 .../spark/partial/GroupedCountEvaluator.scala |  6 ++--
 .../org/apache/spark/rdd/CheckpointRDD.scala  |  2 +-
 .../org/apache/spark/rdd/CoalescedRDD.scala   |  4 +--
 .../apache/spark/rdd/PairRDDFunctions.scala   |  6 ++--
 .../main/scala/org/apache/spark/rdd/RDD.scala | 12 +++----
 .../spark/rdd/SequenceFileRDDFunctions.scala  |  6 ++--
 .../org/apache/spark/rdd/SubtractedRDD.scala  |  2 +-
 .../spark/rdd/ZippedPartitionsRDD.scala       |  2 +-
 .../apache/spark/scheduler/DAGScheduler.scala |  4 +--
 .../spark/scheduler/DAGSchedulerSource.scala  |  3 +-
 .../spark/scheduler/SchedulingAlgorithm.scala |  2 +-
 .../spark/scheduler/SparkListener.scala       |  6 ++--
 .../spark/scheduler/TaskSetManager.scala      |  4 +--
 .../cluster/CoarseGrainedClusterMessage.scala |  3 +-
 .../cluster/YarnSchedulerBackend.scala        |  2 +-
 .../mesos/CoarseMesosSchedulerBackend.scala   |  2 +-
 .../cluster/mesos/MesosSchedulerBackend.scala |  2 +-
 .../mesos/MesosSchedulerBackendUtil.scala     |  2 +-
 .../status/api/v1/AllStagesResource.scala     |  4 +--
 .../spark/status/api/v1/ApiRootResource.scala |  8 ++---
 .../spark/status/api/v1/OneRDDResource.scala  |  2 +-
 .../org/apache/spark/status/api/v1/api.scala  |  2 +-
 .../storage/BlockManagerSlaveEndpoint.scala   |  2 +-
 .../spark/storage/BlockManagerSource.scala    |  3 +-
 .../scala/org/apache/spark/ui/SparkUI.scala   |  2 +-
 .../scala/org/apache/spark/ui/UIUtils.scala   |  2 +-
 .../apache/spark/ui/UIWorkloadGenerator.scala |  2 +-
 .../org/apache/spark/ui/storage/RDDPage.scala |  2 +-
 .../org/apache/spark/util/AkkaUtils.scala     |  2 +-
 .../spark/util/CompletionIterator.scala       |  2 +-
 .../org/apache/spark/util/Distribution.scala  |  4 +--
 .../apache/spark/util/MetadataCleaner.scala   |  2 +-
 .../org/apache/spark/util/MutablePair.scala   |  2 +-
 .../org/apache/spark/util/SizeEstimator.scala | 16 ++++-----
 .../scala/org/apache/spark/util/Utils.scala   |  2 +-
 .../apache/spark/util/collection/BitSet.scala |  2 +-
 .../util/collection/SortDataFormat.scala      |  4 +--
 .../util/random/StratifiedSamplingUtils.scala |  2 +-
 .../org/apache/spark/AccumulatorSuite.scala   |  2 +-
 .../org/apache/spark/CheckpointSuite.scala    |  4 +--
 .../apache/spark/ContextCleanerSuite.scala    |  6 ++--
 .../scala/org/apache/spark/FailureSuite.scala |  2 +-
 .../org/apache/spark/FileServerSuite.scala    | 20 +++++------
 .../scala/org/apache/spark/FileSuite.scala    |  2 +-
 .../apache/spark/ImplicitOrderingSuite.scala  |  4 +--
 .../org/apache/spark/SparkConfSuite.scala     | 12 +++----
 .../org/apache/spark/SparkContextSuite.scala  | 14 ++++----
 .../spark/broadcast/BroadcastSuite.scala      |  2 +-
 .../deploy/worker/WorkerArgumentsTest.scala   |  2 +-
 .../spark/deploy/worker/WorkerSuite.scala     |  2 +-
 .../metrics/InputOutputMetricsSuite.scala     |  6 ++--
 .../spark/rdd/PairRDDFunctionsSuite.scala     | 10 +++---
 .../scala/org/apache/spark/rdd/RDDSuite.scala | 34 +++++++++----------
 .../org/apache/spark/rdd/RDDSuiteUtils.scala  |  6 ++--
 .../org/apache/spark/rdd/SortingSuite.scala   |  4 +--
 .../org/apache/spark/rpc/RpcEnvSuite.scala    |  6 ++--
 .../CoarseGrainedSchedulerBackendSuite.scala  |  4 +--
 .../spark/scheduler/DAGSchedulerSuite.scala   |  6 ++--
 .../apache/spark/scheduler/PoolSuite.scala    |  4 +--
 .../serializer/KryoSerializerSuite.scala      |  4 +--
 .../ProactiveClosureSerializationSuite.scala  | 10 +++---
 .../spark/serializer/TestSerializer.scala     | 11 +++---
 .../spark/storage/FlatmapIteratorSuite.scala  |  6 ++--
 .../org/apache/spark/ui/UISeleniumSuite.scala |  8 ++---
 .../spark/ui/storage/StorageTabSuite.scala    |  2 +-
 .../apache/spark/util/AkkaUtilsSuite.scala    |  2 +-
 .../org/apache/spark/util/UtilsSuite.scala    |  2 +-
 .../spark/util/collection/BitSetSuite.scala   |  2 +-
 88 files changed, 205 insertions(+), 203 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala
index 330df1d59a9b1..5a8d17bd99933 100644
--- a/core/src/main/scala/org/apache/spark/Accumulators.scala
+++ b/core/src/main/scala/org/apache/spark/Accumulators.scala
@@ -228,7 +228,7 @@ GrowableAccumulableParam[R <% Growable[T] with TraversableOnce[T] with Serializa
  * @tparam T result type
  */
 class Accumulator[T](@transient initialValue: T, param: AccumulatorParam[T], name: Option[String])
-    extends Accumulable[T,T](initialValue, param, name) {
+  extends Accumulable[T, T](initialValue, param, name) {
 
   def this(initialValue: T, param: AccumulatorParam[T]) = this(initialValue, param, None)
 }
diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala
index af9765d313e9e..b8a5f5016860f 100644
--- a/core/src/main/scala/org/apache/spark/Aggregator.scala
+++ b/core/src/main/scala/org/apache/spark/Aggregator.scala
@@ -45,7 +45,7 @@ case class Aggregator[K, V, C] (
   def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]],
                          context: TaskContext): Iterator[(K, C)] = {
     if (!isSpillEnabled) {
-      val combiners = new AppendOnlyMap[K,C]
+      val combiners = new AppendOnlyMap[K, C]
       var kv: Product2[K, V] = null
       val update = (hadValue: Boolean, oldValue: C) => {
         if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2)
@@ -76,7 +76,7 @@ case class Aggregator[K, V, C] (
     : Iterator[(K, C)] =
   {
     if (!isSpillEnabled) {
-      val combiners = new AppendOnlyMap[K,C]
+      val combiners = new AppendOnlyMap[K, C]
       var kc: Product2[K, C] = null
       val update = (hadValue: Boolean, oldValue: C) => {
         if (hadValue) mergeCombiners(oldValue, kc._2) else kc._2
diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala
index b8d244408bc5b..82889bcd30988 100644
--- a/core/src/main/scala/org/apache/spark/Partitioner.scala
+++ b/core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -103,7 +103,7 @@ class HashPartitioner(partitions: Int) extends Partitioner {
  */
 class RangePartitioner[K : Ordering : ClassTag, V](
     @transient partitions: Int,
-    @transient rdd: RDD[_ <: Product2[K,V]],
+    @transient rdd: RDD[_ <: Product2[K, V]],
     private var ascending: Boolean = true)
   extends Partitioner {
 
@@ -185,7 +185,7 @@ class RangePartitioner[K : Ordering : ClassTag, V](
   }
 
   override def equals(other: Any): Boolean = other match {
-    case r: RangePartitioner[_,_] =>
+    case r: RangePartitioner[_, _] =>
       r.rangeBounds.sameElements(rangeBounds) && r.ascending == ascending
     case _ =>
       false
@@ -249,7 +249,7 @@ private[spark] object RangePartitioner {
    * @param sampleSizePerPartition max sample size per partition
    * @return (total number of items, an array of (partitionId, number of items, sample))
    */
-  def sketch[K:ClassTag](
+  def sketch[K : ClassTag](
       rdd: RDD[K],
       sampleSizePerPartition: Int): (Long, Array[(Int, Int, Array[K])]) = {
     val shift = rdd.id
@@ -272,7 +272,7 @@ private[spark] object RangePartitioner {
    * @param partitions number of partitions
    * @return selected bounds
    */
-  def determineBounds[K:Ordering:ClassTag](
+  def determineBounds[K : Ordering : ClassTag](
       candidates: ArrayBuffer[(K, Float)],
       partitions: Int): Array[K] = {
     val ordering = implicitly[Ordering[K]]
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index b5e5d6f1465f3..4b5bcb54aa873 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -481,7 +481,7 @@ private[spark] object SparkConf extends Logging {
           "are no longer accepted. To specify the equivalent now, one may use '64k'.")
     )
     
-    Map(configs.map { cfg => (cfg.key -> cfg) }:_*)
+    Map(configs.map { cfg => (cfg.key -> cfg) } : _*)
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index ea6c0dea08e47..a453c9bf4864a 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -389,7 +389,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
 
     _conf.set("spark.executor.id", SparkContext.DRIVER_IDENTIFIER)
 
-    _jars =_conf.getOption("spark.jars").map(_.split(",")).map(_.filter(_.size != 0)).toSeq.flatten
+    _jars = _conf.getOption("spark.jars").map(_.split(",")).map(_.filter(_.size != 0)).toSeq.flatten
     _files = _conf.getOption("spark.files").map(_.split(",")).map(_.filter(_.size != 0))
       .toSeq.flatten
 
@@ -438,7 +438,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     _ui =
       if (conf.getBoolean("spark.ui.enabled", true)) {
         Some(SparkUI.createLiveUI(this, _conf, listenerBus, _jobProgressListener,
-          _env.securityManager,appName, startTime = startTime))
+          _env.securityManager, appName, startTime = startTime))
       } else {
         // For tests, do not enable the UI
         None
@@ -917,7 +917,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       classOf[FixedLengthBinaryInputFormat],
       classOf[LongWritable],
       classOf[BytesWritable],
-      conf=conf)
+      conf = conf)
     val data = br.map { case (k, v) =>
       val bytes = v.getBytes
       assert(bytes.length == recordLength, "Byte array does not have correct length")
@@ -1267,7 +1267,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    */
   def accumulableCollection[R <% Growable[T] with TraversableOnce[T] with Serializable: ClassTag, T]
       (initialValue: R): Accumulable[R, T] = {
-    val param = new GrowableAccumulableParam[R,T]
+    val param = new GrowableAccumulableParam[R, T]
     val acc = new Accumulable(initialValue, param)
     cleaner.foreach(_.registerAccumulatorForCleanup(acc))
     acc
@@ -1316,7 +1316,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     val uri = new URI(path)
     val schemeCorrectedPath = uri.getScheme match {
       case null | "local" => new File(path).getCanonicalFile.toURI.toString
-      case _              => path
+      case _ => path
     }
 
     val hadoopPath = new Path(schemeCorrectedPath)
diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala
index 327114542880d..a185954089528 100644
--- a/core/src/main/scala/org/apache/spark/SparkEnv.scala
+++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -298,7 +298,7 @@ object SparkEnv extends Logging {
       }
     }
 
-    val mapOutputTracker =  if (isDriver) {
+    val mapOutputTracker = if (isDriver) {
       new MapOutputTrackerMaster(conf)
     } else {
       new MapOutputTrackerWorker(conf)
@@ -348,7 +348,7 @@ object SparkEnv extends Logging {
         val fileServerPort = conf.getInt("spark.fileserver.port", 0)
         val server = new HttpFileServer(conf, securityManager, fileServerPort)
         server.initialize()
-        conf.set("spark.fileserver.uri",  server.serverUri)
+        conf.set("spark.fileserver.uri", server.serverUri)
         server
       } else {
         null
diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala
index 2ec42d3aea169..59ac82ccec53b 100644
--- a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala
+++ b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala
@@ -50,8 +50,8 @@ class SparkHadoopWriter(@transient jobConf: JobConf)
   private var jID: SerializableWritable[JobID] = null
   private var taID: SerializableWritable[TaskAttemptID] = null
 
-  @transient private var writer: RecordWriter[AnyRef,AnyRef] = null
-  @transient private var format: OutputFormat[AnyRef,AnyRef] = null
+  @transient private var writer: RecordWriter[AnyRef, AnyRef] = null
+  @transient private var format: OutputFormat[AnyRef, AnyRef] = null
   @transient private var committer: OutputCommitter = null
   @transient private var jobContext: JobContext = null
   @transient private var taskContext: TaskAttemptContext = null
@@ -114,10 +114,10 @@ class SparkHadoopWriter(@transient jobConf: JobConf)
 
   // ********* Private Functions *********
 
-  private def getOutputFormat(): OutputFormat[AnyRef,AnyRef] = {
+  private def getOutputFormat(): OutputFormat[AnyRef, AnyRef] = {
     if (format == null) {
       format = conf.value.getOutputFormat()
-        .asInstanceOf[OutputFormat[AnyRef,AnyRef]]
+        .asInstanceOf[OutputFormat[AnyRef, AnyRef]]
     }
     format
   }
@@ -138,7 +138,7 @@ class SparkHadoopWriter(@transient jobConf: JobConf)
 
   private def getTaskContext(): TaskAttemptContext = {
     if (taskContext == null) {
-      taskContext =  newTaskAttemptContext(conf.value, taID.value)
+      taskContext = newTaskAttemptContext(conf.value, taID.value)
     }
     taskContext
   }
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index 74db7643224f5..b8e15f38a20d2 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -96,7 +96,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
   def mapPartitionsWithIndex[R](
       f: JFunction2[jl.Integer, java.util.Iterator[T], java.util.Iterator[R]],
       preservesPartitioning: Boolean = false): JavaRDD[R] =
-    new JavaRDD(rdd.mapPartitionsWithIndex(((a,b) => f(a,asJavaIterator(b))),
+    new JavaRDD(rdd.mapPartitionsWithIndex(((a, b) => f(a, asJavaIterator(b))),
         preservesPartitioning)(fakeClassTag))(fakeClassTag)
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 2d92f6a42b308..a77bf42ce1d38 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -723,7 +723,7 @@ private[spark] object PythonRDD extends Logging {
     val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
       new JavaToWritableConverter)
     val fc = Utils.classForName(outputFormatClass).asInstanceOf[Class[F]]
-    converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec=codec)
+    converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec = codec)
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
index 0075d963711f1..026a1b9380357 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
@@ -124,7 +124,7 @@ private[r] class RBackendHandler(server: RBackend)
           }
           throw new Exception(s"No matched method found for $cls.$methodName")
         }
-        val ret = methods.head.invoke(obj, args:_*)
+        val ret = methods.head.invoke(obj, args : _*)
 
         // Write status bit
         writeInt(dos, 0)
@@ -135,7 +135,7 @@ private[r] class RBackendHandler(server: RBackend)
           matchMethod(numArgs, args, x.getParameterTypes)
         }.head
 
-        val obj = ctor.newInstance(args:_*)
+        val obj = ctor.newInstance(args : _*)
 
         writeInt(dos, 0)
         writeObject(dos, obj.asInstanceOf[AnyRef])
diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
index 06247f7e8b78c..e020458888e4a 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
@@ -309,7 +309,7 @@ private class StringRRDD[T: ClassTag](
 }
 
 private object SpecialLengths {
-  val TIMING_DATA   = -1
+  val TIMING_DATA = -1
 }
 
 private[r] class BufferedStreamThread(
diff --git a/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
index 4457c75e8b0fc..b69af639f7862 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
@@ -125,7 +125,7 @@ private[broadcast] object HttpBroadcast extends Logging {
         securityManager = securityMgr
         if (isDriver) {
           createServer(conf)
-          conf.set("spark.httpBroadcast.uri",  serverUri)
+          conf.set("spark.httpBroadcast.uri", serverUri)
         }
         serverUri = conf.get("spark.httpBroadcast.uri")
         cleaner = new MetadataCleaner(MetadataCleanerType.HTTP_BROADCAST, cleanup, conf)
@@ -187,7 +187,7 @@ private[broadcast] object HttpBroadcast extends Logging {
   }
 
   private def read[T: ClassTag](id: Long): T = {
-    logDebug("broadcast read server: " +  serverUri + " id: broadcast-" + id)
+    logDebug("broadcast read server: " + serverUri + " id: broadcast-" + id)
     val url = serverUri + "/" + BroadcastBlockId(id).name
 
     var uc: URLConnection = null
diff --git a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala
index c048b78910f38..b4edb6109e839 100644
--- a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala
@@ -65,7 +65,7 @@ private object FaultToleranceTest extends App with Logging {
   private val workers = ListBuffer[TestWorkerInfo]()
   private var sc: SparkContext = _
 
-  private val zk =  SparkCuratorUtil.newClient(conf)
+  private val zk = SparkCuratorUtil.newClient(conf)
 
   private var numPassed = 0
   private var numFailed = 0
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 198371b70f14f..92bb5059a0313 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -361,7 +361,7 @@ object SparkSubmit {
         pyArchives = pythonPath.mkString(",")
       }
 
-      pyArchives = pyArchives.split(",").map { localPath=>
+      pyArchives = pyArchives.split(",").map { localPath =>
         val localURI = Utils.resolveURI(localPath)
         if (localURI.getScheme != "local") {
           args.files = mergeFileLists(args.files, localURI.toString)
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
index c8df024dda355..ebc6cd76c6afd 100755
--- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
@@ -554,7 +554,7 @@ private[deploy] object Worker extends Logging {
       conf = conf, securityManager = securityMgr)
     val masterAkkaUrls = masterUrls.map(Master.toAkkaUrl(_, AkkaUtils.protocol(actorSystem)))
     actorSystem.actorOf(Props(classOf[Worker], host, boundPort, webUiPort, cores, memory,
-      masterAkkaUrls, systemName, actorName,  workDir, conf, securityMgr), name = actorName)
+      masterAkkaUrls, systemName, actorName, workDir, conf, securityMgr), name = actorName)
     (actorSystem, boundPort)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
index 06152f16ae618..d90ae405a0849 100644
--- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
+++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
@@ -261,7 +261,7 @@ case class InputMetrics(readMethod: DataReadMethod.Value) {
    */
   private var _recordsRead: Long = _
   def recordsRead: Long = _recordsRead
-  def incRecordsRead(records: Long): Unit =  _recordsRead += records
+  def incRecordsRead(records: Long): Unit = _recordsRead += records
 
   /**
    * Invoke the bytesReadCallback and mutate bytesRead.
diff --git a/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala b/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
index cfd20392d12f1..390d148bc97f9 100644
--- a/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
+++ b/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
@@ -60,7 +60,7 @@ trait SparkHadoopMapReduceUtil {
         val taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType")
           .asInstanceOf[Class[Enum[_]]]
         val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
-          taskTypeClass, if(isMap) "MAP" else "REDUCE")
+          taskTypeClass, if (isMap) "MAP" else "REDUCE")
         val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
           classOf[Int], classOf[Int])
         ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
diff --git a/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala b/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
index b573f1a8a5fcb..1a92a799d004a 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
@@ -110,7 +110,7 @@ private[nio] class BlockMessage() {
   def getType: Int = typ
   def getId: BlockId = id
   def getData: ByteBuffer = data
-  def getLevel: StorageLevel =  level
+  def getLevel: StorageLevel = level
 
   def toBufferMessage: BufferMessage = {
     val buffers = new ArrayBuffer[ByteBuffer]()
diff --git a/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala b/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala
index 1ba25aa74aa02..7d0806f0c2580 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala
@@ -114,8 +114,8 @@ private[nio] object BlockMessageArray {
     val blockMessages =
       (0 until 10).map { i =>
         if (i % 2 == 0) {
-          val buffer =  ByteBuffer.allocate(100)
-          buffer.clear
+          val buffer = ByteBuffer.allocate(100)
+          buffer.clear()
           BlockMessage.fromPutBlock(PutBlock(TestBlockId(i.toString), buffer,
             StorageLevel.MEMORY_ONLY_SER))
         } else {
diff --git a/core/src/main/scala/org/apache/spark/network/nio/SecurityMessage.scala b/core/src/main/scala/org/apache/spark/network/nio/SecurityMessage.scala
index 747a2088a7258..232c552f9865d 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/SecurityMessage.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/SecurityMessage.scala
@@ -75,7 +75,7 @@ private[nio] class SecurityMessage extends Logging {
     for (i <- 1 to idLength) {
         idBuilder += buffer.getChar()
     }
-    connectionId  = idBuilder.toString()
+    connectionId = idBuilder.toString()
 
     val tokenLength = buffer.getInt()
     token = new Array[Byte](tokenLength)
diff --git a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala
index 3ef3cc219dec6..91b07ce3af1b6 100644
--- a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala
+++ b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala
@@ -32,12 +32,12 @@ import org.apache.spark.util.collection.OpenHashMap
  * An ApproximateEvaluator for counts by key. Returns a map of key to confidence interval.
  */
 private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
-  extends ApproximateEvaluator[OpenHashMap[T,Long], Map[T, BoundedDouble]] {
+  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {
 
   var outputsMerged = 0
-  var sums = new OpenHashMap[T,Long]()   // Sum of counts for each key
+  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key
 
-  override def merge(outputId: Int, taskResult: OpenHashMap[T,Long]) {
+  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
     outputsMerged += 1
     taskResult.foreach { case (key, value) =>
       sums.changeValue(key, value, _ + value)
diff --git a/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala
index 0d130dd4c7a60..a4715e3437d94 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala
@@ -49,7 +49,7 @@ class CheckpointRDD[T: ClassTag](sc: SparkContext, val checkpointPath: String)
     if (fs.exists(cpath)) {
       val dirContents = fs.listStatus(cpath).map(_.getPath)
       val partitionFiles = dirContents.filter(_.getName.startsWith("part-")).map(_.toString).sorted
-      val numPart =  partitionFiles.length
+      val numPart = partitionFiles.length
       if (numPart > 0 && (! partitionFiles(0).endsWith(CheckpointRDD.splitIdToFile(0)) ||
           ! partitionFiles(numPart-1).endsWith(CheckpointRDD.splitIdToFile(numPart-1)))) {
         throw new SparkException("Invalid checkpoint directory: " + checkpointPath)
diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
index 0c1b02c07d09f..663eebb8e4191 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
@@ -310,11 +310,11 @@ private class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanceSlack:
   def throwBalls() {
     if (noLocality) {  // no preferredLocations in parent RDD, no randomization needed
       if (maxPartitions > groupArr.size) { // just return prev.partitions
-        for ((p,i) <- prev.partitions.zipWithIndex) {
+        for ((p, i) <- prev.partitions.zipWithIndex) {
           groupArr(i).arr += p
         }
       } else { // no locality available, then simply split partitions based on positions in array
-        for(i <- 0 until maxPartitions) {
+        for (i <- 0 until maxPartitions) {
           val rangeStart = ((i.toLong * prev.partitions.length) / maxPartitions).toInt
           val rangeEnd = (((i.toLong + 1) * prev.partitions.length) / maxPartitions).toInt
           (rangeStart until rangeEnd).foreach{ j => groupArr(i).arr += prev.partitions(j) }
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 8653cdee1adee..004899f27b7a6 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -467,7 +467,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     val mergeValue = (buf: CompactBuffer[V], v: V) => buf += v
     val mergeCombiners = (c1: CompactBuffer[V], c2: CompactBuffer[V]) => c1 ++= c2
     val bufs = combineByKey[CompactBuffer[V]](
-      createCombiner, mergeValue, mergeCombiners, partitioner, mapSideCombine=false)
+      createCombiner, mergeValue, mergeCombiners, partitioner, mapSideCombine = false)
     bufs.asInstanceOf[RDD[(K, Iterable[V])]]
   }
 
@@ -1011,7 +1011,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       jobFormat.checkOutputSpecs(job)
     }
 
-    val writeShard = (context: TaskContext, iter: Iterator[(K,V)]) => {
+    val writeShard = (context: TaskContext, iter: Iterator[(K, V)]) => {
       val config = wrappedConf.value
       /* "reduce task" <split #> <attempt # = spark task #> */
       val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId,
@@ -1027,7 +1027,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
 
       val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context)
 
-      val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K,V]]
+      val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K, V]]
       require(writer != null, "Unable to obtain RecordWriter")
       var recordsWritten = 0L
       Utils.tryWithSafeFinally {
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index d772f03f76651..5fcef255e13af 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -454,7 +454,7 @@ abstract class RDD[T: ClassTag](
       withReplacement: Boolean,
       num: Int,
       seed: Long = Utils.random.nextLong): Array[T] = {
-    val numStDev =  10.0
+    val numStDev = 10.0
 
     if (num < 0) {
       throw new IllegalArgumentException("Negative number of elements requested")
@@ -1138,8 +1138,8 @@ abstract class RDD[T: ClassTag](
     if (elementClassTag.runtimeClass.isArray) {
       throw new SparkException("countByValueApprox() does not support arrays")
     }
-    val countPartition: (TaskContext, Iterator[T]) => OpenHashMap[T,Long] = { (ctx, iter) =>
-      val map = new OpenHashMap[T,Long]
+    val countPartition: (TaskContext, Iterator[T]) => OpenHashMap[T, Long] = { (ctx, iter) =>
+      val map = new OpenHashMap[T, Long]
       iter.foreach {
         t => map.changeValue(t, 1L, _ + 1L)
       }
@@ -1585,15 +1585,15 @@ abstract class RDD[T: ClassTag](
         case 0 => Seq.empty
         case 1 =>
           val d = rdd.dependencies.head
-          debugString(d.rdd, prefix, d.isInstanceOf[ShuffleDependency[_,_,_]], true)
+          debugString(d.rdd, prefix, d.isInstanceOf[ShuffleDependency[_, _, _]], true)
         case _ =>
           val frontDeps = rdd.dependencies.take(len - 1)
           val frontDepStrings = frontDeps.flatMap(
-            d => debugString(d.rdd, prefix, d.isInstanceOf[ShuffleDependency[_,_,_]]))
+            d => debugString(d.rdd, prefix, d.isInstanceOf[ShuffleDependency[_, _, _]]))
 
           val lastDep = rdd.dependencies.last
           val lastDepStrings =
-            debugString(lastDep.rdd, prefix, lastDep.isInstanceOf[ShuffleDependency[_,_,_]], true)
+            debugString(lastDep.rdd, prefix, lastDep.isInstanceOf[ShuffleDependency[_, _, _]], true)
 
           (frontDepStrings ++ lastDepStrings)
       }
diff --git a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala
index 3dfcf67f0eb66..4b5f15dd06b85 100644
--- a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala
@@ -104,13 +104,13 @@ class SequenceFileRDDFunctions[K <% Writable: ClassTag, V <% Writable : ClassTag
     if (!convertKey && !convertValue) {
       self.saveAsHadoopFile(path, keyWritableClass, valueWritableClass, format, jobConf, codec)
     } else if (!convertKey && convertValue) {
-      self.map(x => (x._1,anyToWritable(x._2))).saveAsHadoopFile(
+      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
         path, keyWritableClass, valueWritableClass, format, jobConf, codec)
     } else if (convertKey && !convertValue) {
-      self.map(x => (anyToWritable(x._1),x._2)).saveAsHadoopFile(
+      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
         path, keyWritableClass, valueWritableClass, format, jobConf, codec)
     } else if (convertKey && convertValue) {
-      self.map(x => (anyToWritable(x._1),anyToWritable(x._2))).saveAsHadoopFile(
+      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
         path, keyWritableClass, valueWritableClass, format, jobConf, codec)
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala
index 633aeba3bbae6..f7cb1791d4ac6 100644
--- a/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala
@@ -125,7 +125,7 @@ private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag](
     integrate(0, t => getSeq(t._1) += t._2)
     // the second dep is rdd2; remove all of its keys
     integrate(1, t => map.remove(t._1))
-    map.iterator.map { t =>  t._2.iterator.map { (t._1, _) } }.flatten
+    map.iterator.map { t => t._2.iterator.map { (t._1, _) } }.flatten
   }
 
   override def clearDependencies() {
diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala
index a96b6c3d23454..81f40ad33aa5d 100644
--- a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala
@@ -123,7 +123,7 @@ private[spark] class ZippedPartitionsRDD3
 }
 
 private[spark] class ZippedPartitionsRDD4
-  [A: ClassTag, B: ClassTag, C: ClassTag, D:ClassTag, V: ClassTag](
+  [A: ClassTag, B: ClassTag, C: ClassTag, D: ClassTag, V: ClassTag](
     sc: SparkContext,
     var f: (Iterator[A], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V],
     var rdd1: RDD[A],
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index a2299e907c5ae..75a567fb31520 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -1367,10 +1367,10 @@ class DAGScheduler(
   private def getPreferredLocsInternal(
       rdd: RDD[_],
       partition: Int,
-      visited: HashSet[(RDD[_],Int)]): Seq[TaskLocation] = {
+      visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
     // If the partition has already been visited, no need to re-visit.
     // This avoids exponential path exploration.  SPARK-695
-    if (!visited.add((rdd,partition))) {
+    if (!visited.add((rdd, partition))) {
       // Nil has already been returned for previously visited partitions.
       return Nil
     }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerSource.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerSource.scala
index 12668b6c0988e..02c67073af6a0 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerSource.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerSource.scala
@@ -17,9 +17,8 @@
 
 package org.apache.spark.scheduler
 
-import com.codahale.metrics.{Gauge,MetricRegistry}
+import com.codahale.metrics.{Gauge, MetricRegistry}
 
-import org.apache.spark.SparkContext
 import org.apache.spark.metrics.source.Source
 
 private[spark] class DAGSchedulerSource(val dagScheduler: DAGScheduler)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulingAlgorithm.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulingAlgorithm.scala
index 5e62c8468f007..864941d468af9 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/SchedulingAlgorithm.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulingAlgorithm.scala
@@ -56,7 +56,7 @@ private[spark] class FairSchedulingAlgorithm extends SchedulingAlgorithm {
     val minShareRatio2 = runningTasks2.toDouble / math.max(minShare2, 1.0).toDouble
     val taskToWeightRatio1 = runningTasks1.toDouble / s1.weight.toDouble
     val taskToWeightRatio2 = runningTasks2.toDouble / s2.weight.toDouble
-    var compare:Int = 0
+    var compare: Int = 0
 
     if (s1Needy && !s2Needy) {
       return true
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
index 863d0befbc19e..9620915f495ab 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
@@ -270,7 +270,7 @@ class StatsReportListener extends SparkListener with Logging {
 private[spark] object StatsReportListener extends Logging {
 
   // For profiling, the extremes are more interesting
-  val percentiles = Array[Int](0,5,10,25,50,75,90,95,100)
+  val percentiles = Array[Int](0, 5, 10, 25, 50, 75, 90, 95, 100)
   val probabilities = percentiles.map(_ / 100.0)
   val percentilesHeader = "\t" + percentiles.mkString("%\t") + "%"
 
@@ -304,7 +304,7 @@ private[spark] object StatsReportListener extends Logging {
     dOpt.foreach { d => showDistribution(heading, d, formatNumber)}
   }
 
-  def showDistribution(heading: String, dOpt: Option[Distribution], format:String) {
+  def showDistribution(heading: String, dOpt: Option[Distribution], format: String) {
     def f(d: Double): String = format.format(d)
     showDistribution(heading, dOpt, f _)
   }
@@ -318,7 +318,7 @@ private[spark] object StatsReportListener extends Logging {
   }
 
   def showBytesDistribution(
-      heading:String,
+      heading: String,
       getMetric: (TaskInfo, TaskMetrics) => Option[Long],
       taskInfoMetrics: Seq[(TaskInfo, TaskMetrics)]) {
     showBytesDistribution(heading, extractLongDistribution(taskInfoMetrics, getMetric))
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
index c4487d5b37247..d473e51abab80 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -781,10 +781,10 @@ private[spark] class TaskSetManager(
     // that it's okay if we add a task to the same queue twice (if it had multiple preferred
     // locations), because dequeueTaskFromList will skip already-running tasks.
     for (index <- getPendingTasksForExecutor(execId)) {
-      addPendingTask(index, readding=true)
+      addPendingTask(index, readding = true)
     }
     for (index <- getPendingTasksForHost(host)) {
-      addPendingTask(index, readding=true)
+      addPendingTask(index, readding = true)
     }
 
     // Re-enqueue any tasks that ran on the failed executor if this is a shuffle map stage,
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala
index 70364cea62a80..4be1eda2e9291 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala
@@ -75,7 +75,8 @@ private[spark] object CoarseGrainedClusterMessages {
   case class SetupDriver(driver: RpcEndpointRef) extends CoarseGrainedClusterMessage
 
   // Exchanged between the driver and the AM in Yarn client mode
-  case class AddWebUIFilter(filterName:String, filterParams: Map[String, String], proxyBase: String)
+  case class AddWebUIFilter(
+      filterName: String, filterParams: Map[String, String], proxyBase: String)
     extends CoarseGrainedClusterMessage
 
   // Messages exchanged between the driver and the cluster manager for executor allocation
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
index 2a3a5d925d06f..190ff61d689d1 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
@@ -149,7 +149,7 @@ private[spark] abstract class YarnSchedulerBackend(
       }
     }
 
-    override def onStop(): Unit ={
+    override def onStop(): Unit = {
       askAmThreadPool.shutdownNow()
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
index aff086594c73f..6b8edca5aa485 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
@@ -52,7 +52,7 @@ private[spark] class CoarseMesosSchedulerBackend(
   val MAX_SLAVE_FAILURES = 2     // Blacklist a slave after this many failures
 
   // Maximum number of cores to acquire (TODO: we'll need more flexible controls here)
-  val maxCores = conf.get("spark.cores.max",  Int.MaxValue.toString).toInt
+  val maxCores = conf.get("spark.cores.max", Int.MaxValue.toString).toInt
 
   // Cores we have acquired with each Mesos task ID
   val coresByTaskId = new HashMap[Int, Int]
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
index db0a080b3b0c0..49de85ef48ada 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
@@ -146,7 +146,7 @@ private[spark] class MesosSchedulerBackend(
   private def createExecArg(): Array[Byte] = {
     if (execArgs == null) {
       val props = new HashMap[String, String]
-      for ((key,value) <- sc.conf.getAll) {
+      for ((key, value) <- sc.conf.getAll) {
         props(key) = value
       }
       // Serialize the map as an array of (String, String) pairs
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
index 928c5cfed417a..2f2934c249eb0 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
@@ -108,7 +108,7 @@ private[mesos] object MesosSchedulerBackendUtil extends Logging {
       image: String,
       volumes: Option[List[Volume]] = None,
       network: Option[ContainerInfo.DockerInfo.Network] = None,
-      portmaps: Option[List[ContainerInfo.DockerInfo.PortMapping]] = None):Unit = {
+      portmaps: Option[List[ContainerInfo.DockerInfo.PortMapping]] = None): Unit = {
 
     val docker = ContainerInfo.DockerInfo.newBuilder().setImage(image)
 
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala
index 50608588f09ae..390c136df79b3 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala
@@ -169,7 +169,7 @@ private[v1] object AllStagesResource {
 
     val outputMetrics: Option[OutputMetricDistributions] =
       new MetricHelper[InternalOutputMetrics, OutputMetricDistributions](rawMetrics, quantiles) {
-        def getSubmetrics(raw:InternalTaskMetrics): Option[InternalOutputMetrics] = {
+        def getSubmetrics(raw: InternalTaskMetrics): Option[InternalOutputMetrics] = {
           raw.outputMetrics
         }
         def build: OutputMetricDistributions = new OutputMetricDistributions(
@@ -284,7 +284,7 @@ private[v1] object AllStagesResource {
  * the options (returning None if the metrics are all empty), and extract the quantiles for each
  * metric.  After creating an instance, call metricOption to get the result type.
  */
-private[v1] abstract class MetricHelper[I,O](
+private[v1] abstract class MetricHelper[I, O](
     rawMetrics: Seq[InternalTaskMetrics],
     quantiles: Array[Double]) {
 
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
index bf2cc2e72f1fe..f73c742732dec 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
@@ -101,7 +101,7 @@ private[v1] class ApiRootResource extends UIRootFromServletContext {
 
 
   @Path("applications/{appId}/stages")
-  def getStages(@PathParam("appId") appId: String): AllStagesResource= {
+  def getStages(@PathParam("appId") appId: String): AllStagesResource = {
     uiRoot.withSparkUI(appId, None) { ui =>
       new AllStagesResource(ui)
     }
@@ -110,14 +110,14 @@ private[v1] class ApiRootResource extends UIRootFromServletContext {
   @Path("applications/{appId}/{attemptId}/stages")
   def getStages(
       @PathParam("appId") appId: String,
-      @PathParam("attemptId") attemptId: String): AllStagesResource= {
+      @PathParam("attemptId") attemptId: String): AllStagesResource = {
     uiRoot.withSparkUI(appId, Some(attemptId)) { ui =>
       new AllStagesResource(ui)
     }
   }
 
   @Path("applications/{appId}/stages/{stageId: \\d+}")
-  def getStage(@PathParam("appId") appId: String): OneStageResource= {
+  def getStage(@PathParam("appId") appId: String): OneStageResource = {
     uiRoot.withSparkUI(appId, None) { ui =>
       new OneStageResource(ui)
     }
@@ -171,7 +171,7 @@ private[spark] object ApiRootResource {
   def getServletHandler(uiRoot: UIRoot): ServletContextHandler = {
     val jerseyContext = new ServletContextHandler(ServletContextHandler.NO_SESSIONS)
     jerseyContext.setContextPath("/api")
-    val holder:ServletHolder = new ServletHolder(classOf[ServletContainer])
+    val holder: ServletHolder = new ServletHolder(classOf[ServletContainer])
     holder.setInitParameter("com.sun.jersey.config.property.resourceConfigClass",
       "com.sun.jersey.api.core.PackagesResourceConfig")
     holder.setInitParameter("com.sun.jersey.config.property.packages",
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneRDDResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneRDDResource.scala
index 07b224fac4786..dfdc09c6caf3b 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/OneRDDResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneRDDResource.scala
@@ -25,7 +25,7 @@ import org.apache.spark.ui.SparkUI
 private[v1] class OneRDDResource(ui: SparkUI) {
 
   @GET
-  def rddData(@PathParam("rddId") rddId: Int): RDDStorageInfo  = {
+  def rddData(@PathParam("rddId") rddId: Int): RDDStorageInfo = {
     AllRDDResource.getRDDStorageInfo(rddId, ui.storageListener, true).getOrElse(
       throw new NotFoundException(s"no rdd found w/ id $rddId")
     )
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala
index ef3c8570d8186..2bec64f2ef02b 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala
@@ -134,7 +134,7 @@ class StageData private[spark](
 
     val accumulatorUpdates: Seq[AccumulableInfo],
     val tasks: Option[Map[Long, TaskData]],
-    val executorSummary:Option[Map[String,ExecutorStageSummary]])
+    val executorSummary: Option[Map[String, ExecutorStageSummary]])
 
 class TaskData private[spark](
     val taskId: Long,
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala
index 543df4e1350dd..7478ab0fc2f7a 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala
@@ -40,7 +40,7 @@ class BlockManagerSlaveEndpoint(
   private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool)
 
   // Operations that involve removing blocks may be slow and should be done asynchronously
-  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit]  = {
+  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
     case RemoveBlock(blockId) =>
       doAsync[Boolean]("removing block " + blockId, context) {
         blockManager.removeBlock(blockId)
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala
index 8569c6f3cbbc3..c5ba9af3e2658 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala
@@ -17,9 +17,8 @@
 
 package org.apache.spark.storage
 
-import com.codahale.metrics.{Gauge,MetricRegistry}
+import com.codahale.metrics.{Gauge, MetricRegistry}
 
-import org.apache.spark.SparkContext
 import org.apache.spark.metrics.source.Source
 
 private[spark] class BlockManagerSource(val blockManager: BlockManager)
diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
index 0b11e914bb251..3788916cf39bb 100644
--- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
+++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
@@ -137,7 +137,7 @@ private[spark] object SparkUI {
       jobProgressListener: JobProgressListener,
       securityManager: SecurityManager,
       appName: String,
-      startTime: Long): SparkUI =  {
+      startTime: Long): SparkUI = {
     create(Some(sc), conf, listenerBus, securityManager, appName,
       jobProgressListener = Some(jobProgressListener), startTime = startTime)
   }
diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
index 6194c50ec8c7c..65162f4fdcd62 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -309,7 +309,7 @@ private[spark] object UIUtils extends Logging {
       started: Int,
       completed: Int,
       failed: Int,
-      skipped:Int,
+      skipped: Int,
       total: Int): Seq[Node] = {
     val completeWidth = "width: %s%%".format((completed.toDouble/total)*100)
     val startWidth = "width: %s%%".format((started.toDouble/total)*100)
diff --git a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala
index 5fbcd6bb8ad94..ba03acdb38cc5 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala
@@ -54,7 +54,7 @@ private[spark] object UIWorkloadGenerator {
     val sc = new SparkContext(conf)
 
     def setProperties(s: String): Unit = {
-      if(schedulingMode == SchedulingMode.FAIR) {
+      if (schedulingMode == SchedulingMode.FAIR) {
         sc.setLocalProperty("spark.scheduler.pool", s)
       }
       sc.setLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION, s)
diff --git a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
index fbce917a0824d..36943978ff594 100644
--- a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
@@ -33,7 +33,7 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") {
     val parameterId = request.getParameter("id")
     require(parameterId != null && parameterId.nonEmpty, "Missing id parameter")
     val rddId = parameterId.toInt
-    val rddStorageInfo = AllRDDResource.getRDDStorageInfo(rddId, listener,includeDetails = true)
+    val rddStorageInfo = AllRDDResource.getRDDStorageInfo(rddId, listener, includeDetails = true)
       .getOrElse {
         // Rather than crashing, render an "RDD Not Found" page
         return UIUtils.headerSparkPage("RDD Not Found", Seq[Node](), parent)
diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
index 7513b1b795dea..96aa2fe164703 100644
--- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
@@ -63,7 +63,7 @@ private[spark] object AkkaUtils extends Logging {
       conf: SparkConf,
       securityManager: SecurityManager): (ActorSystem, Int) = {
 
-    val akkaThreads   = conf.getInt("spark.akka.threads", 4)
+    val akkaThreads = conf.getInt("spark.akka.threads", 4)
     val akkaBatchSize = conf.getInt("spark.akka.batchSize", 15)
     val akkaTimeoutS = conf.getTimeAsSeconds("spark.akka.timeout",
       conf.get("spark.network.timeout", "120s"))
diff --git a/core/src/main/scala/org/apache/spark/util/CompletionIterator.scala b/core/src/main/scala/org/apache/spark/util/CompletionIterator.scala
index 9044aaeef2d48..31d230d0fec8e 100644
--- a/core/src/main/scala/org/apache/spark/util/CompletionIterator.scala
+++ b/core/src/main/scala/org/apache/spark/util/CompletionIterator.scala
@@ -42,7 +42,7 @@ abstract class CompletionIterator[ +A, +I <: Iterator[A]](sub: I) extends Iterat
 
 private[spark] object CompletionIterator {
   def apply[A, I <: Iterator[A]](sub: I, completionFunction: => Unit) : CompletionIterator[A, I] = {
-    new CompletionIterator[A,I](sub) {
+    new CompletionIterator[A, I](sub) {
       def completion(): Unit = completionFunction
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/util/Distribution.scala b/core/src/main/scala/org/apache/spark/util/Distribution.scala
index 9aea8efa38c7a..1bab707235b89 100644
--- a/core/src/main/scala/org/apache/spark/util/Distribution.scala
+++ b/core/src/main/scala/org/apache/spark/util/Distribution.scala
@@ -35,7 +35,7 @@ private[spark] class Distribution(val data: Array[Double], val startIdx: Int, va
   java.util.Arrays.sort(data, startIdx, endIdx)
   val length = endIdx - startIdx
 
-  val defaultProbabilities = Array(0,0.25,0.5,0.75,1.0)
+  val defaultProbabilities = Array(0, 0.25, 0.5, 0.75, 1.0)
 
   /**
    * Get the value of the distribution at the given probabilities.  Probabilities should be
@@ -44,7 +44,7 @@ private[spark] class Distribution(val data: Array[Double], val startIdx: Int, va
    */
   def getQuantiles(probabilities: Traversable[Double] = defaultProbabilities)
       : IndexedSeq[Double] = {
-    probabilities.toIndexedSeq.map{p:Double => data(closestIndex(p))}
+    probabilities.toIndexedSeq.map { p: Double => data(closestIndex(p)) }
   }
 
   private def closestIndex(p: Double) = {
diff --git a/core/src/main/scala/org/apache/spark/util/MetadataCleaner.scala b/core/src/main/scala/org/apache/spark/util/MetadataCleaner.scala
index 2bbfc988a99a8..a8bbad086849e 100644
--- a/core/src/main/scala/org/apache/spark/util/MetadataCleaner.scala
+++ b/core/src/main/scala/org/apache/spark/util/MetadataCleaner.scala
@@ -89,7 +89,7 @@ private[spark] object MetadataCleaner {
       conf: SparkConf,
       cleanerType: MetadataCleanerType.MetadataCleanerType,
       delay: Int) {
-    conf.set(MetadataCleanerType.systemProperty(cleanerType),  delay.toString)
+    conf.set(MetadataCleanerType.systemProperty(cleanerType), delay.toString)
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/util/MutablePair.scala b/core/src/main/scala/org/apache/spark/util/MutablePair.scala
index dad888548ed10..3d95b7869f494 100644
--- a/core/src/main/scala/org/apache/spark/util/MutablePair.scala
+++ b/core/src/main/scala/org/apache/spark/util/MutablePair.scala
@@ -45,5 +45,5 @@ case class MutablePair[@specialized(Int, Long, Double, Char, Boolean/* , AnyRef
 
   override def toString: String = "(" + _1 + "," + _2 + ")"
 
-  override def canEqual(that: Any): Boolean = that.isInstanceOf[MutablePair[_,_]]
+  override def canEqual(that: Any): Boolean = that.isInstanceOf[MutablePair[_, _]]
 }
diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
index f38949c3cb846..f1f6b5e1f93d8 100644
--- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
+++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
@@ -54,14 +54,14 @@ object SizeEstimator extends Logging {
   def estimate(obj: AnyRef): Long = estimate(obj, new IdentityHashMap[AnyRef, AnyRef])
 
   // Sizes of primitive types
-  private val BYTE_SIZE    = 1
+  private val BYTE_SIZE = 1
   private val BOOLEAN_SIZE = 1
-  private val CHAR_SIZE    = 2
-  private val SHORT_SIZE   = 2
-  private val INT_SIZE     = 4
-  private val LONG_SIZE    = 8
-  private val FLOAT_SIZE   = 4
-  private val DOUBLE_SIZE  = 8
+  private val CHAR_SIZE = 2
+  private val SHORT_SIZE = 2
+  private val INT_SIZE = 4
+  private val LONG_SIZE = 8
+  private val FLOAT_SIZE = 4
+  private val DOUBLE_SIZE = 8
 
   // Fields can be primitive types, sizes are: 1, 2, 4, 8. Or fields can be pointers. The size of
   // a pointer is 4 or 8 depending on the JVM (32-bit or 64-bit) and UseCompressedOops flag.
@@ -96,7 +96,7 @@ object SizeEstimator extends Logging {
     isCompressedOops = getIsCompressedOops
 
     objectSize = if (!is64bit) 8 else {
-      if(!isCompressedOops) {
+      if (!isCompressedOops) {
         16
       } else {
         12
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index b7a2473dfe920..763d4db690187 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -882,7 +882,7 @@ private[spark] object Utils extends Logging {
   // If not, we should change it to LRUCache or something.
   private val hostPortParseResults = new ConcurrentHashMap[String, (String, Int)]()
 
-  def parseHostPort(hostPort: String): (String,  Int) = {
+  def parseHostPort(hostPort: String): (String, Int) = {
     // Check cache first.
     val cached = hostPortParseResults.get(hostPort)
     if (cached != null) {
diff --git a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
index 41cb8cfe2afa3..9c15b1188d91c 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
@@ -161,7 +161,7 @@ class BitSet(numBits: Int) extends Serializable {
     override def hasNext: Boolean = ind >= 0
     override def next(): Int = {
       val tmp = ind
-      ind  = nextSetBit(ind + 1)
+      ind = nextSetBit(ind + 1)
       tmp
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala b/core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala
index 4f0bf8384afc9..9a7a5a4e74868 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala
@@ -90,9 +90,9 @@ class KVArraySortDataFormat[K, T <: AnyRef : ClassTag] extends SortDataFormat[K,
   override def swap(data: Array[T], pos0: Int, pos1: Int) {
     val tmpKey = data(2 * pos0)
     val tmpVal = data(2 * pos0 + 1)
-    data(2 * pos0)     = data(2 * pos1)
+    data(2 * pos0) = data(2 * pos1)
     data(2 * pos0 + 1) = data(2 * pos1 + 1)
-    data(2 * pos1)     = tmpKey
+    data(2 * pos1) = tmpKey
     data(2 * pos1 + 1) = tmpVal
   }
 
diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
index 9e29bf9d61f17..effe6fa2adcfa 100644
--- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
@@ -196,7 +196,7 @@ private[spark] object StratifiedSamplingUtils extends Logging {
    *
    * The sampling function has a unique seed per partition.
    */
-  def getBernoulliSamplingFunction[K, V](rdd: RDD[(K,  V)],
+  def getBernoulliSamplingFunction[K, V](rdd: RDD[(K, V)],
       fractions: Map[K, Double],
       exact: Boolean,
       seed: Long): (Int, Iterator[(K, V)]) => Iterator[(K, V)] = {
diff --git a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala
index 75399461f2a5f..746a40a21bf9e 100644
--- a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala
@@ -103,7 +103,7 @@ class AccumulatorSuite extends FunSuite with Matchers with LocalSparkContext {
       sc = new SparkContext("local[" + nThreads + "]", "test")
       val setAcc = sc.accumulableCollection(mutable.HashSet[Int]())
       val bufferAcc = sc.accumulableCollection(mutable.ArrayBuffer[Int]())
-      val mapAcc = sc.accumulableCollection(mutable.HashMap[Int,String]())
+      val mapAcc = sc.accumulableCollection(mutable.HashMap[Int, String]())
       val d = sc.parallelize((1 to maxI) ++ (1 to maxI))
       d.foreach {
         x => {setAcc += x; bufferAcc += x; mapAcc += (x -> x.toString)}
diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
index e1faddeabec79..91d8fdedbe0f3 100644
--- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
+++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
@@ -218,10 +218,10 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
     val pairRDD = generateFatPairRDD()
     pairRDD.checkpoint()
     val unionRDD = new PartitionerAwareUnionRDD(sc, Array(pairRDD))
-    val partitionBeforeCheckpoint =  serializeDeserialize(
+    val partitionBeforeCheckpoint = serializeDeserialize(
       unionRDD.partitions.head.asInstanceOf[PartitionerAwareUnionRDDPartition])
     pairRDD.count()
-    val partitionAfterCheckpoint =  serializeDeserialize(
+    val partitionAfterCheckpoint = serializeDeserialize(
       unionRDD.partitions.head.asInstanceOf[PartitionerAwareUnionRDDPartition])
     assert(
       partitionBeforeCheckpoint.parents.head.getClass !=
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 0922a2c3599cc..4a48f6580c78e 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -158,7 +158,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
     rdd.count()
 
     // Test that GC does not cause RDD cleanup due to a strong reference
-    val preGCTester =  new CleanerTester(sc, rddIds = Seq(rdd.id))
+    val preGCTester = new CleanerTester(sc, rddIds = Seq(rdd.id))
     runGC()
     intercept[Exception] {
       preGCTester.assertCleanup()(timeout(1000 millis))
@@ -195,7 +195,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
     var broadcast = newBroadcast()
 
     // Test that GC does not cause broadcast cleanup due to a strong reference
-    val preGCTester =  new CleanerTester(sc, broadcastIds = Seq(broadcast.id))
+    val preGCTester = new CleanerTester(sc, broadcastIds = Seq(broadcast.id))
     runGC()
     intercept[Exception] {
       preGCTester.assertCleanup()(timeout(1000 millis))
@@ -267,7 +267,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
     val shuffleIds = 0 until sc.newShuffleId
     val broadcastIds = broadcastBuffer.map(_.id)
 
-    val preGCTester =  new CleanerTester(sc, rddIds, shuffleIds, broadcastIds)
+    val preGCTester = new CleanerTester(sc, rddIds, shuffleIds, broadcastIds)
     runGC()
     intercept[Exception] {
       preGCTester.assertCleanup()(timeout(1000 millis))
diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala
index 1212d0b43207d..cade1fda2c7be 100644
--- a/core/src/test/scala/org/apache/spark/FailureSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala
@@ -57,7 +57,7 @@ class FailureSuite extends FunSuite with LocalSparkContext {
     FailureSuiteState.synchronized {
       assert(FailureSuiteState.tasksRun === 4)
     }
-    assert(results.toList === List(1,4,9))
+    assert(results.toList === List(1, 4, 9))
     FailureSuiteState.clear()
   }
 
diff --git a/core/src/test/scala/org/apache/spark/FileServerSuite.scala b/core/src/test/scala/org/apache/spark/FileServerSuite.scala
index c0439f934813e..bff2d10b9946c 100644
--- a/core/src/test/scala/org/apache/spark/FileServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FileServerSuite.scala
@@ -81,7 +81,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext {
   test("Distributing files locally") {
     sc = new SparkContext("local[4]", "test", newConf)
     sc.addFile(tmpFile.toString)
-    val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0))
+    val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
     val result = sc.parallelize(testData).reduceByKey {
       val path = SparkFiles.get("FileServerSuite.txt")
       val in = new BufferedReader(new FileReader(path))
@@ -89,7 +89,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext {
       in.close()
       _ * fileVal + _ * fileVal
     }.collect()
-    assert(result.toSet === Set((1,200), (2,300), (3,500)))
+    assert(result.toSet === Set((1, 200), (2, 300), (3, 500)))
   }
 
   test("Distributing files locally security On") {
@@ -100,7 +100,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext {
 
     sc.addFile(tmpFile.toString)
     assert(sc.env.securityManager.isAuthenticationEnabled() === true)
-    val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0))
+    val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
     val result = sc.parallelize(testData).reduceByKey {
       val path = SparkFiles.get("FileServerSuite.txt")
       val in = new BufferedReader(new FileReader(path))
@@ -108,14 +108,14 @@ class FileServerSuite extends FunSuite with LocalSparkContext {
       in.close()
       _ * fileVal + _ * fileVal
     }.collect()
-    assert(result.toSet === Set((1,200), (2,300), (3,500)))
+    assert(result.toSet === Set((1, 200), (2, 300), (3, 500)))
   }
 
   test("Distributing files locally using URL as input") {
     // addFile("file:///....")
     sc = new SparkContext("local[4]", "test", newConf)
     sc.addFile(new File(tmpFile.toString).toURI.toString)
-    val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0))
+    val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
     val result = sc.parallelize(testData).reduceByKey {
       val path = SparkFiles.get("FileServerSuite.txt")
       val in = new BufferedReader(new FileReader(path))
@@ -123,7 +123,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext {
       in.close()
       _ * fileVal + _ * fileVal
     }.collect()
-    assert(result.toSet === Set((1,200), (2,300), (3,500)))
+    assert(result.toSet === Set((1, 200), (2, 300), (3, 500)))
   }
 
   test ("Dynamically adding JARS locally") {
@@ -140,7 +140,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext {
   test("Distributing files on a standalone cluster") {
     sc = new SparkContext("local-cluster[1,1,512]", "test", newConf)
     sc.addFile(tmpFile.toString)
-    val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0))
+    val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
     val result = sc.parallelize(testData).reduceByKey {
       val path = SparkFiles.get("FileServerSuite.txt")
       val in = new BufferedReader(new FileReader(path))
@@ -148,13 +148,13 @@ class FileServerSuite extends FunSuite with LocalSparkContext {
       in.close()
       _ * fileVal + _ * fileVal
     }.collect()
-    assert(result.toSet === Set((1,200), (2,300), (3,500)))
+    assert(result.toSet === Set((1, 200), (2, 300), (3, 500)))
   }
 
   test ("Dynamically adding JARS on a standalone cluster") {
     sc = new SparkContext("local-cluster[1,1,512]", "test", newConf)
     sc.addJar(tmpJarUrl)
-    val testData = Array((1,1))
+    val testData = Array((1, 1))
     sc.parallelize(testData).foreach { x =>
       if (Thread.currentThread.getContextClassLoader.getResource("FileServerSuite.txt") == null) {
         throw new SparkException("jar not added")
@@ -165,7 +165,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext {
   test ("Dynamically adding JARS on a standalone cluster using local: URL") {
     sc = new SparkContext("local-cluster[1,1,512]", "test", newConf)
     sc.addJar(tmpJarUrl.replace("file", "local"))
-    val testData = Array((1,1))
+    val testData = Array((1, 1))
     sc.parallelize(testData).foreach { x =>
       if (Thread.currentThread.getContextClassLoader.getResource("FileServerSuite.txt") == null) {
         throw new SparkException("jar not added")
diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala
index c8f08eed47c76..d67de8692df62 100644
--- a/core/src/test/scala/org/apache/spark/FileSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FileSuite.scala
@@ -334,7 +334,7 @@ class FileSuite extends FunSuite with LocalSparkContext {
     }
     val copyRdd = mappedRdd.flatMap {
       curData: (String, PortableDataStream) =>
-        for(i <- 1 to numOfCopies) yield (i, curData._2)
+        for (i <- 1 to numOfCopies) yield (i, curData._2)
     }
 
     val copyArr: Array[(Int, PortableDataStream)] = copyRdd.collect()
diff --git a/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala b/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
index 51348c039b5c9..69314deda1f03 100644
--- a/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
@@ -44,11 +44,11 @@ private object ImplicitOrderingSuite {
   class NonOrderedClass {}
 
   class ComparableClass extends Comparable[ComparableClass] {
-    override def compareTo(o: ComparableClass): Int = ???
+    override def compareTo(o: ComparableClass): Int = throw new UnsupportedOperationException
   }
 
   class OrderedClass extends Ordered[OrderedClass] {
-    override def compare(o: OrderedClass): Int = ???
+    override def compare(o: OrderedClass): Int = throw new UnsupportedOperationException
   }
   
   def basicMapExpectations(rdd: RDD[Int]): List[(Boolean, String)] = {
diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
index fafa4ed606b08..fafc9d47503b7 100644
--- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
@@ -34,18 +34,18 @@ class SparkConfSuite extends FunSuite with LocalSparkContext with ResetSystemPro
     val conf = new SparkConf()
     // Simply exercise the API, we don't need a complete conversion test since that's handled in
     // UtilsSuite.scala
-    assert(conf.getSizeAsBytes("fake","1k") === ByteUnit.KiB.toBytes(1))
-    assert(conf.getSizeAsKb("fake","1k") === ByteUnit.KiB.toKiB(1))
-    assert(conf.getSizeAsMb("fake","1k") === ByteUnit.KiB.toMiB(1))
-    assert(conf.getSizeAsGb("fake","1k") === ByteUnit.KiB.toGiB(1))
+    assert(conf.getSizeAsBytes("fake", "1k") === ByteUnit.KiB.toBytes(1))
+    assert(conf.getSizeAsKb("fake", "1k") === ByteUnit.KiB.toKiB(1))
+    assert(conf.getSizeAsMb("fake", "1k") === ByteUnit.KiB.toMiB(1))
+    assert(conf.getSizeAsGb("fake", "1k") === ByteUnit.KiB.toGiB(1))
   }
 
   test("Test timeString conversion") {
     val conf = new SparkConf()
     // Simply exercise the API, we don't need a complete conversion test since that's handled in
     // UtilsSuite.scala
-    assert(conf.getTimeAsMs("fake","1ms") === TimeUnit.MILLISECONDS.toMillis(1))
-    assert(conf.getTimeAsSeconds("fake","1000ms") === TimeUnit.MILLISECONDS.toSeconds(1000))
+    assert(conf.getTimeAsMs("fake", "1ms") === TimeUnit.MILLISECONDS.toMillis(1))
+    assert(conf.getTimeAsSeconds("fake", "1000ms") === TimeUnit.MILLISECONDS.toSeconds(1000))
   }
 
   test("loading from system properties") {
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
index 9049db7755358..31ef5cd75bd4a 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
@@ -222,8 +222,8 @@ class SparkContextSuite extends FunSuite with LocalSparkContext {
     val dir1 = Utils.createTempDir()
     val dir2 = Utils.createTempDir()
 
-    val dirpath1=dir1.getAbsolutePath
-    val dirpath2=dir2.getAbsolutePath
+    val dirpath1 = dir1.getAbsolutePath
+    val dirpath2 = dir2.getAbsolutePath
 
     // file1 and file2 are placed inside dir1, they are also used for
     // textFile, hadoopFile, and newAPIHadoopFile
@@ -235,11 +235,11 @@ class SparkContextSuite extends FunSuite with LocalSparkContext {
     val file4 = new File(dir2, "part-00001")
     val file5 = new File(dir2, "part-00002")
 
-    val filepath1=file1.getAbsolutePath
-    val filepath2=file2.getAbsolutePath
-    val filepath3=file3.getAbsolutePath
-    val filepath4=file4.getAbsolutePath
-    val filepath5=file5.getAbsolutePath
+    val filepath1 = file1.getAbsolutePath
+    val filepath2 = file2.getAbsolutePath
+    val filepath3 = file3.getAbsolutePath
+    val filepath4 = file4.getAbsolutePath
+    val filepath5 = file5.getAbsolutePath
 
 
     try {
diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
index 06e5f1cf6b96f..c38e306b6ac40 100644
--- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
+++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
@@ -286,7 +286,7 @@ class BroadcastSuite extends FunSuite with LocalSparkContext {
       assert(statuses.size === expectedNumBlocks)
     }
 
-    testUnpersistBroadcast(distributed, numSlaves,  torrentConf, afterCreation,
+    testUnpersistBroadcast(distributed, numSlaves, torrentConf, afterCreation,
       afterUsingBroadcast, afterUnpersist, removeFromDriver)
   }
 
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerArgumentsTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerArgumentsTest.scala
index 7cc2104281464..e432b8e94654a 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerArgumentsTest.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerArgumentsTest.scala
@@ -66,7 +66,7 @@ class WorkerArgumentsTest extends FunSuite {
       }
     }
     val conf = new MySparkConf()
-    val workerArgs =  new WorkerArguments(args, conf)
+    val workerArgs = new WorkerArguments(args, conf)
     assert(workerArgs.memory === 5120)
   }
 
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
index 450fba21f4b5c..93a779d5ce6f2 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
@@ -25,7 +25,7 @@ import org.scalatest.{Matchers, FunSuite}
 class WorkerSuite extends FunSuite with Matchers {
 
   def cmd(javaOpts: String*): Command = {
-    Command("", Seq.empty, Map.empty, Seq.empty, Seq.empty, Seq(javaOpts:_*))
+    Command("", Seq.empty, Map.empty, Seq.empty, Seq.empty, Seq(javaOpts : _*))
   }
   def conf(opts: (String, String)*): SparkConf = new SparkConf(loadDefaults = false).setAll(opts)
 
diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
index ef3e213f1fcce..60dba3b2d6719 100644
--- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
@@ -263,7 +263,7 @@ class InputOutputMetricsSuite extends FunSuite with SharedSparkContext
 
     val tmpRdd = sc.textFile(tmpFilePath, numPartitions)
 
-    val firstSize= runAndReturnBytesRead {
+    val firstSize = runAndReturnBytesRead {
       aRdd.count()
     }
     val secondSize = runAndReturnBytesRead {
@@ -433,10 +433,10 @@ class OldCombineTextRecordReaderWrapper(
 /**
  * Hadoop 2 has a version of this, but we can't use it for backwards compatibility
  */
-class NewCombineTextInputFormat extends NewCombineFileInputFormat[LongWritable,Text] {
+class NewCombineTextInputFormat extends NewCombineFileInputFormat[LongWritable, Text] {
   def createRecordReader(split: NewInputSplit, context: TaskAttemptContext)
   : NewRecordReader[LongWritable, Text] = {
-    new NewCombineFileRecordReader[LongWritable,Text](split.asInstanceOf[NewCombineFileSplit],
+    new NewCombineFileRecordReader[LongWritable, Text](split.asInstanceOf[NewCombineFileSplit],
       context, classOf[NewCombineTextRecordReaderWrapper])
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
index ca0d953d306d8..6564232986cfa 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
@@ -512,17 +512,17 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext {
   }
 
   test("lookup") {
-    val pairs = sc.parallelize(Array((1,2), (3,4), (5,6), (5,7)))
+    val pairs = sc.parallelize(Array((1, 2), (3, 4), (5, 6), (5, 7)))
 
     assert(pairs.partitioner === None)
     assert(pairs.lookup(1) === Seq(2))
-    assert(pairs.lookup(5) === Seq(6,7))
+    assert(pairs.lookup(5) === Seq(6, 7))
     assert(pairs.lookup(-1) === Seq())
 
   }
 
   test("lookup with partitioner") {
-    val pairs = sc.parallelize(Array((1,2), (3,4), (5,6), (5,7)))
+    val pairs = sc.parallelize(Array((1, 2), (3, 4), (5, 6), (5, 7)))
 
     val p = new Partitioner {
       def numPartitions: Int = 2
@@ -533,12 +533,12 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext {
 
     assert(shuffled.partitioner === Some(p))
     assert(shuffled.lookup(1) === Seq(2))
-    assert(shuffled.lookup(5) === Seq(6,7))
+    assert(shuffled.lookup(5) === Seq(6, 7))
     assert(shuffled.lookup(-1) === Seq())
   }
 
   test("lookup with bad partitioner") {
-    val pairs = sc.parallelize(Array((1,2), (3,4), (5,6), (5,7)))
+    val pairs = sc.parallelize(Array((1, 2), (3, 4), (5, 6), (5, 7)))
 
     val p = new Partitioner {
       def numPartitions: Int = 2
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index afc11bdc4d6ab..8079d5dcaea81 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -338,10 +338,10 @@ class RDDSuite extends FunSuite with SharedSparkContext {
   }
 
   test("coalesced RDDs with locality") {
-    val data3 = sc.makeRDD(List((1,List("a","c")), (2,List("a","b","c")), (3,List("b"))))
+    val data3 = sc.makeRDD(List((1, List("a", "c")), (2, List("a", "b", "c")), (3, List("b"))))
     val coal3 = data3.coalesce(3)
     val list3 = coal3.partitions.flatMap(_.asInstanceOf[CoalescedRDDPartition].preferredLocation)
-    assert(list3.sorted === Array("a","b","c"), "Locality preferences are dropped")
+    assert(list3.sorted === Array("a", "b", "c"), "Locality preferences are dropped")
 
     // RDD with locality preferences spread (non-randomly) over 6 machines, m0 through m5
     val data = sc.makeRDD((1 to 9).map(i => (i, (i to (i + 2)).map{ j => "m" + (j%6)})))
@@ -591,8 +591,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
     assert(sc.emptyRDD.isEmpty())
     assert(sc.parallelize(Seq[Int]()).isEmpty())
     assert(!sc.parallelize(Seq(1)).isEmpty())
-    assert(sc.parallelize(Seq(1,2,3), 3).filter(_ < 0).isEmpty())
-    assert(!sc.parallelize(Seq(1,2,3), 3).filter(_ > 1).isEmpty())
+    assert(sc.parallelize(Seq(1, 2, 3), 3).filter(_ < 0).isEmpty())
+    assert(!sc.parallelize(Seq(1, 2, 3), 3).filter(_ > 1).isEmpty())
   }
 
   test("sample preserves partitioner") {
@@ -609,49 +609,49 @@ class RDDSuite extends FunSuite with SharedSparkContext {
     val data = sc.parallelize(1 to n, 2)
 
     for (num <- List(5, 20, 100)) {
-      val sample = data.takeSample(withReplacement=false, num=num)
+      val sample = data.takeSample(withReplacement = false, num = num)
       assert(sample.size === num)        // Got exactly num elements
       assert(sample.toSet.size === num)  // Elements are distinct
       assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]")
     }
     for (seed <- 1 to 5) {
-      val sample = data.takeSample(withReplacement=false, 20, seed)
+      val sample = data.takeSample(withReplacement = false, 20, seed)
       assert(sample.size === 20)        // Got exactly 20 elements
       assert(sample.toSet.size === 20)  // Elements are distinct
       assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]")
     }
     for (seed <- 1 to 5) {
-      val sample = data.takeSample(withReplacement=false, 100, seed)
+      val sample = data.takeSample(withReplacement = false, 100, seed)
       assert(sample.size === 100)        // Got only 100 elements
       assert(sample.toSet.size === 100)  // Elements are distinct
       assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]")
     }
     for (seed <- 1 to 5) {
-      val sample = data.takeSample(withReplacement=true, 20, seed)
+      val sample = data.takeSample(withReplacement = true, 20, seed)
       assert(sample.size === 20)        // Got exactly 20 elements
       assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]")
     }
     {
-      val sample = data.takeSample(withReplacement=true, num=20)
+      val sample = data.takeSample(withReplacement = true, num = 20)
       assert(sample.size === 20)        // Got exactly 100 elements
       assert(sample.toSet.size <= 20, "sampling with replacement returned all distinct elements")
       assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]")
     }
     {
-      val sample = data.takeSample(withReplacement=true, num=n)
+      val sample = data.takeSample(withReplacement = true, num = n)
       assert(sample.size === n)        // Got exactly 100 elements
       // Chance of getting all distinct elements is astronomically low, so test we got < 100
       assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements")
       assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]")
     }
     for (seed <- 1 to 5) {
-      val sample = data.takeSample(withReplacement=true, n, seed)
+      val sample = data.takeSample(withReplacement = true, n, seed)
       assert(sample.size === n)        // Got exactly 100 elements
       // Chance of getting all distinct elements is astronomically low, so test we got < 100
       assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements")
     }
     for (seed <- 1 to 5) {
-      val sample = data.takeSample(withReplacement=true, 2 * n, seed)
+      val sample = data.takeSample(withReplacement = true, 2 * n, seed)
       assert(sample.size === 2 * n)        // Got exactly 200 elements
       // Chance of getting all distinct elements is still quite low, so test we got < 100
       assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements")
@@ -691,7 +691,7 @@ class RDDSuite extends FunSuite with SharedSparkContext {
   }
 
   test("sortByKey") {
-    val data = sc.parallelize(Seq("5|50|A","4|60|C", "6|40|B"))
+    val data = sc.parallelize(Seq("5|50|A", "4|60|C", "6|40|B"))
 
     val col1 = Array("4|60|C", "5|50|A", "6|40|B")
     val col2 = Array("6|40|B", "5|50|A", "4|60|C")
@@ -703,7 +703,7 @@ class RDDSuite extends FunSuite with SharedSparkContext {
   }
 
   test("sortByKey ascending parameter") {
-    val data = sc.parallelize(Seq("5|50|A","4|60|C", "6|40|B"))
+    val data = sc.parallelize(Seq("5|50|A", "4|60|C", "6|40|B"))
 
     val asc = Array("4|60|C", "5|50|A", "6|40|B")
     val desc = Array("6|40|B", "5|50|A", "4|60|C")
@@ -764,9 +764,9 @@ class RDDSuite extends FunSuite with SharedSparkContext {
   }
 
   test("intersection strips duplicates in an input") {
-    val a = sc.parallelize(Seq(1,2,3,3))
-    val b = sc.parallelize(Seq(1,1,2,3))
-    val intersection = Array(1,2,3)
+    val a = sc.parallelize(Seq(1, 2, 3, 3))
+    val b = sc.parallelize(Seq(1, 1, 2, 3))
+    val intersection = Array(1, 2, 3)
 
     assert(a.intersection(b).collect().sorted === intersection)
     assert(b.intersection(a).collect().sorted === intersection)
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuiteUtils.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuiteUtils.scala
index fe695d85e29dd..194dc45d6e399 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuiteUtils.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuiteUtils.scala
@@ -21,11 +21,11 @@ object RDDSuiteUtils {
   case class Person(first: String, last: String, age: Int)
 
   object AgeOrdering extends Ordering[Person] {
-    def compare(a:Person, b:Person): Int = a.age.compare(b.age)
+    def compare(a: Person, b: Person): Int = a.age.compare(b.age)
   }
 
   object NameOrdering extends Ordering[Person] {
-    def compare(a:Person, b:Person): Int =
-      implicitly[Ordering[Tuple2[String,String]]].compare((a.last, a.first), (b.last, b.first))
+    def compare(a: Person, b: Person): Int =
+      implicitly[Ordering[Tuple2[String, String]]].compare((a.last, a.first), (b.last, b.first))
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala
index 64b1c24c47168..54fc914722b46 100644
--- a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala
@@ -26,7 +26,7 @@ class SortingSuite extends FunSuite with SharedSparkContext with Matchers with L
 
   test("sortByKey") {
     val pairs = sc.parallelize(Array((1, 0), (2, 0), (0, 0), (3, 0)), 2)
-    assert(pairs.sortByKey().collect() === Array((0,0), (1,0), (2,0), (3,0)))
+    assert(pairs.sortByKey().collect() === Array((0, 0), (1, 0), (2, 0), (3, 0)))
   }
 
   test("large array") {
@@ -136,7 +136,7 @@ class SortingSuite extends FunSuite with SharedSparkContext with Matchers with L
 
   test("get a range of elements in an array not partitioned by a range partitioner") {
     val pairArr = util.Random.shuffle((1 to 1000).toList).map(x => (x, x))
-    val pairs = sc.parallelize(pairArr,10)
+    val pairs = sc.parallelize(pairArr, 10)
     val range = pairs.filterByRange(200, 800).collect()
     assert((800 to 200 by -1).toArray.sorted === range.map(_._1).sorted)
   }
diff --git a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
index ae3339d80f9c6..21eb71d9acfbd 100644
--- a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
@@ -42,7 +42,7 @@ abstract class RpcEnvSuite extends FunSuite with BeforeAndAfterAll {
   }
 
   override def afterAll(): Unit = {
-    if(env != null) {
+    if (env != null) {
       env.shutdown()
     }
   }
@@ -75,7 +75,7 @@ abstract class RpcEnvSuite extends FunSuite with BeforeAndAfterAll {
       }
     })
 
-    val anotherEnv = createRpcEnv(new SparkConf(), "remote" ,13345)
+    val anotherEnv = createRpcEnv(new SparkConf(), "remote", 13345)
     // Use anotherEnv to find out the RpcEndpointRef
     val rpcEndpointRef = anotherEnv.setupEndpointRef("local", env.address, "send-remotely")
     try {
@@ -338,7 +338,7 @@ abstract class RpcEnvSuite extends FunSuite with BeforeAndAfterAll {
 
   test("call receive in sequence") {
     // If a RpcEnv implementation breaks the `receive` contract, hope this test can expose it
-    for(i <- 0 until 100) {
+    for (i <- 0 until 100) {
       @volatile var result = 0
       val endpointRef = env.setupEndpoint(s"receive-in-sequence-$i", new ThreadSafeRpcEndpoint {
         override val rpcEnv = env
diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
index f77661ccbd1c5..3821166386fa6 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
@@ -26,8 +26,8 @@ class CoarseGrainedSchedulerBackendSuite extends FunSuite with LocalSparkContext
 
   test("serialized task larger than akka frame size") {
     val conf = new SparkConf
-    conf.set("spark.akka.frameSize","1")
-    conf.set("spark.default.parallelism","1")
+    conf.set("spark.akka.frameSize", "1")
+    conf.set("spark.default.parallelism", "1")
     sc = new SparkContext("local-cluster[2 , 1 , 512]", "test", conf)
     val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf)
     val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize))
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 46642236e454a..eea7a600841cc 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -375,7 +375,7 @@ class DAGSchedulerSuite
     (1 to 30).foreach(_ => rdd = rdd.zip(rdd))
     // getPreferredLocs runs quickly, indicating that exponential graph traversal is avoided.
     failAfter(10 seconds) {
-      val preferredLocs = scheduler.getPreferredLocs(rdd,0)
+      val preferredLocs = scheduler.getPreferredLocs(rdd, 0)
       // No preferred locations are returned.
       assert(preferredLocs.length === 0)
     }
@@ -634,8 +634,8 @@ class DAGSchedulerSuite
     val listener1 = new FailureRecordingJobListener()
     val listener2 = new FailureRecordingJobListener()
 
-    submit(reduceRdd1, Array(0, 1), listener=listener1)
-    submit(reduceRdd2, Array(0, 1), listener=listener2)
+    submit(reduceRdd1, Array(0, 1), listener = listener1)
+    submit(reduceRdd2, Array(0, 1), listener = listener2)
 
     val stageFailureMessage = "Exception failure in map stage"
     failed(taskSets(0), stageFailureMessage)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
index e8f461e2f56c9..456451b676bed 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
@@ -97,9 +97,9 @@ class PoolSuite extends FunSuite with LocalSparkContext {
     assert(rootPool.getSchedulableByName("3").weight === 1)
 
     val properties1 = new Properties()
-    properties1.setProperty("spark.scheduler.pool","1")
+    properties1.setProperty("spark.scheduler.pool", "1")
     val properties2 = new Properties()
-    properties2.setProperty("spark.scheduler.pool","2")
+    properties2.setProperty("spark.scheduler.pool", "2")
 
     val taskSetManager10 = createTaskSetManager(0, 1, taskScheduler)
     val taskSetManager11 = createTaskSetManager(1, 1, taskScheduler)
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
index ef50bc9438f95..14c0172fa96ab 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
@@ -109,7 +109,7 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext {
     check((1, 1))
     check((1, 1L))
     check((1L, 1))
-    check((1L,  1L))
+    check((1L, 1L))
     check((1.0, 1))
     check((1, 1.0))
     check((1.0, 1.0))
@@ -147,7 +147,7 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext {
     check(List(Some(mutable.HashMap(1->1, 2->2)), None, Some(mutable.HashMap(3->4))))
     check(List(
       mutable.HashMap("one" -> 1, "two" -> 2),
-      mutable.HashMap(1->"one",2->"two",3->"three")))
+      mutable.HashMap(1->"one", 2->"two", 3->"three")))
   }
 
   test("ranges") {
diff --git a/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala b/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
index 433fd6bb4a11d..673948d84d82b 100644
--- a/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
@@ -66,18 +66,18 @@ class ProactiveClosureSerializationSuite extends FunSuite with SharedSparkContex
   }
 
   private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = 
-    x.map(y=>uc.op(y))
+    x.map(y => uc.op(y))
 
   private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = 
-    x.flatMap(y=>Seq(uc.op(y)))
+    x.flatMap(y => Seq(uc.op(y)))
 
   private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = 
-    x.filter(y=>uc.pred(y))
+    x.filter(y => uc.pred(y))
 
   private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = 
-    x.mapPartitions(_.map(y=>uc.op(y)))
+    x.mapPartitions(_.map(y => uc.op(y)))
 
   private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = 
-    x.mapPartitionsWithIndex((_, it) => it.map(y=>uc.op(y)))
+    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
   
 }
diff --git a/core/src/test/scala/org/apache/spark/serializer/TestSerializer.scala b/core/src/test/scala/org/apache/spark/serializer/TestSerializer.scala
index 86fcf447287f7..c1e0a29a34bb1 100644
--- a/core/src/test/scala/org/apache/spark/serializer/TestSerializer.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/TestSerializer.scala
@@ -32,16 +32,19 @@ class TestSerializer extends Serializer {
 
 
 class TestSerializerInstance extends SerializerInstance {
-  override def serialize[T: ClassTag](t: T): ByteBuffer = ???
+  override def serialize[T: ClassTag](t: T): ByteBuffer = throw new UnsupportedOperationException
 
-  override def serializeStream(s: OutputStream): SerializationStream = ???
+  override def serializeStream(s: OutputStream): SerializationStream =
+    throw new UnsupportedOperationException
 
   override def deserializeStream(s: InputStream): TestDeserializationStream =
     new TestDeserializationStream
 
-  override def deserialize[T: ClassTag](bytes: ByteBuffer): T = ???
+  override def deserialize[T: ClassTag](bytes: ByteBuffer): T =
+    throw new UnsupportedOperationException
 
-  override def deserialize[T: ClassTag](bytes: ByteBuffer, loader: ClassLoader): T = ???
+  override def deserialize[T: ClassTag](bytes: ByteBuffer, loader: ClassLoader): T =
+    throw new UnsupportedOperationException
 }
 
 
diff --git a/core/src/test/scala/org/apache/spark/storage/FlatmapIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/FlatmapIteratorSuite.scala
index bcf138b5ee6d0..47341b74e9c0f 100644
--- a/core/src/test/scala/org/apache/spark/storage/FlatmapIteratorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/FlatmapIteratorSuite.scala
@@ -59,10 +59,10 @@ class FlatmapIteratorSuite extends FunSuite with LocalSparkContext {
       .set("spark.serializer.objectStreamReset", "10")
     sc = new SparkContext(sconf)
     val expand_size = 500
-    val data = sc.parallelize(Seq(1,2)).
+    val data = sc.parallelize(Seq(1, 2)).
       flatMap(x => Stream.range(1, expand_size).
-      map(y => "%d: string test %d".format(y,x)))
-    var persisted = data.persist(StorageLevel.MEMORY_ONLY_SER)
+      map(y => "%d: string test %d".format(y, x)))
+    val persisted = data.persist(StorageLevel.MEMORY_ONLY_SER)
     assert(persisted.filter(_.startsWith("1:")).count()===2)
   }
 
diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
index b6f5accef0cef..a727a43f44dfc 100644
--- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
@@ -483,11 +483,11 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before
       val jobsJson = getJson(sc.ui.get, "jobs")
       jobsJson.children.size should be (expJobInfo.size)
       for {
-        (job @ JObject(_),idx) <- jobsJson.children.zipWithIndex
+        (job @ JObject(_), idx) <- jobsJson.children.zipWithIndex
         id = (job \ "jobId").extract[String]
         name = (job \ "name").extract[String]
       } {
-        withClue(s"idx = $idx; id = $id; name = ${name.substring(0,20)}") {
+        withClue(s"idx = $idx; id = $id; name = ${name.substring(0, 20)}") {
           id should be (expJobInfo(idx)._1)
           name should include (expJobInfo(idx)._2)
         }
@@ -540,12 +540,12 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before
 
       goToUi(sc, "/stages/stage/?id=12&attempt=0")
       find("no-info").get.text should be ("No information to display for Stage 12 (Attempt 0)")
-      val badStage = HistoryServerSuite.getContentAndCode(apiUrl(sc.ui.get,"stages/12/0"))
+      val badStage = HistoryServerSuite.getContentAndCode(apiUrl(sc.ui.get, "stages/12/0"))
       badStage._1 should be (HttpServletResponse.SC_NOT_FOUND)
       badStage._2 should be (None)
       badStage._3 should be (Some("unknown stage: 12"))
 
-      val badAttempt = HistoryServerSuite.getContentAndCode(apiUrl(sc.ui.get,"stages/19/15"))
+      val badAttempt = HistoryServerSuite.getContentAndCode(apiUrl(sc.ui.get, "stages/19/15"))
       badAttempt._1 should be (HttpServletResponse.SC_NOT_FOUND)
       badAttempt._2 should be (None)
       badAttempt._3 should be (Some("unknown attempt for stage 19.  Found attempts: [0]"))
diff --git a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
index 7b38e6d9473e1..8778042e34657 100644
--- a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
@@ -169,7 +169,7 @@ class StorageTabSuite extends FunSuite with BeforeAndAfter {
   test("verify StorageTab contains all cached rdds") {
 
     val rddInfo0 = new RDDInfo(0, "rdd0", 1, memOnly, Seq(4))
-    val rddInfo1 = new RDDInfo(1, "rdd1", 1 ,memOnly, Seq(4))
+    val rddInfo1 = new RDDInfo(1, "rdd1", 1, memOnly, Seq(4))
     val stageInfo0 = new StageInfo(0, 0, "stage0", 1, Seq(rddInfo0), Seq.empty, "details")
     val stageInfo1 = new StageInfo(1, 0, "stage1", 1, Seq(rddInfo1), Seq.empty, "details")
     val taskMetrics0 = new TaskMetrics
diff --git a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
index bec79fc4dc8f7..ccdb3f571429d 100644
--- a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
@@ -138,7 +138,7 @@ class AkkaUtilsSuite extends FunSuite with LocalSparkContext with ResetSystemPro
 
     assert(securityManagerGood.isAuthenticationEnabled() === true)
 
-    val slaveRpcEnv =RpcEnv.create("spark-slave", hostname, 0, goodconf, securityManagerGood)
+    val slaveRpcEnv = RpcEnv.create("spark-slave", hostname, 0, goodconf, securityManagerGood)
     val slaveTracker = new MapOutputTrackerWorker(conf)
     slaveTracker.trackerEndpoint =
       slaveRpcEnv.setupEndpointRef("spark", rpcEnv.address, MapOutputTracker.ENDPOINT_NAME)
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 61152c29a681f..afa5cdc819746 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -551,7 +551,7 @@ class UtilsSuite extends FunSuite with ResetSystemProperties with Logging {
   test("fetch hcfs dir") {
     val tempDir = Utils.createTempDir()
     val sourceDir = new File(tempDir, "source-dir")
-    val innerSourceDir = Utils.createTempDir(root=sourceDir.getPath)
+    val innerSourceDir = Utils.createTempDir(root = sourceDir.getPath)
     val sourceFile = File.createTempFile("someprefix", "somesuffix", innerSourceDir)
     val targetDir = new File(tempDir, "target-dir")
     Files.write("some text", sourceFile, UTF_8)
diff --git a/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
index b85a409a4b2e9..ffc206991906a 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
@@ -94,7 +94,7 @@ class BitSetSuite extends FunSuite {
 
   test( "xor len(bitsetX) > len(bitsetY)" ) {
     val setBitsX = Seq( 0, 1, 3, 37, 38, 41, 85)
-    val setBitsY   = Seq( 0, 2, 3, 37, 41 )
+    val setBitsY = Seq( 0, 2, 3, 37, 41)
     val bitsetX = new BitSet(100)
     setBitsX.foreach( i => bitsetX.set(i))
     val bitsetY = new BitSet(60)

From b069ad23d9b6cbfb3a8bf245547add4816669075 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 20:17:16 -0700
Subject: [PATCH 229/525] [SPARK-7927] whitespace fixes for GraphX.

So we can enable a whitespace enforcement rule in the style checker to save code review time.

Author: Reynold Xin <rxin@databricks.com>

Closes #6474 from rxin/whitespace-graphx and squashes the following commits:

4d3cd26 [Reynold Xin] Fixed tests.
869dde4 [Reynold Xin] [SPARK-7927] whitespace fixes for GraphX.
---
 .../org/apache/spark/graphx/EdgeDirection.scala   |  4 ++--
 .../org/apache/spark/graphx/EdgeTriplet.scala     |  2 +-
 .../scala/org/apache/spark/graphx/Graph.scala     |  2 +-
 .../scala/org/apache/spark/graphx/GraphOps.scala  | 10 +++++-----
 .../scala/org/apache/spark/graphx/Pregel.scala    |  8 ++++----
 .../apache/spark/graphx/impl/EdgePartition.scala  |  4 ++--
 .../org/apache/spark/graphx/lib/PageRank.scala    |  8 ++++----
 .../org/apache/spark/graphx/lib/SVDPlusPlus.scala |  2 +-
 .../apache/spark/graphx/lib/TriangleCount.scala   |  4 ++--
 .../spark/graphx/util/GraphGenerators.scala       |  9 +++++----
 .../org/apache/spark/graphx/GraphOpsSuite.scala   |  6 +++---
 .../org/apache/spark/graphx/GraphSuite.scala      |  6 +++---
 .../graphx/lib/ConnectedComponentsSuite.scala     | 15 +++++++++------
 .../apache/spark/graphx/lib/PageRankSuite.scala   | 14 +++++++-------
 .../spark/graphx/lib/TriangleCountSuite.scala     |  2 +-
 15 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/graphx/src/main/scala/org/apache/spark/graphx/EdgeDirection.scala b/graphx/src/main/scala/org/apache/spark/graphx/EdgeDirection.scala
index 058c8c8aa1b24..ce1054ed92ba1 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/EdgeDirection.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/EdgeDirection.scala
@@ -26,8 +26,8 @@ class EdgeDirection private (private val name: String) extends Serializable {
    * out becomes in and both and either remain the same.
    */
   def reverse: EdgeDirection = this match {
-    case EdgeDirection.In   => EdgeDirection.Out
-    case EdgeDirection.Out  => EdgeDirection.In
+    case EdgeDirection.In => EdgeDirection.Out
+    case EdgeDirection.Out => EdgeDirection.In
     case EdgeDirection.Either => EdgeDirection.Either
     case EdgeDirection.Both => EdgeDirection.Both
   }
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala b/graphx/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala
index c8790cac3d8a0..65f82429d2029 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala
@@ -37,7 +37,7 @@ class EdgeTriplet[VD, ED] extends Edge[ED] {
   /**
    * Set the edge properties of this triplet.
    */
-  protected[spark] def set(other: Edge[ED]): EdgeTriplet[VD,ED] = {
+  protected[spark] def set(other: Edge[ED]): EdgeTriplet[VD, ED] = {
     srcId = other.srcId
     dstId = other.dstId
     attr = other.attr
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala
index 36dc7b0f86c89..db73a8abc5733 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala
@@ -316,7 +316,7 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab
    * satisfy the predicates
    */
   def subgraph(
-      epred: EdgeTriplet[VD,ED] => Boolean = (x => true),
+      epred: EdgeTriplet[VD, ED] => Boolean = (x => true),
       vpred: (VertexId, VD) => Boolean = ((v, d) => true))
     : Graph[VD, ED]
 
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala
index 7edd627b20918..9451ff1e5c0e2 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala
@@ -124,18 +124,18 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali
   def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexId, VD)]] = {
     val nbrs = edgeDirection match {
       case EdgeDirection.Either =>
-        graph.aggregateMessages[Array[(VertexId,VD)]](
+        graph.aggregateMessages[Array[(VertexId, VD)]](
           ctx => {
             ctx.sendToSrc(Array((ctx.dstId, ctx.dstAttr)))
             ctx.sendToDst(Array((ctx.srcId, ctx.srcAttr)))
           },
           (a, b) => a ++ b, TripletFields.All)
       case EdgeDirection.In =>
-        graph.aggregateMessages[Array[(VertexId,VD)]](
+        graph.aggregateMessages[Array[(VertexId, VD)]](
           ctx => ctx.sendToDst(Array((ctx.srcId, ctx.srcAttr))),
           (a, b) => a ++ b, TripletFields.Src)
       case EdgeDirection.Out =>
-        graph.aggregateMessages[Array[(VertexId,VD)]](
+        graph.aggregateMessages[Array[(VertexId, VD)]](
           ctx => ctx.sendToSrc(Array((ctx.dstId, ctx.dstAttr))),
           (a, b) => a ++ b, TripletFields.Dst)
       case EdgeDirection.Both =>
@@ -253,7 +253,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali
   def filter[VD2: ClassTag, ED2: ClassTag](
       preprocess: Graph[VD, ED] => Graph[VD2, ED2],
       epred: (EdgeTriplet[VD2, ED2]) => Boolean = (x: EdgeTriplet[VD2, ED2]) => true,
-      vpred: (VertexId, VD2) => Boolean = (v:VertexId, d:VD2) => true): Graph[VD, ED] = {
+      vpred: (VertexId, VD2) => Boolean = (v: VertexId, d: VD2) => true): Graph[VD, ED] = {
     graph.mask(preprocess(graph).subgraph(epred, vpred))
   }
 
@@ -356,7 +356,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali
       maxIterations: Int = Int.MaxValue,
       activeDirection: EdgeDirection = EdgeDirection.Either)(
       vprog: (VertexId, VD, A) => VD,
-      sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId,A)],
+      sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)],
       mergeMsg: (A, A) => A)
     : Graph[VD, ED] = {
     Pregel(graph, initialMsg, maxIterations, activeDirection)(vprog, sendMsg, mergeMsg)
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala
index 01b013ff716fc..cfcf7244eaed5 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala
@@ -147,10 +147,10 @@ object Pregel extends Logging {
       logInfo("Pregel finished iteration " + i)
 
       // Unpersist the RDDs hidden by newly-materialized RDDs
-      oldMessages.unpersist(blocking=false)
-      newVerts.unpersist(blocking=false)
-      prevG.unpersistVertices(blocking=false)
-      prevG.edges.unpersist(blocking=false)
+      oldMessages.unpersist(blocking = false)
+      newVerts.unpersist(blocking = false)
+      prevG.unpersistVertices(blocking = false)
+      prevG.edges.unpersist(blocking = false)
       // count the iteration
       i += 1
     }
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala
index c561570809253..ab021a252eb8a 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala
@@ -156,8 +156,8 @@ class EdgePartition[
     val size = data.size
     var i = 0
     while (i < size) {
-      edge.srcId  = srcIds(i)
-      edge.dstId  = dstIds(i)
+      edge.srcId = srcIds(i)
+      edge.dstId = dstIds(i)
       edge.attr = data(i)
       newData(i) = f(edge)
       i += 1
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala
index bc974b2f04e70..8c0a461e99fa4 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala
@@ -116,7 +116,7 @@ object PageRank extends Logging {
 
     val personalized = srcId isDefined
     val src: VertexId = srcId.getOrElse(-1L)
-    def delta(u: VertexId, v: VertexId):Double = { if (u == v) 1.0 else 0.0 }
+    def delta(u: VertexId, v: VertexId): Double = { if (u == v) 1.0 else 0.0 }
 
     var iteration = 0
     var prevRankGraph: Graph[Double, Double] = null
@@ -133,13 +133,13 @@ object PageRank extends Logging {
       // edge partitions.
       prevRankGraph = rankGraph
       val rPrb = if (personalized) {
-        (src: VertexId ,id: VertexId) => resetProb * delta(src,id)
+        (src: VertexId , id: VertexId) => resetProb * delta(src, id)
       } else {
         (src: VertexId, id: VertexId) => resetProb
       }
 
       rankGraph = rankGraph.joinVertices(rankUpdates) {
-        (id, oldRank, msgSum) => rPrb(src,id) + (1.0 - resetProb) * msgSum
+        (id, oldRank, msgSum) => rPrb(src, id) + (1.0 - resetProb) * msgSum
       }.cache()
 
       rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices
@@ -243,7 +243,7 @@ object PageRank extends Logging {
 
     // Execute a dynamic version of Pregel.
     val vp = if (personalized) {
-      (id: VertexId, attr: (Double, Double),msgSum: Double) =>
+      (id: VertexId, attr: (Double, Double), msgSum: Double) =>
         personalizedVertexProgram(id, attr, msgSum)
     } else {
       (id: VertexId, attr: (Double, Double), msgSum: Double) =>
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala
index 3b0e1628d86b5..9cb24ed080e1c 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala
@@ -210,7 +210,7 @@ object SVDPlusPlus {
   /**
    * Forces materialization of a Graph by count()ing its RDDs.
    */
-  private def materialize(g: Graph[_,_]): Unit = {
+  private def materialize(g: Graph[_, _]): Unit = {
     g.vertices.count()
     g.edges.count()
   }
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala
index daf162085e3e4..a5d598053f9ca 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala
@@ -38,7 +38,7 @@ import org.apache.spark.graphx._
  */
 object TriangleCount {
 
-  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD,ED]): Graph[Int, ED] = {
+  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Graph[Int, ED] = {
     // Remove redundant edges
     val g = graph.groupEdges((a, b) => a).cache()
 
@@ -49,7 +49,7 @@ object TriangleCount {
         var i = 0
         while (i < nbrs.size) {
           // prevent self cycle
-          if(nbrs(i) != vid) {
+          if (nbrs(i) != vid) {
             set.add(nbrs(i))
           }
           i += 1
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala
index 2d6a825b61726..9591c4e9b8f4e 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala
@@ -243,14 +243,15 @@ object GraphGenerators {
    * @return A graph containing vertices with the row and column ids
    * as their attributes and edge values as 1.0.
    */
-  def gridGraph(sc: SparkContext, rows: Int, cols: Int): Graph[(Int,Int), Double] = {
+  def gridGraph(sc: SparkContext, rows: Int, cols: Int): Graph[(Int, Int), Double] = {
     // Convert row column address into vertex ids (row major order)
     def sub2ind(r: Int, c: Int): VertexId = r * cols + c
 
-    val vertices: RDD[(VertexId, (Int,Int))] =
-      sc.parallelize(0 until rows).flatMap( r => (0 until cols).map( c => (sub2ind(r,c), (r,c)) ) )
+    val vertices: RDD[(VertexId, (Int, Int))] = sc.parallelize(0 until rows).flatMap { r =>
+      (0 until cols).map( c => (sub2ind(r, c), (r, c)) )
+    }
     val edges: RDD[Edge[Double]] =
-      vertices.flatMap{ case (vid, (r,c)) =>
+      vertices.flatMap{ case (vid, (r, c)) =>
         (if (r + 1 < rows) { Seq( (sub2ind(r, c), sub2ind(r + 1, c))) } else { Seq.empty }) ++
         (if (c + 1 < cols) { Seq( (sub2ind(r, c), sub2ind(r, c + 1))) } else { Seq.empty })
       }.map{ case (src, dst) => Edge(src, dst, 1.0) }
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
index 9bc8007ce49cd..68fe83739e399 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
@@ -59,7 +59,7 @@ class GraphOpsSuite extends FunSuite with LocalSparkContext {
   test ("filter") {
     withSpark { sc =>
       val n = 5
-      val vertices = sc.parallelize((0 to n).map(x => (x:VertexId, x)))
+      val vertices = sc.parallelize((0 to n).map(x => (x: VertexId, x)))
       val edges = sc.parallelize((1 to n).map(x => Edge(0, x, x)))
       val graph: Graph[Int, Int] = Graph(vertices, edges).cache()
       val filteredGraph = graph.filter(
@@ -67,11 +67,11 @@ class GraphOpsSuite extends FunSuite with LocalSparkContext {
           val degrees: VertexRDD[Int] = graph.outDegrees
           graph.outerJoinVertices(degrees) {(vid, data, deg) => deg.getOrElse(0)}
         },
-        vpred = (vid: VertexId, deg:Int) => deg > 0
+        vpred = (vid: VertexId, deg: Int) => deg > 0
       ).cache()
 
       val v = filteredGraph.vertices.collect().toSet
-      assert(v === Set((0,0)))
+      assert(v === Set((0, 0)))
 
       // the map is necessary because of object-reuse in the edge iterator
       val e = filteredGraph.edges.map(e => Edge(e.srcId, e.dstId, e.attr)).collect().toSet
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala
index a570e4ed75fc3..2b1d8e47326f8 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala
@@ -248,7 +248,7 @@ class GraphSuite extends FunSuite with LocalSparkContext {
   test("mask") {
     withSpark { sc =>
       val n = 5
-      val vertices = sc.parallelize((0 to n).map(x => (x:VertexId, x)))
+      val vertices = sc.parallelize((0 to n).map(x => (x: VertexId, x)))
       val edges = sc.parallelize((1 to n).map(x => Edge(0, x, x)))
       val graph: Graph[Int, Int] = Graph(vertices, edges).cache()
 
@@ -260,11 +260,11 @@ class GraphSuite extends FunSuite with LocalSparkContext {
       val projectedGraph = graph.mask(subgraph)
 
       val v = projectedGraph.vertices.collect().toSet
-      assert(v === Set((0,0), (1,1), (2,2), (4,4), (5,5)))
+      assert(v === Set((0, 0), (1, 1), (2, 2), (4, 4), (5, 5)))
 
       // the map is necessary because of object-reuse in the edge iterator
       val e = projectedGraph.edges.map(e => Edge(e.srcId, e.dstId, e.attr)).collect().toSet
-      assert(e === Set(Edge(0,1,1), Edge(0,2,2), Edge(0,5,5)))
+      assert(e === Set(Edge(0, 1, 1), Edge(0, 2, 2), Edge(0, 5, 5)))
 
     }
   }
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala
index 4cc30a96408f8..accccfc232cd3 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala
@@ -52,13 +52,16 @@ class ConnectedComponentsSuite extends FunSuite with LocalSparkContext {
     withSpark { sc =>
       val chain1 = (0 until 9).map(x => (x, x + 1))
       val chain2 = (10 until 20).map(x => (x, x + 1))
-      val rawEdges = sc.parallelize(chain1 ++ chain2, 3).map { case (s,d) => (s.toLong, d.toLong) }
+      val rawEdges = sc.parallelize(chain1 ++ chain2, 3).map { case (s, d) => (s.toLong, d.toLong) }
       val twoChains = Graph.fromEdgeTuples(rawEdges, 1.0)
       val ccGraph = twoChains.connectedComponents()
       val vertices = ccGraph.vertices.collect()
       for ( (id, cc) <- vertices ) {
-        if(id < 10) { assert(cc === 0) }
-        else { assert(cc === 10) }
+        if (id < 10) {
+          assert(cc === 0)
+        } else {
+          assert(cc === 10)
+        }
       }
       val ccMap = vertices.toMap
       for (id <- 0 until 20) {
@@ -75,7 +78,7 @@ class ConnectedComponentsSuite extends FunSuite with LocalSparkContext {
     withSpark { sc =>
       val chain1 = (0 until 9).map(x => (x, x + 1))
       val chain2 = (10 until 20).map(x => (x, x + 1))
-      val rawEdges = sc.parallelize(chain1 ++ chain2, 3).map { case (s,d) => (s.toLong, d.toLong) }
+      val rawEdges = sc.parallelize(chain1 ++ chain2, 3).map { case (s, d) => (s.toLong, d.toLong) }
       val twoChains = Graph.fromEdgeTuples(rawEdges, true).reverse
       val ccGraph = twoChains.connectedComponents()
       val vertices = ccGraph.vertices.collect()
@@ -106,9 +109,9 @@ class ConnectedComponentsSuite extends FunSuite with LocalSparkContext {
                        (4L, ("peter", "student"))))
       // Create an RDD for edges
       val relationships: RDD[Edge[String]] =
-        sc.parallelize(Array(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
+        sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
                        Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"),
-                       Edge(4L, 0L, "student"),   Edge(5L, 0L, "colleague")))
+                       Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague")))
       // Edges are:
       //   2 ---> 5 ---> 3
       //          | \
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala
index 3f3c9dfd7b3dd..39c6ace912b00 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala
@@ -31,14 +31,14 @@ object GridPageRank {
     def sub2ind(r: Int, c: Int): Int = r * nCols + c
     // Make the grid graph
     for (r <- 0 until nRows; c <- 0 until nCols) {
-      val ind = sub2ind(r,c)
+      val ind = sub2ind(r, c)
       if (r + 1 < nRows) {
         outDegree(ind) += 1
-        inNbrs(sub2ind(r + 1,c)) += ind
+        inNbrs(sub2ind(r + 1, c)) += ind
       }
       if (c + 1 < nCols) {
         outDegree(ind) += 1
-        inNbrs(sub2ind(r,c + 1)) += ind
+        inNbrs(sub2ind(r, c + 1)) += ind
       }
     }
     // compute the pagerank
@@ -99,8 +99,8 @@ class PageRankSuite extends FunSuite with LocalSparkContext {
       val resetProb = 0.15
       val errorTol = 1.0e-5
 
-      val staticRanks1 = starGraph.staticPersonalizedPageRank(0,numIter = 1, resetProb).vertices
-      val staticRanks2 = starGraph.staticPersonalizedPageRank(0,numIter = 2, resetProb)
+      val staticRanks1 = starGraph.staticPersonalizedPageRank(0, numIter = 1, resetProb).vertices
+      val staticRanks2 = starGraph.staticPersonalizedPageRank(0, numIter = 2, resetProb)
         .vertices.cache()
 
       // Static PageRank should only take 2 iterations to converge
@@ -117,7 +117,7 @@ class PageRankSuite extends FunSuite with LocalSparkContext {
       }
       assert(staticErrors.sum === 0)
 
-      val dynamicRanks = starGraph.personalizedPageRank(0,0, resetProb).vertices.cache()
+      val dynamicRanks = starGraph.personalizedPageRank(0, 0, resetProb).vertices.cache()
       assert(compareRanks(staticRanks2, dynamicRanks) < errorTol)
     }
   } // end of test Star PageRank
@@ -162,7 +162,7 @@ class PageRankSuite extends FunSuite with LocalSparkContext {
   test("Chain PersonalizedPageRank") {
     withSpark { sc =>
       val chain1 = (0 until 9).map(x => (x, x + 1) )
-      val rawEdges = sc.parallelize(chain1, 1).map { case (s,d) => (s.toLong, d.toLong) }
+      val rawEdges = sc.parallelize(chain1, 1).map { case (s, d) => (s.toLong, d.toLong) }
       val chain = Graph.fromEdgeTuples(rawEdges, 1.0).cache()
       val resetProb = 0.15
       val tol = 0.0001
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala
index 293c7f3ba4c21..79bf4e6cd18ee 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala
@@ -58,7 +58,7 @@ class TriangleCountSuite extends FunSuite with LocalSparkContext {
       val triangles =
         Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++
         Array(0L -> -1L, -1L -> -2L, -2L -> 0L)
-      val revTriangles = triangles.map { case (a,b) => (b,a) }
+      val revTriangles = triangles.map { case (a, b) => (b, a) }
       val rawEdges = sc.parallelize(triangles ++ revTriangles, 2)
       val graph = Graph.fromEdgeTuples(rawEdges, true).cache()
       val triangleCount = graph.triangleCount()

From c45d58c143d68cb807186acc9d060daa8549dd5c Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 28 May 2015 21:20:54 -0700
Subject: [PATCH 230/525] [SPARK-7926] [PYSPARK] use the official Pyrolite
 release

Switch to the official Pyrolite release from the one published under `org.spark-project`. Thanks irmen for making the releases on Maven Central. We didn't upgrade to 4.6 because we don't have enough time for QA. I excludes `serpent` from its dependencies because we don't use it in Spark.
~~~
[info]   +-net.jpountz.lz4:lz4:1.3.0
[info]   +-net.razorvine:pyrolite:4.4
[info]   +-net.sf.py4j:py4j:0.8.2.1
~~~

davies

Author: Xiangrui Meng <meng@databricks.com>

Closes #6472 from mengxr/SPARK-7926 and squashes the following commits:

7b3c6bf [Xiangrui Meng] use the official Pyrolite release
---
 core/pom.xml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/core/pom.xml b/core/pom.xml
index bfa49d0d6dc25..e58efe495e36d 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -377,9 +377,15 @@
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>org.spark-project</groupId>
+      <groupId>net.razorvine</groupId>
       <artifactId>pyrolite</artifactId>
       <version>4.4</version>
+      <exclusions>
+        <exclusion>
+          <groupId>net.razorvine</groupId>
+          <artifactId>serpent</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>net.sf.py4j</groupId>

From 834e699524583a7ebfe9e83b3900ec503150deca Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 28 May 2015 21:26:43 -0700
Subject: [PATCH 231/525] [MINOR] fix RegressionEvaluator doc

`make clean html` under `python/doc` returns
~~~
/Users/meng/src/spark/python/pyspark/ml/evaluation.py:docstring of pyspark.ml.evaluation.RegressionEvaluator.setParams:3: WARNING: Definition list ends without a blank line; unexpected unindent.
~~~

harsha2010

Author: Xiangrui Meng <meng@databricks.com>

Closes #6469 from mengxr/fix-regression-evaluator-doc and squashes the following commits:

91e2dad [Xiangrui Meng] fix RegressionEvaluator doc
---
 python/pyspark/ml/evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 23c37167b3711..d8ddb78c6d639 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -205,7 +205,7 @@ def getMetricName(self):
     def setParams(self, predictionCol="prediction", labelCol="label",
                   metricName="rmse"):
         """
-        setParams(self, predictionCol="prediction", labelCol="label",
+        setParams(self, predictionCol="prediction", labelCol="label", \
                   metricName="rmse")
         Sets params for regression evaluator.
         """

From 04ddcd4db7801abefa9c9effe5d88413b29d713b Mon Sep 17 00:00:00 2001
From: Kay Ousterhout <kayousterhout@gmail.com>
Date: Thu, 28 May 2015 22:09:49 -0700
Subject: [PATCH 232/525] [SPARK-7932] Fix misleading scheduler delay
 visualization

The existing code rounds down to the nearest percent when computing the proportion
of a task's time that was spent on each phase of execution, and then computes
the scheduler delay proportion as 100 - sum(all other proportions).  As a result,
a few extra percent can end up in the scheduler delay. This commit eliminates
the rounding so that the time visualizations correspond properly to the real times.

sarutak If you could take a look at this, that would be great! Not sure if there's a good
reason to round here that I missed.

cc shivaram

Author: Kay Ousterhout <kayousterhout@gmail.com>

Closes #6484 from kayousterhout/SPARK-7932 and squashes the following commits:

1723cc4 [Kay Ousterhout] [SPARK-7932] Fix misleading scheduler delay visualization
---
 core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 31e2e7fba9783..b83a49f79c8a8 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -527,7 +527,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         minLaunchTime = launchTime.min(minLaunchTime)
         maxFinishTime = finishTime.max(maxFinishTime)
 
-        def toProportion(time: Long) = (time.toDouble / totalExecutionTime * 100).toLong
+        def toProportion(time: Long) = time.toDouble / totalExecutionTime * 100
 
         val metricsOpt = taskUIData.taskMetrics
         val shuffleReadTime =

From cd3d9a5c0c3e77098a72c85dffe4a27737009ae7 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 28 May 2015 22:28:13 -0700
Subject: [PATCH 233/525] [SPARK-7930] [CORE] [STREAMING] Fixed shutdown hook
 priorities

Shutdown hook for temp directories had priority 100 while SparkContext was 50. So the local root directory was deleted before SparkContext was shutdown. This leads to scary errors on running jobs, at the time of shutdown. This is especially a problem when running streaming examples, where Ctrl-C is the only way to shutdown.

The fix in this PR is to make the temp directory shutdown priority lower than SparkContext, so that the temp dirs are the last thing to get deleted, after the SparkContext has been shut down. Also, the DiskBlockManager shutdown priority is change from default 100 to temp_dir_prio + 1, so that it gets invoked just before all temp dirs are cleared.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6482 from tdas/SPARK-7930 and squashes the following commits:

d7cbeb5 [Tathagata Das] Removed unnecessary line
1514d0b [Tathagata Das] Fixed shutdown hook priorities
---
 .../org/apache/spark/storage/DiskBlockManager.scala  |  4 ++--
 .../src/main/scala/org/apache/spark/util/Utils.scala | 12 ++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
index 2a4447705fa65..d441a4d31b954 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
@@ -139,8 +139,8 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon
   }
 
   private def addShutdownHook(): AnyRef = {
-    Utils.addShutdownHook { () =>
-      logDebug("Shutdown hook called")
+    Utils.addShutdownHook(Utils.TEMP_DIR_SHUTDOWN_PRIORITY + 1) { () =>
+      logInfo("Shutdown hook called")
       DiskBlockManager.this.doStop()
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 763d4db690187..693e1a0a3d5f0 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -73,6 +73,13 @@ private[spark] object Utils extends Logging {
    */
   val SPARK_CONTEXT_SHUTDOWN_PRIORITY = 50
 
+  /**
+   * The shutdown priority of temp directory must be lower than the SparkContext shutdown
+   * priority. Otherwise cleaning the temp directories while Spark jobs are running can
+   * throw undesirable errors at the time of shutdown.
+   */
+  val TEMP_DIR_SHUTDOWN_PRIORITY = 25
+
   private val MAX_DIR_CREATION_ATTEMPTS: Int = 10
   @volatile private var localRootDirs: Array[String] = null
 
@@ -189,10 +196,11 @@ private[spark] object Utils extends Logging {
   private val shutdownDeleteTachyonPaths = new scala.collection.mutable.HashSet[String]()
 
   // Add a shutdown hook to delete the temp dirs when the JVM exits
-  addShutdownHook { () =>
-    logDebug("Shutdown hook called")
+  addShutdownHook(TEMP_DIR_SHUTDOWN_PRIORITY) { () =>
+    logInfo("Shutdown hook called")
     shutdownDeletePaths.foreach { dirPath =>
       try {
+        logInfo("Deleting directory " + dirPath)
         Utils.deleteRecursively(new File(dirPath))
       } catch {
         case e: Exception => logError(s"Exception while deleting Spark temp dir: $dirPath", e)

From db9513789756da4f16bb1fe8cf1d19500f231f54 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 28 May 2015 22:38:38 -0700
Subject: [PATCH 234/525] [SPARK-7922] [MLLIB] use DataFrames for user/item
 factors in ALSModel

Expose user/item factors in DataFrames. This is to be more consistent with the pipeline API. It also helps maintain consistent APIs across languages. This PR also removed fitting params from `ALSModel`.

coderxiang

Author: Xiangrui Meng <meng@databricks.com>

Closes #6468 from mengxr/SPARK-7922 and squashes the following commits:

7bfb1d5 [Xiangrui Meng] update ALSModel in PySpark
1ba5607 [Xiangrui Meng] use DataFrames for user/item factors in ALS
---
 .../apache/spark/ml/recommendation/ALS.scala  | 101 ++++++++++--------
 python/pyspark/ml/recommendation.py           |  30 +++++-
 python/pyspark/mllib/common.py                |   5 +-
 3 files changed, 89 insertions(+), 47 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index 900b637ff8ad4..df009d855ecbb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -35,21 +35,46 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
 import org.apache.spark.mllib.optimization.NNLS
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType}
+import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructType}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
 import org.apache.spark.util.collection.{OpenHashMap, OpenHashSet, SortDataFormat, Sorter}
 import org.apache.spark.util.random.XORShiftRandom
 
+/**
+ * Common params for ALS and ALSModel.
+ */
+private[recommendation] trait ALSModelParams extends Params with HasPredictionCol {
+  /**
+   * Param for the column name for user ids.
+   * Default: "user"
+   * @group param
+   */
+  val userCol = new Param[String](this, "userCol", "column name for user ids")
+
+  /** @group getParam */
+  def getUserCol: String = $(userCol)
+
+  /**
+   * Param for the column name for item ids.
+   * Default: "item"
+   * @group param
+   */
+  val itemCol = new Param[String](this, "itemCol", "column name for item ids")
+
+  /** @group getParam */
+  def getItemCol: String = $(itemCol)
+}
+
 /**
  * Common params for ALS.
  */
-private[recommendation] trait ALSParams extends Params with HasMaxIter with HasRegParam
+private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter with HasRegParam
   with HasPredictionCol with HasCheckpointInterval with HasSeed {
 
   /**
@@ -105,26 +130,6 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR
   /** @group getParam */
   def getAlpha: Double = $(alpha)
 
-  /**
-   * Param for the column name for user ids.
-   * Default: "user"
-   * @group param
-   */
-  val userCol = new Param[String](this, "userCol", "column name for user ids")
-
-  /** @group getParam */
-  def getUserCol: String = $(userCol)
-
-  /**
-   * Param for the column name for item ids.
-   * Default: "item"
-   * @group param
-   */
-  val itemCol = new Param[String](this, "itemCol", "column name for item ids")
-
-  /** @group getParam */
-  def getItemCol: String = $(itemCol)
-
   /**
    * Param for the column name for ratings.
    * Default: "rating"
@@ -156,55 +161,60 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR
    * @return output schema
    */
   protected def validateAndTransformSchema(schema: StructType): StructType = {
-    require(schema($(userCol)).dataType == IntegerType)
-    require(schema($(itemCol)).dataType== IntegerType)
+    SchemaUtils.checkColumnType(schema, $(userCol), IntegerType)
+    SchemaUtils.checkColumnType(schema, $(itemCol), IntegerType)
     val ratingType = schema($(ratingCol)).dataType
     require(ratingType == FloatType || ratingType == DoubleType)
-    val predictionColName = $(predictionCol)
-    require(!schema.fieldNames.contains(predictionColName),
-      s"Prediction column $predictionColName already exists.")
-    val newFields = schema.fields :+ StructField($(predictionCol), FloatType, nullable = false)
-    StructType(newFields)
+    SchemaUtils.appendColumn(schema, $(predictionCol), FloatType)
   }
 }
 
 /**
  * :: Experimental ::
  * Model fitted by ALS.
+ *
+ * @param rank rank of the matrix factorization model
+ * @param userFactors a DataFrame that stores user factors in two columns: `id` and `features`
+ * @param itemFactors a DataFrame that stores item factors in two columns: `id` and `features`
  */
 @Experimental
 class ALSModel private[ml] (
     override val uid: String,
-    k: Int,
-    userFactors: RDD[(Int, Array[Float])],
-    itemFactors: RDD[(Int, Array[Float])])
-  extends Model[ALSModel] with ALSParams {
+    val rank: Int,
+    @transient val userFactors: DataFrame,
+    @transient val itemFactors: DataFrame)
+  extends Model[ALSModel] with ALSModelParams {
+
+  /** @group setParam */
+  def setUserCol(value: String): this.type = set(userCol, value)
+
+  /** @group setParam */
+  def setItemCol(value: String): this.type = set(itemCol, value)
 
   /** @group setParam */
   def setPredictionCol(value: String): this.type = set(predictionCol, value)
 
   override def transform(dataset: DataFrame): DataFrame = {
-    import dataset.sqlContext.implicits._
-    val users = userFactors.toDF("id", "features")
-    val items = itemFactors.toDF("id", "features")
-
     // Register a UDF for DataFrame, and then
     // create a new column named map(predictionCol) by running the predict UDF.
     val predict = udf { (userFeatures: Seq[Float], itemFeatures: Seq[Float]) =>
       if (userFeatures != null && itemFeatures != null) {
-        blas.sdot(k, userFeatures.toArray, 1, itemFeatures.toArray, 1)
+        blas.sdot(rank, userFeatures.toArray, 1, itemFeatures.toArray, 1)
       } else {
         Float.NaN
       }
     }
     dataset
-      .join(users, dataset($(userCol)) === users("id"), "left")
-      .join(items, dataset($(itemCol)) === items("id"), "left")
-      .select(dataset("*"), predict(users("features"), items("features")).as($(predictionCol)))
+      .join(userFactors, dataset($(userCol)) === userFactors("id"), "left")
+      .join(itemFactors, dataset($(itemCol)) === itemFactors("id"), "left")
+      .select(dataset("*"),
+        predict(userFactors("features"), itemFactors("features")).as($(predictionCol)))
   }
 
   override def transformSchema(schema: StructType): StructType = {
-    validateAndTransformSchema(schema)
+    SchemaUtils.checkColumnType(schema, $(userCol), IntegerType)
+    SchemaUtils.checkColumnType(schema, $(itemCol), IntegerType)
+    SchemaUtils.appendColumn(schema, $(predictionCol), FloatType)
   }
 }
 
@@ -299,6 +309,7 @@ class ALS(override val uid: String) extends Estimator[ALSModel] with ALSParams {
   }
 
   override def fit(dataset: DataFrame): ALSModel = {
+    import dataset.sqlContext.implicits._
     val ratings = dataset
       .select(col($(userCol)).cast(IntegerType), col($(itemCol)).cast(IntegerType),
         col($(ratingCol)).cast(FloatType))
@@ -310,7 +321,9 @@ class ALS(override val uid: String) extends Estimator[ALSModel] with ALSParams {
       maxIter = $(maxIter), regParam = $(regParam), implicitPrefs = $(implicitPrefs),
       alpha = $(alpha), nonnegative = $(nonnegative),
       checkpointInterval = $(checkpointInterval), seed = $(seed))
-    val model = new ALSModel(uid, $(rank), userFactors, itemFactors).setParent(this)
+    val userDF = userFactors.toDF("id", "features")
+    val itemDF = itemFactors.toDF("id", "features")
+    val model = new ALSModel(uid, $(rank), userDF, itemDF).setParent(this)
     copyValues(model)
   }
 
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index b3e0dd7abf681..b06099ac0aee6 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -63,8 +63,15 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
     indicated user preferences rather than explicit ratings given to
     items.
 
+    >>> df = sqlContext.createDataFrame(
+    ...     [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],
+    ...     ["user", "item", "rating"])
     >>> als = ALS(rank=10, maxIter=5)
     >>> model = als.fit(df)
+    >>> model.rank
+    10
+    >>> model.userFactors.orderBy("id").collect()
+    [Row(id=0, features=[...]), Row(id=1, ...), Row(id=2, ...)]
     >>> test = sqlContext.createDataFrame([(0, 2), (1, 0), (2, 0)], ["user", "item"])
     >>> predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])
     >>> predictions[0]
@@ -260,6 +267,27 @@ class ALSModel(JavaModel):
     Model fitted by ALS.
     """
 
+    @property
+    def rank(self):
+        """rank of the matrix factorization model"""
+        return self._call_java("rank")
+
+    @property
+    def userFactors(self):
+        """
+        a DataFrame that stores user factors in two columns: `id` and
+        `features`
+        """
+        return self._call_java("userFactors")
+
+    @property
+    def itemFactors(self):
+        """
+        a DataFrame that stores item factors in two columns: `id` and
+        `features`
+        """
+        return self._call_java("itemFactors")
+
 
 if __name__ == "__main__":
     import doctest
@@ -272,8 +300,6 @@ class ALSModel(JavaModel):
     sqlContext = SQLContext(sc)
     globs['sc'] = sc
     globs['sqlContext'] = sqlContext
-    globs['df'] = sqlContext.createDataFrame([(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0),
-                                              (2, 1, 1.0), (2, 2, 5.0)], ["user", "item", "rating"])
     (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
     sc.stop()
     if failure_count:
diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py
index ba6058978880a..855e85f57155e 100644
--- a/python/pyspark/mllib/common.py
+++ b/python/pyspark/mllib/common.py
@@ -27,7 +27,7 @@
 
 from pyspark import RDD, SparkContext
 from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
-
+from pyspark.sql import DataFrame, SQLContext
 
 # Hack for support float('inf') in Py4j
 _old_smart_decode = py4j.protocol.smart_decode
@@ -99,6 +99,9 @@ def _java2py(sc, r, encoding="bytes"):
             jrdd = sc._jvm.SerDe.javaToPython(r)
             return RDD(jrdd, sc)
 
+        if clsName == 'DataFrame':
+            return DataFrame(r, SQLContext(sc))
+
         if clsName in _picklable_classes:
             r = sc._jvm.SerDe.dumps(r)
         elif isinstance(r, (JavaArray, JavaList)):

From e714ecf277a7412ea8263662977fe3ad1f794975 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 28 May 2015 22:39:21 -0700
Subject: [PATCH 235/525] [SPARK-7931] [STREAMING] Do not restart receiver when
 stopped

Attempts to restart the socket receiver when it is supposed to be stopped causes undesirable error messages.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6483 from tdas/SPARK-7931 and squashes the following commits:

09aeee1 [Tathagata Das] Do not restart receiver when stopped
---
 .../spark/streaming/dstream/SocketInputDStream.scala  | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala
index 8b72bcf20653d..96e0a9c1a88f0 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.streaming.dstream
 
+import scala.util.control.NonFatal
+
 import org.apache.spark.streaming.StreamingContext
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.NextIterator
@@ -74,13 +76,16 @@ class SocketReceiver[T: ClassTag](
       while(!isStopped && iterator.hasNext) {
         store(iterator.next)
       }
+      if (!isStopped()) {
+        restart("Socket data stream had no more data")
+      }
       logInfo("Stopped receiving")
-      restart("Retrying connecting to " + host + ":" + port)
     } catch {
       case e: java.net.ConnectException =>
         restart("Error connecting to " + host + ":" + port, e)
-      case t: Throwable =>
-        restart("Error receiving data", t)
+      case NonFatal(e) =>
+        logWarning("Error receiving data", e)
+        restart("Error receiving data", e)
     } finally {
       if (socket != null) {
         socket.close()

From 36067ce398e2949c2f122625e67fd5497febdee6 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Thu, 28 May 2015 22:48:02 -0700
Subject: [PATCH 236/525] [HOTFIX] Minor style fix from last commit

---
 .../apache/spark/streaming/dstream/SocketInputDStream.scala    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala
index 96e0a9c1a88f0..5ce5b7aae6e69 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala
@@ -78,8 +78,9 @@ class SocketReceiver[T: ClassTag](
       }
       if (!isStopped()) {
         restart("Socket data stream had no more data")
+      } else {
+        logInfo("Stopped receiving")
       }
-      logInfo("Stopped receiving")
     } catch {
       case e: java.net.ConnectException =>
         restart("Error connecting to " + host + ":" + port, e)

From 97a60cf75d1fed654953eccedd04f3442389c5ca Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 28 May 2015 23:00:02 -0700
Subject: [PATCH 237/525] [SPARK-7929] Turn whitespace checker on for more
 token types.

This is the last batch of changes to complete SPARK-7929.

Previous related PRs:
https://github.com/apache/spark/pull/6480
https://github.com/apache/spark/pull/6478
https://github.com/apache/spark/pull/6477
https://github.com/apache/spark/pull/6476
https://github.com/apache/spark/pull/6475
https://github.com/apache/spark/pull/6474
https://github.com/apache/spark/pull/6473

Author: Reynold Xin <rxin@databricks.com>

Closes #6487 from rxin/whitespace-lint and squashes the following commits:

b33d43d [Reynold Xin] [SPARK-7929] Turn whitespace checker on for more token types.
---
 .../flume/sink/TransactionProcessor.scala      |  2 +-
 .../streaming/flume/EventTransformer.scala     |  2 +-
 .../spark/streaming/kafka/KafkaRDDSuite.scala  |  2 +-
 .../streaming/mqtt/MQTTInputDStream.scala      | 14 +-------------
 .../streaming/KinesisWordCountASL.scala        |  2 +-
 .../spark/streaming/kinesis/KinesisUtils.scala |  4 ++--
 scalastyle-config.xml                          | 13 ++++++++++++-
 .../spark/sql/hive/HiveInspectorSuite.scala    | 12 ++++++------
 .../sql/hive/InsertIntoHiveTableSuite.scala    |  2 +-
 .../spark/sql/hive/ListTablesSuite.scala       |  2 +-
 .../org/apache/spark/sql/hive/UDFSuite.scala   |  6 +++---
 .../hive/execution/HiveComparisonTest.scala    |  4 ++--
 .../hive/execution/HiveResolutionSuite.scala   |  6 +++---
 .../hive/execution/HiveTableScanSuite.scala    |  2 +-
 .../sql/hive/execution/SQLQuerySuite.scala     |  2 +-
 .../apache/spark/sql/hive/parquetSuites.scala  |  4 ++--
 .../org/apache/spark/deploy/yarn/Client.scala  |  6 +++---
 .../yarn/ClientDistributedCacheManager.scala   | 18 +++++++++---------
 .../apache/spark/deploy/yarn/ClientSuite.scala |  2 +-
 19 files changed, 52 insertions(+), 53 deletions(-)

diff --git a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/TransactionProcessor.scala b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/TransactionProcessor.scala
index ea45b14294df9..7ad43b1d7b0a0 100644
--- a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/TransactionProcessor.scala
+++ b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/TransactionProcessor.scala
@@ -143,7 +143,7 @@ private class TransactionProcessor(val channel: Channel, val seqNum: String,
           eventBatch.setErrorMsg(msg)
         } else {
           // At this point, the events are available, so fill them into the event batch
-          eventBatch = new EventBatch("",seqNum, events)
+          eventBatch = new EventBatch("", seqNum, events)
         }
       })
     } catch {
diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/EventTransformer.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/EventTransformer.scala
index dc629df4f4ac2..65c49c131518b 100644
--- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/EventTransformer.scala
+++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/EventTransformer.scala
@@ -60,7 +60,7 @@ private[streaming] object EventTransformer extends Logging {
     out.write(body)
     val numHeaders = headers.size()
     out.writeInt(numHeaders)
-    for ((k,v) <- headers) {
+    for ((k, v) <- headers) {
       val keyBuff = Utils.serialize(k.toString)
       out.writeInt(keyBuff.length)
       out.write(keyBuff)
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
index 39c3fb448ff57..3c875cb766513 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
@@ -65,7 +65,7 @@ class KafkaRDDSuite extends FunSuite with BeforeAndAfterAll {
 
     val offsetRanges = Array(OffsetRange(topic, 0, 0, messages.size))
 
-    val rdd =  KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
+    val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
       sc, kafkaParams, offsetRanges)
 
     val received = rdd.map(_._2).collect.toSet
diff --git a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala
index 40f5f18547236..7c2f18cb35bda 100644
--- a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala
+++ b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala
@@ -17,22 +17,10 @@
 
 package org.apache.spark.streaming.mqtt
 
-import java.io.IOException
-import java.util.concurrent.Executors
-import java.util.Properties
-
-import scala.collection.JavaConversions._
-import scala.collection.Map
-import scala.collection.mutable.HashMap
-import scala.reflect.ClassTag
-
 import org.eclipse.paho.client.mqttv3.IMqttDeliveryToken
 import org.eclipse.paho.client.mqttv3.MqttCallback
 import org.eclipse.paho.client.mqttv3.MqttClient
-import org.eclipse.paho.client.mqttv3.MqttClientPersistence
-import org.eclipse.paho.client.mqttv3.MqttException
 import org.eclipse.paho.client.mqttv3.MqttMessage
-import org.eclipse.paho.client.mqttv3.MqttTopic
 import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence
 
 import org.apache.spark.storage.StorageLevel
@@ -87,7 +75,7 @@ class MQTTReceiver(
 
       // Handles Mqtt message
       override def messageArrived(topic: String, message: MqttMessage) {
-        store(new String(message.getPayload(),"utf-8"))
+        store(new String(message.getPayload(), "utf-8"))
       }
 
       override def deliveryComplete(token: IMqttDeliveryToken) {
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
index df77f4be9db1d..97c3476049289 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
@@ -208,7 +208,7 @@ object KinesisWordProducerASL {
       recordsPerSecond: Int,
       wordsPerRecord: Int): Seq[(String, Int)] = {
 
-    val randomWords = List("spark","you","are","my","father")
+    val randomWords = List("spark", "you", "are", "my", "father")
     val totals = scala.collection.mutable.Map[String, Int]()
   
     // Create the low-level Kinesis Client from the AWS Java SDK.
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
index 2531aebe7813c..e5acab50181e1 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
@@ -55,7 +55,7 @@ object KinesisUtils {
    */
   def createStream(
       ssc: StreamingContext,
-      kinesisAppName:  String,
+      kinesisAppName: String,
       streamName: String,
       endpointUrl: String,
       regionName: String,
@@ -102,7 +102,7 @@ object KinesisUtils {
    */
   def createStream(
       ssc: StreamingContext,
-      kinesisAppName:  String,
+      kinesisAppName: String,
       streamName: String,
       endpointUrl: String,
       regionName: String,
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 7168d5b2a8e26..68d980b610c00 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -51,8 +51,8 @@
   </parameters>
  </check>
  <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="false"></check>
  <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="false"></check>
  <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
   <parameters>
    <parameter name="maxLineLength"><![CDATA[100]]></parameter>
@@ -142,4 +142,15 @@
  <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
  <check level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
  <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
+   <parameters>
+     <parameter name="tokens">ARROW, EQUALS</parameter>
+   </parameters>
+ </check>
+  <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true">
+    <parameters>
+     <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, WHILE, FOR</parameter>
+    </parameters>
+  </check>
+  <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>
 </scalastyle>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
index 2a7374cc172b7..80c2d32bf70d7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
@@ -78,10 +78,10 @@ class HiveInspectorSuite extends FunSuite with HiveInspectors {
     Literal(java.sql.Date.valueOf("2014-09-23")) ::
     Literal(Decimal(BigDecimal(123.123))) ::
     Literal(new java.sql.Timestamp(123123)) ::
-    Literal(Array[Byte](1,2,3)) ::
-    Literal.create(Seq[Int](1,2,3), ArrayType(IntegerType)) ::
-    Literal.create(Map[Int, Int](1->2, 2->1), MapType(IntegerType, IntegerType)) ::
-    Literal.create(Row(1,2.0d,3.0f),
+    Literal(Array[Byte](1, 2, 3)) ::
+    Literal.create(Seq[Int](1, 2, 3), ArrayType(IntegerType)) ::
+    Literal.create(Map[Int, Int](1 -> 2, 2 -> 1), MapType(IntegerType, IntegerType)) ::
+    Literal.create(Row(1, 2.0d, 3.0f),
       StructType(StructField("c1", IntegerType) ::
       StructField("c2", DoubleType) ::
       StructField("c3", FloatType) :: Nil)) ::
@@ -111,8 +111,8 @@ class HiveInspectorSuite extends FunSuite with HiveInspectors {
     case DecimalType() => PrimitiveObjectInspectorFactory.writableHiveDecimalObjectInspector
     case StructType(fields) =>
       ObjectInspectorFactory.getStandardStructObjectInspector(
-        java.util.Arrays.asList(fields.map(f => f.name) :_*),
-        java.util.Arrays.asList(fields.map(f => toWritableInspector(f.dataType)) :_*))
+        java.util.Arrays.asList(fields.map(f => f.name) : _*),
+        java.util.Arrays.asList(fields.map(f => toWritableInspector(f.dataType)) : _*))
   }
 
   def checkDataType(dt1: Seq[DataType], dt2: Seq[DataType]): Unit = {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
index acf2f7da30188..9cc4685499f19 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
@@ -160,7 +160,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter {
       "p1=a"::"p2=b"::"p3=c"::"p4=c"::"p5=1"::Nil ,
       "p1=a"::"p2=b"::"p3=c"::"p4=c"::"p5=4"::Nil
     )
-    assert(listFolders(tmpDir,List()).sortBy(_.toString()) == expected.sortBy(_.toString))
+    assert(listFolders(tmpDir, List()).sortBy(_.toString()) == expected.sortBy(_.toString))
     sql("DROP TABLE table_with_partition")
     sql("DROP TABLE tmp_table")
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala
index e12a6c21ccac4..1c15997ea8e6d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala
@@ -29,7 +29,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll {
   import org.apache.spark.sql.hive.test.TestHive.implicits._
 
   val df =
-    sparkContext.parallelize((1 to 10).map(i => (i,s"str$i"))).toDF("key", "value")
+    sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")
 
   override def beforeAll(): Unit = {
     // The catalog in HiveContext is a case insensitive one.
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
index 85b6bc93d7122..8245047626d57 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
@@ -26,9 +26,9 @@ case class FunctionResult(f1: String, f2: String)
 
 class UDFSuite extends QueryTest {
   test("UDF case insensitive") {
-    udf.register("random0", () => { Math.random()})
-    udf.register("RANDOM1", () => { Math.random()})
-    udf.register("strlenScala", (_: String).length + (_:Int))
+    udf.register("random0", () => { Math.random() })
+    udf.register("RANDOM1", () => { Math.random() })
+    udf.register("strlenScala", (_: String).length + (_: Int))
     assert(sql("SELECT RANDOM0() FROM src LIMIT 1").head().getDouble(0) >= 0.0)
     assert(sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0) >= 0.0)
     assert(sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0) === 5)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index 9c056e493bfde..55e5551b63818 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -273,7 +273,7 @@ abstract class HiveComparisonTest
         }
 
         val hiveCacheFiles = queryList.zipWithIndex.map {
-          case (queryString, i)  =>
+          case (queryString, i) =>
             val cachedAnswerName = s"$testCaseName-$i-${getMd5(queryString)}"
             new File(answerCache, cachedAnswerName)
         }
@@ -304,7 +304,7 @@ abstract class HiveComparisonTest
             // other DDL has not been executed yet.
             hiveQueries.foreach(_.logical)
             val computedResults = (queryList.zipWithIndex, hiveQueries, hiveCacheFiles).zipped.map {
-              case ((queryString, i), hiveQuery, cachedAnswerFile)=>
+              case ((queryString, i), hiveQuery, cachedAnswerFile) =>
                 try {
                   // Hooks often break the harness and don't really affect our test anyway, don't
                   // even try running them.
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala
index 3dfa6e72e1242..b08db6de2d2f6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala
@@ -77,7 +77,7 @@ class HiveResolutionSuite extends HiveComparisonTest {
 
   test("case insensitivity with scala reflection") {
     // Test resolution with Scala Reflection
-    sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil)
+    sparkContext.parallelize(Data(1, 2, Nested(1, 2), Seq(Nested(1, 2))) :: Nil)
       .toDF().registerTempTable("caseSensitivityTest")
 
     val query = sql("SELECT a, b, A, B, n.a, n.b, n.A, n.B FROM caseSensitivityTest")
@@ -88,14 +88,14 @@ class HiveResolutionSuite extends HiveComparisonTest {
 
   ignore("case insensitivity with scala reflection joins") {
     // Test resolution with Scala Reflection
-    sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil)
+    sparkContext.parallelize(Data(1, 2, Nested(1, 2), Seq(Nested(1, 2))) :: Nil)
       .toDF().registerTempTable("caseSensitivityTest")
 
     sql("SELECT * FROM casesensitivitytest a JOIN casesensitivitytest b ON a.a = b.a").collect()
   }
 
   test("nested repeated resolution") {
-    sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil)
+    sparkContext.parallelize(Data(1, 2, Nested(1, 2), Seq(Nested(1, 2))) :: Nil)
       .toDF().registerTempTable("nestedRepeatedTest")
     assert(sql("SELECT nestedArray[0].a FROM nestedRepeatedTest").collect().head(0) === 1)
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala
index ab53c6309e089..0ba4d11478211 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala
@@ -76,7 +76,7 @@ class HiveTableScanSuite extends HiveComparisonTest {
      
     TestHive.sql(s"LOAD DATA LOCAL INPATH '$location' INTO TABLE timestamp_query_null")
     assert(TestHive.sql("SELECT time from timestamp_query_null limit 2").collect() 
-      === Array(Row(java.sql.Timestamp.valueOf("2014-12-11 00:00:00")),Row(null)))
+      === Array(Row(java.sql.Timestamp.valueOf("2014-12-11 00:00:00")), Row(null)))
     TestHive.sql("DROP TABLE timestamp_query_null")
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 538e66125c5fe..27863a60145d7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -327,7 +327,7 @@ class SQLQuerySuite extends QueryTest {
       "org.apache.hadoop.hive.ql.io.RCFileInputFormat",
       "org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
       "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
-      "serde_p1=p1", "serde_p2=p2", "tbl_p1=p11", "tbl_p2=p22","MANAGED_TABLE"
+      "serde_p1=p1", "serde_p2=p2", "tbl_p1=p11", "tbl_p2=p22", "MANAGED_TABLE"
     )
 
     if (HiveShim.version =="0.13.1") {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 7851f38fd4056..e62ac909cbd0c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -38,7 +38,7 @@ case class ParquetData(intField: Int, stringField: String)
 // The data that also includes the partitioning key
 case class ParquetDataWithKey(p: Int, intField: Int, stringField: String)
 
-case class StructContainer(intStructField :Int, stringStructField: String)
+case class StructContainer(intStructField: Int, stringStructField: String)
 
 case class ParquetDataWithComplexTypes(
     intField: Int,
@@ -735,7 +735,7 @@ class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {
     val filePath = new File(tempDir, "testParquet").getCanonicalPath
     val filePath2 = new File(tempDir, "testParquet2").getCanonicalPath
 
-    val df = Seq(1,2,3).map(i => (i, i.toString)).toDF("int", "str")
+    val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str")
     val df2 = df.as('x).join(df.as('y), $"x.str" === $"y.str").groupBy("y.str").max("y.int")
     intercept[Throwable](df2.write.parquet(filePath))
 
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 7e023f2d92578..234051eb7d3bb 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -1142,9 +1142,9 @@ object Client extends Logging {
           logDebug("HiveMetaStore configured in localmode")
         }
       } catch {
-        case e:java.lang.NoSuchMethodException => { logInfo("Hive Method not found " + e); return }
-        case e:java.lang.ClassNotFoundException => { logInfo("Hive Class not found " + e); return }
-        case e:Exception => { logError("Unexpected Exception " + e)
+        case e: java.lang.NoSuchMethodException => { logInfo("Hive Method not found " + e); return }
+        case e: java.lang.ClassNotFoundException => { logInfo("Hive Class not found " + e); return }
+        case e: Exception => { logError("Unexpected Exception " + e)
           throw new RuntimeException("Unexpected exception", e)
         }
       }
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
index c592ecfdfce06..4ca6c903fcf12 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
@@ -95,13 +95,13 @@ private[spark] class ClientDistributedCacheManager() extends Logging {
     val (keys, tupleValues) = distCacheFiles.unzip
     val (sizes, timeStamps, visibilities) = tupleValues.unzip3
     if (keys.size > 0) {
-      env("SPARK_YARN_CACHE_FILES") = keys.reduceLeft[String] { (acc,n) => acc + "," + n }
+      env("SPARK_YARN_CACHE_FILES") = keys.reduceLeft[String] { (acc, n) => acc + "," + n }
       env("SPARK_YARN_CACHE_FILES_TIME_STAMPS") = 
-        timeStamps.reduceLeft[String] { (acc,n) => acc + "," + n }
+        timeStamps.reduceLeft[String] { (acc, n) => acc + "," + n }
       env("SPARK_YARN_CACHE_FILES_FILE_SIZES") = 
-        sizes.reduceLeft[String] { (acc,n) => acc + "," + n }
+        sizes.reduceLeft[String] { (acc, n) => acc + "," + n }
       env("SPARK_YARN_CACHE_FILES_VISIBILITIES") = 
-        visibilities.reduceLeft[String] { (acc,n) => acc + "," + n }
+        visibilities.reduceLeft[String] { (acc, n) => acc + "," + n }
     }
   }
 
@@ -112,13 +112,13 @@ private[spark] class ClientDistributedCacheManager() extends Logging {
     val (keys, tupleValues) = distCacheArchives.unzip
     val (sizes, timeStamps, visibilities) = tupleValues.unzip3
     if (keys.size > 0) {
-      env("SPARK_YARN_CACHE_ARCHIVES") = keys.reduceLeft[String] { (acc,n) => acc + "," + n }
+      env("SPARK_YARN_CACHE_ARCHIVES") = keys.reduceLeft[String] { (acc, n) => acc + "," + n }
       env("SPARK_YARN_CACHE_ARCHIVES_TIME_STAMPS") = 
-        timeStamps.reduceLeft[String] { (acc,n) => acc + "," + n }
+        timeStamps.reduceLeft[String] { (acc, n) => acc + "," + n }
       env("SPARK_YARN_CACHE_ARCHIVES_FILE_SIZES") =
-        sizes.reduceLeft[String] { (acc,n) => acc + "," + n }
+        sizes.reduceLeft[String] { (acc, n) => acc + "," + n }
       env("SPARK_YARN_CACHE_ARCHIVES_VISIBILITIES") = 
-        visibilities.reduceLeft[String] { (acc,n) => acc + "," + n }
+        visibilities.reduceLeft[String] { (acc, n) => acc + "," + n }
     }
   }
 
@@ -160,7 +160,7 @@ private[spark] class ClientDistributedCacheManager() extends Logging {
   def ancestorsHaveExecutePermissions(
       fs: FileSystem,
       path: Path,
-      statCache: Map[URI, FileStatus]): Boolean =  {
+      statCache: Map[URI, FileStatus]): Boolean = {
     var current = path
     while (current != null) {
       // the subdirs in the path should have execute permissions for others
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
index 508819e242a26..6da3e82acdb14 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
@@ -203,7 +203,7 @@ class ClientSuite extends FunSuite with Matchers with BeforeAndAfterAll {
   def getFieldValue2[A: ClassTag, A1: ClassTag, B](
         clazz: Class[_],
         field: String,
-        defaults: => B)(mapTo:  A => B)(mapTo1: A1 => B): B = {
+        defaults: => B)(mapTo: A => B)(mapTo1: A1 => B): B = {
     Try(clazz.getField(field)).map(_.get(null)).map {
       case v: A => mapTo(v)
       case v1: A1 => mapTo1(v1)

From 23452be944463dae72a35b58551040556dd3aeb5 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Fri, 29 May 2015 00:51:12 -0700
Subject: [PATCH 238/525] [SPARK-7912] [SPARK-7921] [MLLIB] Update
 OneHotEncoder to handle ML attributes and change includeFirst to dropLast

This PR contains two major changes to `OneHotEncoder`:

1. more robust handling of ML attributes. If the input attribute is unknown, we look at the values to get the max category index
2. change `includeFirst` to `dropLast` and leave the default to `true`. There are couple benefits:

    a. consistent with other tutorials of one-hot encoding (or dummy coding) (e.g., http://www.ats.ucla.edu/stat/mult_pkg/faq/general/dummy.htm)
    b. keep the indices unmodified in the output vector. If we drop the first, all indices will be shifted by 1.
    c. If users use `StringIndex`, the last element is the least frequent one.

Sorry for including two changes in one PR! I'll update the user guide in another PR.

jkbradley sryza

Author: Xiangrui Meng <meng@databricks.com>

Closes #6466 from mengxr/SPARK-7912 and squashes the following commits:

a280dca [Xiangrui Meng] fix tests
d8f234d [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7912
171b276 [Xiangrui Meng] mention the difference between our impl vs sklearn's
00dfd96 [Xiangrui Meng] update OneHotEncoder in Python
208ddad [Xiangrui Meng] update OneHotEncoder to handle ML attributes and change includeFirst to dropLast
---
 .../spark/ml/feature/OneHotEncoder.scala      | 160 ++++++++++++------
 .../spark/ml/feature/OneHotEncoderSuite.scala |  42 ++++-
 python/pyspark/ml/feature.py                  |  58 ++++---
 3 files changed, 176 insertions(+), 84 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
index eb6ec49f854be..8f34878c8d329 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
@@ -17,94 +17,152 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.SparkException
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.ml.UnaryTransformer
-import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribute}
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.attribute._
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
-import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
-import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.types.{DoubleType, StructType}
 
 /**
  * :: Experimental ::
- * A one-hot encoder that maps a column of label indices to a column of binary vectors, with
- * at most a single one-value. By default, the binary vector has an element for each category, so
- * with 5 categories, an input value of 2.0 would map to an output vector of
- * (0.0, 0.0, 1.0, 0.0, 0.0). If includeFirst is set to false, the first category is omitted, so the
- * output vector for the previous example would be (0.0, 1.0, 0.0, 0.0) and an input value
- * of 0.0 would map to a vector of all zeros. Including the first category makes the vector columns
- * linearly dependent because they sum up to one.
+ * A one-hot encoder that maps a column of category indices to a column of binary vectors, with
+ * at most a single one-value per row that indicates the input category index.
+ * For example with 5 categories, an input value of 2.0 would map to an output vector of
+ * `[0.0, 0.0, 1.0, 0.0]`.
+ * The last category is not included by default (configurable via [[OneHotEncoder!.dropLast]]
+ * because it makes the vector entries sum up to one, and hence linearly dependent.
+ * So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
+ * Note that this is different from scikit-learn's OneHotEncoder, which keeps all categories.
+ * The output vectors are sparse.
+ *
+ * @see [[StringIndexer]] for converting categorical values into category indices
  */
 @Experimental
-class OneHotEncoder(override val uid: String)
-  extends UnaryTransformer[Double, Vector, OneHotEncoder] with HasInputCol with HasOutputCol {
+class OneHotEncoder(override val uid: String) extends Transformer
+  with HasInputCol with HasOutputCol {
 
   def this() = this(Identifiable.randomUID("oneHot"))
 
   /**
-   * Whether to include a component in the encoded vectors for the first category, defaults to true.
+   * Whether to drop the last category in the encoded vector (default: true)
    * @group param
    */
-  final val includeFirst: BooleanParam =
-    new BooleanParam(this, "includeFirst", "include first category")
-  setDefault(includeFirst -> true)
-
-  private var categories: Array[String] = _
+  final val dropLast: BooleanParam =
+    new BooleanParam(this, "dropLast", "whether to drop the last category")
+  setDefault(dropLast -> true)
 
   /** @group setParam */
-  def setIncludeFirst(value: Boolean): this.type = set(includeFirst, value)
+  def setDropLast(value: Boolean): this.type = set(dropLast, value)
 
   /** @group setParam */
-  override def setInputCol(value: String): this.type = set(inputCol, value)
+  def setInputCol(value: String): this.type = set(inputCol, value)
 
   /** @group setParam */
-  override def setOutputCol(value: String): this.type = set(outputCol, value)
+  def setOutputCol(value: String): this.type = set(outputCol, value)
 
   override def transformSchema(schema: StructType): StructType = {
-    SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
-    val inputFields = schema.fields
+    val is = "_is_"
+    val inputColName = $(inputCol)
     val outputColName = $(outputCol)
-    require(inputFields.forall(_.name != $(outputCol)),
-      s"Output column ${$(outputCol)} already exists.")
 
-    val inputColAttr = Attribute.fromStructField(schema($(inputCol)))
-    categories = inputColAttr match {
+    SchemaUtils.checkColumnType(schema, inputColName, DoubleType)
+    val inputFields = schema.fields
+    require(!inputFields.exists(_.name == outputColName),
+      s"Output column $outputColName already exists.")
+
+    val inputAttr = Attribute.fromStructField(schema(inputColName))
+    val outputAttrNames: Option[Array[String]] = inputAttr match {
       case nominal: NominalAttribute =>
-        nominal.values.getOrElse((0 until nominal.numValues.get).map(_.toString).toArray)
-      case binary: BinaryAttribute => binary.values.getOrElse(Array("0", "1"))
+        if (nominal.values.isDefined) {
+          nominal.values.map(_.map(v => inputColName + is + v))
+        } else if (nominal.numValues.isDefined) {
+          nominal.numValues.map(n => Array.tabulate(n)(i => inputColName + is + i))
+        } else {
+          None
+        }
+      case binary: BinaryAttribute =>
+        if (binary.values.isDefined) {
+          binary.values.map(_.map(v => inputColName + is + v))
+        } else {
+          Some(Array.tabulate(2)(i => inputColName + is + i))
+        }
+      case _: NumericAttribute =>
+        throw new RuntimeException(
+          s"The input column $inputColName cannot be numeric.")
       case _ =>
-        throw new SparkException(s"OneHotEncoder input column ${$(inputCol)} is not nominal")
+        None // optimistic about unknown attributes
     }
 
-    val attrValues = (if ($(includeFirst)) categories else categories.drop(1)).toArray
-    val attr = NominalAttribute.defaultAttr.withName(outputColName).withValues(attrValues)
-    val outputFields = inputFields :+ attr.toStructField()
+    val filteredOutputAttrNames = outputAttrNames.map { names =>
+      if ($(dropLast)) {
+        require(names.length > 1,
+          s"The input column $inputColName should have at least two distinct values.")
+        names.dropRight(1)
+      } else {
+        names
+      }
+    }
+
+    val outputAttrGroup = if (filteredOutputAttrNames.isDefined) {
+      val attrs: Array[Attribute] = filteredOutputAttrNames.get.map { name =>
+        BinaryAttribute.defaultAttr.withName(name)
+      }
+      new AttributeGroup($(outputCol), attrs)
+    } else {
+      new AttributeGroup($(outputCol))
+    }
+
+    val outputFields = inputFields :+ outputAttrGroup.toStructField()
     StructType(outputFields)
   }
 
-  protected override def createTransformFunc(): (Double) => Vector = {
-    val first = $(includeFirst)
-    val vecLen = if (first) categories.length else categories.length - 1
+  override def transform(dataset: DataFrame): DataFrame = {
+    // schema transformation
+    val is = "_is_"
+    val inputColName = $(inputCol)
+    val outputColName = $(outputCol)
+    val shouldDropLast = $(dropLast)
+    var outputAttrGroup = AttributeGroup.fromStructField(
+      transformSchema(dataset.schema)(outputColName))
+    if (outputAttrGroup.size < 0) {
+      // If the number of attributes is unknown, we check the values from the input column.
+      val numAttrs = dataset.select(col(inputColName).cast(DoubleType)).map(_.getDouble(0))
+        .aggregate(0.0)(
+          (m, x) => {
+            assert(x >=0.0 && x == x.toInt,
+              s"Values from column $inputColName must be indices, but got $x.")
+            math.max(m, x)
+          },
+          (m0, m1) => {
+            math.max(m0, m1)
+          }
+        ).toInt + 1
+      val outputAttrNames = Array.tabulate(numAttrs)(i => inputColName + is + i)
+      val filtered = if (shouldDropLast) outputAttrNames.dropRight(1) else outputAttrNames
+      val outputAttrs: Array[Attribute] =
+        filtered.map(name => BinaryAttribute.defaultAttr.withName(name))
+      outputAttrGroup = new AttributeGroup(outputColName, outputAttrs)
+    }
+    val metadata = outputAttrGroup.toMetadata()
+
+    // data transformation
+    val size = outputAttrGroup.size
     val oneValue = Array(1.0)
     val emptyValues = Array[Double]()
     val emptyIndices = Array[Int]()
-    label: Double => {
-      val values = if (first || label != 0.0) oneValue else emptyValues
-      val indices = if (first) {
-        Array(label.toInt)
-      } else if (label != 0.0) {
-        Array(label.toInt - 1)
+    val encode = udf { label: Double =>
+      if (label < size) {
+        Vectors.sparse(size, Array(label.toInt), oneValue)
       } else {
-        emptyIndices
+        Vectors.sparse(size, emptyIndices, emptyValues)
       }
-      Vectors.sparse(vecLen, indices, values)
     }
-  }
 
-  /**
-   * Returns the data type of the output column.
-   */
-  protected def outputDataType: DataType = new VectorUDT
+    dataset.select(col("*"), encode(col(inputColName).cast(DoubleType)).as(outputColName, metadata))
+  }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
index 056b9eda86bba..9018d0024d5f0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
@@ -19,10 +19,11 @@ package org.apache.spark.ml.feature
 
 import org.scalatest.FunSuite
 
+import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.DataFrame
-
+import org.apache.spark.sql.functions.col
 
 class OneHotEncoderSuite extends FunSuite with MLlibTestSparkContext {
 
@@ -36,15 +37,16 @@ class OneHotEncoderSuite extends FunSuite with MLlibTestSparkContext {
     indexer.transform(df)
   }
 
-  test("OneHotEncoder includeFirst = true") {
+  test("OneHotEncoder dropLast = false") {
     val transformed = stringIndexed()
     val encoder = new OneHotEncoder()
       .setInputCol("labelIndex")
       .setOutputCol("labelVec")
+      .setDropLast(false)
     val encoded = encoder.transform(transformed)
 
     val output = encoded.select("id", "labelVec").map { r =>
-      val vec = r.get(1).asInstanceOf[Vector]
+      val vec = r.getAs[Vector](1)
       (r.getInt(0), vec(0), vec(1), vec(2))
     }.collect().toSet
     // a -> 0, b -> 2, c -> 1
@@ -53,22 +55,46 @@ class OneHotEncoderSuite extends FunSuite with MLlibTestSparkContext {
     assert(output === expected)
   }
 
-  test("OneHotEncoder includeFirst = false") {
+  test("OneHotEncoder dropLast = true") {
     val transformed = stringIndexed()
     val encoder = new OneHotEncoder()
-      .setIncludeFirst(false)
       .setInputCol("labelIndex")
       .setOutputCol("labelVec")
     val encoded = encoder.transform(transformed)
 
     val output = encoded.select("id", "labelVec").map { r =>
-      val vec = r.get(1).asInstanceOf[Vector]
+      val vec = r.getAs[Vector](1)
       (r.getInt(0), vec(0), vec(1))
     }.collect().toSet
     // a -> 0, b -> 2, c -> 1
-    val expected = Set((0, 0.0, 0.0), (1, 0.0, 1.0), (2, 1.0, 0.0),
-      (3, 0.0, 0.0), (4, 0.0, 0.0), (5, 1.0, 0.0))
+    val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0),
+      (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0))
     assert(output === expected)
   }
 
+  test("input column with ML attribute") {
+    val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large")
+    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size")
+      .select(col("size").as("size", attr.toMetadata()))
+    val encoder = new OneHotEncoder()
+      .setInputCol("size")
+      .setOutputCol("encoded")
+    val output = encoder.transform(df)
+    val group = AttributeGroup.fromStructField(output.schema("encoded"))
+    assert(group.size === 2)
+    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("size_is_small").withIndex(0))
+    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("size_is_medium").withIndex(1))
+  }
+
+  test("input column without ML attribute") {
+    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index")
+    val encoder = new OneHotEncoder()
+      .setInputCol("index")
+      .setOutputCol("encoded")
+    val output = encoder.transform(df)
+    val group = AttributeGroup.fromStructField(output.schema("encoded"))
+    assert(group.size === 2)
+    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("index_is_0").withIndex(0))
+    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("index_is_1").withIndex(1))
+  }
 }
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index b0479d9b074db..ddb33f427ac64 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -324,65 +324,73 @@ def getP(self):
 @inherit_doc
 class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
     """
-    A one-hot encoder that maps a column of label indices to a column of binary vectors, with
-    at most a single one-value. By default, the binary vector has an element for each category, so
-    with 5 categories, an input value of 2.0 would map to an output vector of
-    (0.0, 0.0, 1.0, 0.0, 0.0). If includeFirst is set to false, the first category is omitted, so
-    the output vector for the previous example would be (0.0, 1.0, 0.0, 0.0) and an input value
-    of 0.0 would map to a vector of all zeros. Including the first category makes the vector columns
-    linearly dependent because they sum up to one.
-
-    TODO: This method requires the use of StringIndexer first. Decouple them.
+    A one-hot encoder that maps a column of category indices to a
+    column of binary vectors, with at most a single one-value per row
+    that indicates the input category index.
+    For example with 5 categories, an input value of 2.0 would map to
+    an output vector of `[0.0, 0.0, 1.0, 0.0]`.
+    The last category is not included by default (configurable via
+    :py:attr:`dropLast`) because it makes the vector entries sum up to
+    one, and hence linearly dependent.
+    So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
+    Note that this is different from scikit-learn's OneHotEncoder,
+    which keeps all categories.
+    The output vectors are sparse.
+
+    .. seealso::
+
+       :py:class:`StringIndexer` for converting categorical values into
+       category indices
 
     >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
     >>> model = stringIndexer.fit(stringIndDf)
     >>> td = model.transform(stringIndDf)
-    >>> encoder = OneHotEncoder(includeFirst=False, inputCol="indexed", outputCol="features")
+    >>> encoder = OneHotEncoder(inputCol="indexed", outputCol="features")
     >>> encoder.transform(td).head().features
-    SparseVector(2, {})
+    SparseVector(2, {0: 1.0})
     >>> encoder.setParams(outputCol="freqs").transform(td).head().freqs
-    SparseVector(2, {})
-    >>> params = {encoder.includeFirst: True, encoder.outputCol: "test"}
+    SparseVector(2, {0: 1.0})
+    >>> params = {encoder.dropLast: False, encoder.outputCol: "test"}
     >>> encoder.transform(td, params).head().test
     SparseVector(3, {0: 1.0})
     """
 
     # a placeholder to make it appear in the generated doc
-    includeFirst = Param(Params._dummy(), "includeFirst", "include first category")
+    dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category")
 
     @keyword_only
-    def __init__(self, includeFirst=True, inputCol=None, outputCol=None):
+    def __init__(self, dropLast=True, inputCol=None, outputCol=None):
         """
         __init__(self, includeFirst=True, inputCol=None, outputCol=None)
         """
         super(OneHotEncoder, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid)
-        self.includeFirst = Param(self, "includeFirst", "include first category")
-        self._setDefault(includeFirst=True)
+        self.dropLast = Param(self, "dropLast", "whether to drop the last category")
+        self._setDefault(dropLast=True)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, includeFirst=True, inputCol=None, outputCol=None):
+    def setParams(self, dropLast=True, inputCol=None, outputCol=None):
         """
-        setParams(self, includeFirst=True, inputCol=None, outputCol=None)
+        setParams(self, dropLast=True, inputCol=None, outputCol=None)
         Sets params for this OneHotEncoder.
         """
         kwargs = self.setParams._input_kwargs
         return self._set(**kwargs)
 
-    def setIncludeFirst(self, value):
+    def setDropLast(self, value):
         """
-        Sets the value of :py:attr:`includeFirst`.
+        Sets the value of :py:attr:`dropLast`.
         """
-        self._paramMap[self.includeFirst] = value
+        self._paramMap[self.dropLast] = value
         return self
 
-    def getIncludeFirst(self):
+    def getDropLast(self):
         """
-        Gets the value of includeFirst or its default value.
+        Gets the value of dropLast or its default value.
         """
-        return self.getOrDefault(self.includeFirst)
+        return self.getOrDefault(self.dropLast)
 
 
 @inherit_doc

From bf46580708e41a1d48ac091adbca8d82a4008699 Mon Sep 17 00:00:00 2001
From: Tim Ellison <t.p.ellison@gmail.com>
Date: Fri, 29 May 2015 05:14:43 -0400
Subject: [PATCH 239/525] [SPARK-7756] [CORE] Use testing cipher suites common
 to Oracle and IBM security providers

Add alias names for supported cipher suites to the sample SSL configuration.

The IBM JSSE provider reports its cipher suite with an SSL_ prefix, but accepts TLS_ prefixed suite names as an alias.  However, Jetty filters the requested ciphers based on the provider's reported supported suites, so the TLS_ versions are never passed through to JSSE causing an SSL handshake failure.

Author: Tim Ellison <t.p.ellison@gmail.com>

Closes #6282 from tellison/SSLFailure and squashes the following commits:

8de8a3e [Tim Ellison] Update SecurityManagerSuite with new expected suite names
96158b2 [Tim Ellison] Update the sample configs to use ciphers that are common to both the Oracle and IBM security providers.
705421b [Tim Ellison] Merge branch 'master' of github.com:tellison/spark into SSLFailure
68b9425 [Tim Ellison] Merge branch 'master' of https://github.com/apache/spark into SSLFailure
b0c35f6 [Tim Ellison] [CORE] Add aliases used for cipher suites in IBM provider
---
 core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala   | 4 ++--
 .../test/scala/org/apache/spark/SecurityManagerSuite.scala    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala b/core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala
index 308b9ea17708d..1a099da2c6c8e 100644
--- a/core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala
+++ b/core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala
@@ -34,7 +34,7 @@ object SSLSampleConfigs {
     conf.set("spark.ssl.trustStore", trustStorePath)
     conf.set("spark.ssl.trustStorePassword", "password")
     conf.set("spark.ssl.enabledAlgorithms",
-      "TLS_RSA_WITH_AES_128_CBC_SHA, SSL_RSA_WITH_DES_CBC_SHA")
+      "SSL_RSA_WITH_RC4_128_SHA, SSL_RSA_WITH_DES_CBC_SHA")
     conf.set("spark.ssl.protocol", "TLSv1")
     conf
   }
@@ -48,7 +48,7 @@ object SSLSampleConfigs {
     conf.set("spark.ssl.trustStore", trustStorePath)
     conf.set("spark.ssl.trustStorePassword", "password")
     conf.set("spark.ssl.enabledAlgorithms",
-      "TLS_RSA_WITH_AES_128_CBC_SHA, SSL_RSA_WITH_DES_CBC_SHA")
+      "SSL_RSA_WITH_RC4_128_SHA, SSL_RSA_WITH_DES_CBC_SHA")
     conf.set("spark.ssl.protocol", "TLSv1")
     conf
   }
diff --git a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala
index 62cb7649c0284..61571be44252a 100644
--- a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala
@@ -147,7 +147,7 @@ class SecurityManagerSuite extends FunSuite {
     assert(securityManager.fileServerSSLOptions.keyPassword === Some("password"))
     assert(securityManager.fileServerSSLOptions.protocol === Some("TLSv1"))
     assert(securityManager.fileServerSSLOptions.enabledAlgorithms ===
-        Set("TLS_RSA_WITH_AES_128_CBC_SHA", "SSL_RSA_WITH_DES_CBC_SHA"))
+        Set("SSL_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_DES_CBC_SHA"))
 
     assert(securityManager.akkaSSLOptions.trustStore.isDefined === true)
     assert(securityManager.akkaSSLOptions.trustStore.get.getName === "truststore")
@@ -158,7 +158,7 @@ class SecurityManagerSuite extends FunSuite {
     assert(securityManager.akkaSSLOptions.keyPassword === Some("password"))
     assert(securityManager.akkaSSLOptions.protocol === Some("TLSv1"))
     assert(securityManager.akkaSSLOptions.enabledAlgorithms ===
-        Set("TLS_RSA_WITH_AES_128_CBC_SHA", "SSL_RSA_WITH_DES_CBC_SHA"))
+        Set("SSL_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_DES_CBC_SHA"))
   }
 
   test("ssl off setup") {

From 8db40f6711058c3c3bf67ceaaaffffcc25d67d19 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Fri, 29 May 2015 05:17:41 -0400
Subject: [PATCH 240/525] [SPARK-7863] [CORE] Create SimpleDateFormat for every
 SimpleDateParam instance because it's not thread-safe

SimpleDateFormat is not thread-safe. This PR creates new `SimpleDateFormat` for each `SimpleDateParam` instance.

Author: zsxwing <zsxwing@gmail.com>

Closes #6406 from zsxwing/SPARK-7863 and squashes the following commits:

aeed4c1 [zsxwing] Rewrite SimpleDateParam
8cdd986 [zsxwing] Inline formats
9680a15 [zsxwing] Create SimpleDateFormat for each SimpleDateParam instance because it's not thread-safe
---
 .../spark/status/api/v1/SimpleDateParam.scala | 49 ++++++++-----------
 .../status/api/v1/SimpleDateParamSuite.scala  |  5 ++
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala b/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala
index cee29786c3019..0c71cd2382225 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala
@@ -16,40 +16,33 @@
  */
 package org.apache.spark.status.api.v1
 
-import java.text.SimpleDateFormat
+import java.text.{ParseException, SimpleDateFormat}
 import java.util.TimeZone
 import javax.ws.rs.WebApplicationException
 import javax.ws.rs.core.Response
 import javax.ws.rs.core.Response.Status
 
-import scala.util.Try
-
 private[v1] class SimpleDateParam(val originalValue: String) {
-  val timestamp: Long = {
-    SimpleDateParam.formats.collectFirst {
-      case fmt if Try(fmt.parse(originalValue)).isSuccess =>
-        fmt.parse(originalValue).getTime()
-    }.getOrElse(
-      throw new WebApplicationException(
-        Response
-          .status(Status.BAD_REQUEST)
-          .entity("Couldn't parse date: " + originalValue)
-          .build()
-      )
-    )
-  }
-}
 
-private[v1] object SimpleDateParam {
-
-  val formats: Seq[SimpleDateFormat] = {
-
-    val gmtDay = new SimpleDateFormat("yyyy-MM-dd")
-    gmtDay.setTimeZone(TimeZone.getTimeZone("GMT"))
-
-    Seq(
-      new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz"),
-      gmtDay
-    )
+  val timestamp: Long = {
+    val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz")
+    try {
+      format.parse(originalValue).getTime()
+    } catch {
+      case _: ParseException =>
+        val gmtDay = new SimpleDateFormat("yyyy-MM-dd")
+        gmtDay.setTimeZone(TimeZone.getTimeZone("GMT"))
+        try {
+          gmtDay.parse(originalValue).getTime()
+        } catch {
+          case _: ParseException =>
+            throw new WebApplicationException(
+              Response
+                .status(Status.BAD_REQUEST)
+                .entity("Couldn't parse date: " + originalValue)
+                .build()
+            )
+        }
+    }
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
index 731d1f557ed33..183043bc05233 100644
--- a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
+++ b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
@@ -16,6 +16,8 @@
  */
 package org.apache.spark.status.api.v1
 
+import javax.ws.rs.WebApplicationException
+
 import org.scalatest.{Matchers, FunSuite}
 
 class SimpleDateParamSuite extends FunSuite with Matchers {
@@ -24,6 +26,9 @@ class SimpleDateParamSuite extends FunSuite with Matchers {
     new SimpleDateParam("2015-02-20T23:21:17.190GMT").timestamp should be (1424474477190L)
     new SimpleDateParam("2015-02-20T17:21:17.190EST").timestamp should be (1424470877190L)
     new SimpleDateParam("2015-02-20").timestamp should be (1424390400000L) // GMT
+    intercept[WebApplicationException] {
+      new SimpleDateParam("invalid date")
+    }
   }
 
 }

From a51b133de3c65a991ab105b6f020082080121b4c Mon Sep 17 00:00:00 2001
From: WangTaoTheTonic <wangtao111@huawei.com>
Date: Fri, 29 May 2015 11:06:11 -0500
Subject: [PATCH 241/525] [SPARK-7524] [SPARK-7846] add configs for keytab and
 principal, pass these two configs with different way in different modes

* As spark now supports long running service by updating tokens for namenode, but only accept parameters passed with "--k=v" format which is not very convinient. This patch add spark.* configs in properties file and system property.

*  --principal and --keytabl options are passed to client but when we started thrift server or spark-shell these two are also passed into the Main class (org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 and org.apache.spark.repl.Main).
In these two main class, arguments passed in will be processed with some 3rd libraries, which will lead to some error: "Invalid option: --principal" or "Unrecgnised option: --principal".
We should pass these command args in different forms, say system properties.

Author: WangTaoTheTonic <wangtao111@huawei.com>

Closes #6051 from WangTaoTheTonic/SPARK-7524 and squashes the following commits:

e65699a [WangTaoTheTonic] change logic to loadEnvironments
ebd9ea0 [WangTaoTheTonic] merge master
ecfe43a [WangTaoTheTonic] pass keytab and principal seperately in different mode
33a7f40 [WangTaoTheTonic] expand the use of the current configs
08bb4e8 [WangTaoTheTonic] fix wrong cite
73afa64 [WangTaoTheTonic] add configs for keytab and principal, move originals to internal
---
 .../org/apache/spark/deploy/SparkSubmit.scala    |  8 ++++----
 .../spark/deploy/SparkSubmitArguments.scala      |  2 ++
 docs/running-on-yarn.md                          | 16 ++++++++++++++++
 .../deploy/yarn/AMDelegationTokenRenewer.scala   | 14 ++++++++------
 .../spark/deploy/yarn/ClientArguments.scala      |  6 ++++++
 5 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 92bb5059a0313..d1b32ea0778db 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -428,6 +428,8 @@ object SparkSubmit {
       OptionAssigner(args.executorCores, YARN, CLIENT, sysProp = "spark.executor.cores"),
       OptionAssigner(args.files, YARN, CLIENT, sysProp = "spark.yarn.dist.files"),
       OptionAssigner(args.archives, YARN, CLIENT, sysProp = "spark.yarn.dist.archives"),
+      OptionAssigner(args.principal, YARN, CLIENT, sysProp = "spark.yarn.principal"),
+      OptionAssigner(args.keytab, YARN, CLIENT, sysProp = "spark.yarn.keytab"),
 
       // Yarn cluster only
       OptionAssigner(args.name, YARN, CLUSTER, clOption = "--name"),
@@ -440,10 +442,8 @@ object SparkSubmit {
       OptionAssigner(args.files, YARN, CLUSTER, clOption = "--files"),
       OptionAssigner(args.archives, YARN, CLUSTER, clOption = "--archives"),
       OptionAssigner(args.jars, YARN, CLUSTER, clOption = "--addJars"),
-
-      // Yarn client or cluster
-      OptionAssigner(args.principal, YARN, ALL_DEPLOY_MODES, clOption = "--principal"),
-      OptionAssigner(args.keytab, YARN, ALL_DEPLOY_MODES, clOption = "--keytab"),
+      OptionAssigner(args.principal, YARN, CLUSTER, clOption = "--principal"),
+      OptionAssigner(args.keytab, YARN, CLUSTER, clOption = "--keytab"),
 
       // Other options
       OptionAssigner(args.executorCores, STANDALONE, ALL_DEPLOY_MODES,
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index c0e4c771908b3..cc6a7bd9f4119 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -169,6 +169,8 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
     deployMode = Option(deployMode).orElse(env.get("DEPLOY_MODE")).orNull
     numExecutors = Option(numExecutors)
       .getOrElse(sparkProperties.get("spark.executor.instances").orNull)
+    keytab = Option(keytab).orElse(sparkProperties.get("spark.yarn.keytab")).orNull
+    principal = Option(principal).orElse(sparkProperties.get("spark.yarn.principal")).orNull
 
     // Try to set main class from JAR if no --class argument is given
     if (mainClass == null && !isPython && !isR && primaryResource != null) {
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index 9d55f435e80ad..96cf612c54fdd 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -242,6 +242,22 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
   running against earlier versions, this property will be ignored.
   </td>
 </tr>
+<tr>
+  <td><code>spark.yarn.keytab</code></td>
+  <td>(none)</td>
+  <td>
+  The full path to the file that contains the keytab for the principal specified above.
+  This keytab will be copied to the node running the Application Master via the Secure Distributed Cache,
+  for renewing the login tickets and the delegation tokens periodically.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.yarn.principal</code></td>
+  <td>(none)</td>
+  <td>
+  Principal to be used to login to KDC, while running on secure HDFS.
+  </td>
+</tr>
 </table>
 
 # Launching Spark on YARN
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala
index aaae6f9734a85..77af46c192cc2 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala
@@ -60,8 +60,11 @@ private[yarn] class AMDelegationTokenRenewer(
 
   private val hadoopUtil = YarnSparkHadoopUtil.get
 
-  private val daysToKeepFiles = sparkConf.getInt("spark.yarn.credentials.file.retention.days", 5)
-  private val numFilesToKeep = sparkConf.getInt("spark.yarn.credentials.file.retention.count", 5)
+  private val credentialsFile = sparkConf.get("spark.yarn.credentials.file")
+  private val daysToKeepFiles =
+    sparkConf.getInt("spark.yarn.credentials.file.retention.days", 5)
+  private val numFilesToKeep =
+    sparkConf.getInt("spark.yarn.credentials.file.retention.count", 5)
 
   /**
    * Schedule a login from the keytab and principal set using the --principal and --keytab
@@ -121,7 +124,7 @@ private[yarn] class AMDelegationTokenRenewer(
     import scala.concurrent.duration._
     try {
       val remoteFs = FileSystem.get(hadoopConf)
-      val credentialsPath = new Path(sparkConf.get("spark.yarn.credentials.file"))
+      val credentialsPath = new Path(credentialsFile)
       val thresholdTime = System.currentTimeMillis() - (daysToKeepFiles days).toMillis
       hadoopUtil.listFilesSorted(
         remoteFs, credentialsPath.getParent,
@@ -160,7 +163,7 @@ private[yarn] class AMDelegationTokenRenewer(
     val keytabLoggedInUGI = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab)
     logInfo("Successfully logged into KDC.")
     val tempCreds = keytabLoggedInUGI.getCredentials
-    val credentialsPath = new Path(sparkConf.get("spark.yarn.credentials.file"))
+    val credentialsPath = new Path(credentialsFile)
     val dst = credentialsPath.getParent
     keytabLoggedInUGI.doAs(new PrivilegedExceptionAction[Void] {
       // Get a copy of the credentials
@@ -186,8 +189,7 @@ private[yarn] class AMDelegationTokenRenewer(
     }
     val nextSuffix = lastCredentialsFileSuffix + 1
     val tokenPathStr =
-      sparkConf.get("spark.yarn.credentials.file") +
-        SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM + nextSuffix
+      credentialsFile + SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM + nextSuffix
     val tokenPath = new Path(tokenPathStr)
     val tempTokenPath = new Path(tokenPathStr + SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
     logInfo("Writing out delegation tokens to " + tempTokenPath.toString)
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
index 5653c9f14dc6d..9c7b1b3988082 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
@@ -98,6 +98,12 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
 
       numExecutors = initialNumExecutors
     }
+    principal = Option(principal)
+      .orElse(sparkConf.getOption("spark.yarn.principal"))
+      .orNull
+    keytab = Option(keytab)
+      .orElse(sparkConf.getOption("spark.yarn.keytab"))
+      .orNull
   }
 
   /**

From e7b61775571ce7a06d044bc3a6055ff94c7477d6 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Fri, 29 May 2015 10:43:34 -0700
Subject: [PATCH 242/525] [SPARK-7950] [SQL] Sets spark.sql.hive.version in
 HiveThriftServer2.startWithContext()

When starting `HiveThriftServer2` via `startWithContext`, property `spark.sql.hive.version` isn't set. This causes Simba ODBC driver 1.0.8.1006 behaves differently and fails simple queries.

Hive2 JDBC driver works fine in this case. Also, when starting the server with `start-thriftserver.sh`, both Hive2 JDBC driver and Simba ODBC driver works fine.

Please refer to [SPARK-7950] [1] for details.

[1]: https://issues.apache.org/jira/browse/SPARK-7950

Author: Cheng Lian <lian@databricks.com>

Closes #6500 from liancheng/odbc-bugfix and squashes the following commits:

051e3a3 [Cheng Lian] Fixes import order
3a97376 [Cheng Lian] Sets spark.sql.hive.version in HiveThriftServer2.startWithContext()
---
 .../sql/hive/thriftserver/HiveThriftServer2.scala | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
index 3458b04bfba0f..94687eeda4179 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
@@ -17,23 +17,23 @@
 
 package org.apache.spark.sql.hive.thriftserver
 
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.commons.logging.LogFactory
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hive.service.cli.thrift.{ThriftBinaryCLIService, ThriftHttpCLIService}
 import org.apache.hive.service.server.{HiveServer2, ServerOptionsProcessor}
-import org.apache.spark.sql.SQLConf
 
-import org.apache.spark.{SparkContext, SparkConf, Logging}
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.sql.hive.HiveContext
+import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd, SparkListenerJobStart}
+import org.apache.spark.sql.SQLConf
 import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
-import org.apache.spark.scheduler.{SparkListenerJobStart, SparkListenerApplicationEnd, SparkListener}
 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab
+import org.apache.spark.sql.hive.{HiveContext, HiveShim}
 import org.apache.spark.util.Utils
-
-import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
+import org.apache.spark.{Logging, SparkContext}
 
 /**
  * The main entry point for the Spark SQL port of HiveServer2.  Starts up a `SparkSQLContext` and a
@@ -51,6 +51,7 @@ object HiveThriftServer2 extends Logging {
   @DeveloperApi
   def startWithContext(sqlContext: HiveContext): Unit = {
     val server = new HiveThriftServer2(sqlContext)
+    sqlContext.setConf("spark.sql.hive.version", HiveShim.version)
     server.init(sqlContext.hiveconf)
     server.start()
     listener = new HiveThriftServer2Listener(server, sqlContext.conf)

From 4782e130400f16e77c8b7f7fe8791acae1c5f8f1 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Fri, 29 May 2015 11:11:40 -0700
Subject: [PATCH 243/525] [SQL] [TEST] [MINOR] Uses a temporary
 log4j.properties in HiveThriftServer2Test to ensure expected logging behavior

The `HiveThriftServer2Test` relies on proper logging behavior to assert whether the Thrift server daemon process is started successfully. However, some other jar files listed in the classpath may potentially contain an unexpected Log4J configuration file which overrides the logging behavior.

This PR writes a temporary `log4j.properties` and prepend it to driver classpath before starting the testing Thrift server process to ensure proper logging behavior.

cc andrewor14 yhuai

Author: Cheng Lian <lian@databricks.com>

Closes #6493 from liancheng/override-log4j and squashes the following commits:

c489e0e [Cheng Lian] Fixes minor Scala styling issue
b46ef0d [Cheng Lian] Uses a temporary log4j.properties in HiveThriftServer2Test to ensure expected logging behavior
---
 .../HiveThriftServer2Suites.scala             | 31 +++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
index 1fadea97fd07f..610939c6a9481 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
@@ -19,6 +19,8 @@ package org.apache.spark.sql.hive.thriftserver
 
 import java.io.File
 import java.net.URL
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Paths}
 import java.sql.{Date, DriverManager, Statement}
 
 import scala.collection.mutable.ArrayBuffer
@@ -54,7 +56,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest {
   override def mode: ServerMode.Value = ServerMode.binary
 
   private def withCLIServiceClient(f: ThriftCLIServiceClient => Unit): Unit = {
-    // Transport creation logics below mimics HiveConnection.createBinaryTransport
+    // Transport creation logic below mimics HiveConnection.createBinaryTransport
     val rawTransport = new TSocket("localhost", serverPort)
     val user = System.getProperty("user.name")
     val transport = PlainSaslHelper.getPlainTransport(user, "anonymous", rawTransport)
@@ -391,10 +393,10 @@ abstract class HiveThriftJdbcTest extends HiveThriftServer2Test {
     val statements = connections.map(_.createStatement())
 
     try {
-      statements.zip(fs).map { case (s, f) => f(s) }
+      statements.zip(fs).foreach { case (s, f) => f(s) }
     } finally {
-      statements.map(_.close())
-      connections.map(_.close())
+      statements.foreach(_.close())
+      connections.foreach(_.close())
     }
   }
 
@@ -433,15 +435,32 @@ abstract class HiveThriftServer2Test extends FunSuite with BeforeAndAfterAll wit
       ConfVars.HIVE_SERVER2_THRIFT_HTTP_PORT
     }
 
+    val driverClassPath = {
+      // Writes a temporary log4j.properties and prepend it to driver classpath, so that it
+      // overrides all other potential log4j configurations contained in other dependency jar files.
+      val tempLog4jConf = Utils.createTempDir().getCanonicalPath
+
+      Files.write(
+        Paths.get(s"$tempLog4jConf/log4j.properties"),
+        """log4j.rootCategory=INFO, console
+          |log4j.appender.console=org.apache.log4j.ConsoleAppender
+          |log4j.appender.console.target=System.err
+          |log4j.appender.console.layout=org.apache.log4j.PatternLayout
+          |log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+        """.stripMargin.getBytes(StandardCharsets.UTF_8))
+
+      tempLog4jConf + File.pathSeparator + sys.props("java.class.path")
+    }
+
     s"""$startScript
        |  --master local
-       |  --hiveconf hive.root.logger=INFO,console
        |  --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$metastoreJdbcUri
        |  --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath
        |  --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost
        |  --hiveconf ${ConfVars.HIVE_SERVER2_TRANSPORT_MODE}=$mode
        |  --hiveconf $portConf=$port
-       |  --driver-class-path ${sys.props("java.class.path")}
+       |  --driver-class-path $driverClassPath
+       |  --driver-java-options -Dlog4j.debug
        |  --conf spark.ui.enabled=false
      """.stripMargin.split("\\s+").toSeq
   }

From 6181937f315480543d28e542d43269cfa591e9d0 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Fri, 29 May 2015 11:36:41 -0700
Subject: [PATCH 244/525] [SPARK-7946] [MLLIB] DecayFactor wrongly set in
 StreamingKMeans

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6497 from MechCoder/spark-7946 and squashes the following commits:

2fdd0a3 [MechCoder] Add non-regression test
8c988c6 [MechCoder] [SPARK-7946] DecayFactor wrongly set in StreamingKMeans
---
 .../apache/spark/mllib/clustering/StreamingKMeans.scala    | 2 +-
 .../spark/mllib/clustering/StreamingKMeansSuite.scala      | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index 812014a041719..c21e4fe7dc9b6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -178,7 +178,7 @@ class StreamingKMeans(
 
   /** Set the decay factor directly (for forgetful algorithms). */
   def setDecayFactor(a: Double): this.type = {
-    this.decayFactor = decayFactor
+    this.decayFactor = a
     this
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
index f90025d535e45..13f9b17c027a4 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
@@ -133,6 +133,13 @@ class StreamingKMeansSuite extends FunSuite with TestSuiteBase {
     assert(math.abs(c1) ~== 0.8 absTol 0.6)
   }
 
+  test("SPARK-7946 setDecayFactor") {
+    val kMeans = new StreamingKMeans()
+    assert(kMeans.decayFactor === 1.0)
+    kMeans.setDecayFactor(2.0)
+    assert(kMeans.decayFactor === 2.0)
+  }
+
   def StreamingKMeansDataGenerator(
       numPoints: Int,
       numBatches: Int,

From 94f62a4979e4bc5f7bf4f5852d76977e097209e6 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 29 May 2015 13:38:37 -0700
Subject: [PATCH 245/525] [SPARK-7940] Enforce whitespace checking for DO, TRY,
 CATCH, FINALLY, MATCH, LARROW, RARROW in style checker.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…

Author: Reynold Xin <rxin@databricks.com>

Closes #6491 from rxin/more-whitespace and squashes the following commits:

f6e63dc [Reynold Xin] [SPARK-7940] Enforce whitespace checking for DO, TRY, CATCH, FINALLY, MATCH, LARROW, RARROW in style checker.
---
 .../scala/org/apache/spark/network/nio/BlockMessage.scala    | 2 +-
 .../main/scala/org/apache/spark/network/nio/Connection.scala | 5 ++---
 .../org/apache/spark/network/nio/ConnectionManager.scala     | 5 ++---
 .../org/apache/spark/rdd/PartitionerAwareUnionRDD.scala      | 2 +-
 .../main/scala/org/apache/spark/mllib/tree/model/Node.scala  | 2 +-
 .../spark/mllib/classification/LogisticRegressionSuite.scala | 4 ++--
 scalastyle-config.xml                                        | 4 ++--
 .../main/scala/org/apache/spark/sql/types/UTF8String.scala   | 2 +-
 .../src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala   | 2 +-
 9 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala b/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
index 1a92a799d004a..67a376102994c 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
@@ -155,7 +155,7 @@ private[nio] class BlockMessage() {
 
   override def toString: String = {
     "BlockMessage [type = " + typ + ", id = " + id + ", level = " + level +
-    ", data = " + (if (data != null) data.remaining.toString  else "null") + "]"
+    ", data = " + (if (data != null) data.remaining.toString else "null") + "]"
   }
 }
 
diff --git a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala
index 6b898bd4bfc1b..1499da07bb83b 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala
@@ -326,15 +326,14 @@ class SendingConnection(val address: InetSocketAddress, selector_ : Selector,
 
   // MUST be called within the selector loop
   def connect() {
-    try{
+    try {
       channel.register(selector, SelectionKey.OP_CONNECT)
       channel.connect(address)
       logInfo("Initiating connection to [" + address + "]")
     } catch {
-      case e: Exception => {
+      case e: Exception =>
         logError("Error connecting to " + address, e)
         callOnExceptionCallbacks(e)
-      }
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
index 497871ed6d5e5..c0bca2c4bc994 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
@@ -635,12 +635,11 @@ private[nio] class ConnectionManager(
         val message = securityMsgResp.toBufferMessage
         if (message == null) throw new IOException("Error creating security message")
         sendSecurityMessage(waitingConn.getRemoteConnectionManagerId(), message)
-      } catch  {
-        case e: Exception => {
+      } catch {
+        case e: Exception =>
           logError("Error handling sasl client authentication", e)
           waitingConn.close()
           throw new IOException("Error evaluating sasl response: ", e)
-        }
       }
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala
index 7598ff617b399..9e3880714a79f 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala
@@ -86,7 +86,7 @@ class PartitionerAwareUnionRDD[T: ClassTag](
     }
     val location = if (locations.isEmpty) {
       None
-    } else  {
+    } else {
       // Find the location that maximum number of parent partitions prefer
       Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
index ee710fc1ed299..a6d1398fc267b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
@@ -83,7 +83,7 @@ class Node (
   def predict(features: Vector) : Double = {
     if (isLeaf) {
       predict.predict
-    } else{
+    } else {
       if (split.get.featureType == Continuous) {
         if (features(split.get.feature) <= split.get.threshold) {
           leftNode.get.predict(features)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
index 966811a5a3263..b1014ab7c6203 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -119,7 +119,7 @@ object LogisticRegressionSuite {
       }
       // Preventing the overflow when we compute the probability
       val maxMargin = margins.max
-      if (maxMargin > 0) for (i <-0 until nClasses) margins(i) -= maxMargin
+      if (maxMargin > 0) for (i <- 0 until nClasses) margins(i) -= maxMargin
 
       // Computing the probabilities for each class from the margins.
       val norm = {
@@ -130,7 +130,7 @@ object LogisticRegressionSuite {
         }
         temp
       }
-      for (i <-0 until nClasses) probs(i) /= norm
+      for (i <- 0 until nClasses) probs(i) /= norm
 
       // Compute the cumulative probability so we can generate a random number and assign a label.
       for (i <- 1 until nClasses) probs(i) += probs(i - 1)
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 68d980b610c00..68c8ce3b7e10b 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -144,12 +144,12 @@
  <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
  <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
    <parameters>
-     <parameter name="tokens">ARROW, EQUALS</parameter>
+     <parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
    </parameters>
  </check>
   <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true">
     <parameters>
-     <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, WHILE, FOR</parameter>
+     <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
     </parameters>
   </check>
   <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
index bc9c37bf2d5d2..f5d8fcced362b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
@@ -203,7 +203,7 @@ object UTF8String {
   def apply(s: String): UTF8String = {
     if (s != null) {
       new UTF8String().set(s)
-    } else{
+    } else {
       null
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 0bdb68e8ac845..2d8d950038e78 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -262,7 +262,7 @@ private[sql] class JDBCRDD(
   }
 
   private def escapeSql(value: String): String =
-    if (value == null) null else  StringUtils.replace(value, "'", "''")
+    if (value == null) null else StringUtils.replace(value, "'", "''")
 
   /**
    * Turns a single Filter into a String representing a SQL expression.

From 9eb222c13991c2b4a22db485710dc2e27ccf06dd Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 29 May 2015 14:03:12 -0700
Subject: [PATCH 246/525] [SPARK-7558] Demarcate tests in unit-tests.log

Right now `unit-tests.log` are not of much value because we can't tell where the test boundaries are easily. This patch adds log statements before and after each test to outline the test boundaries, e.g.:

```
===== TEST OUTPUT FOR o.a.s.serializer.KryoSerializerSuite: 'kryo with parallelize for primitive arrays' =====

15/05/27 12:36:39.596 pool-1-thread-1-ScalaTest-running-KryoSerializerSuite INFO SparkContext: Starting job: count at KryoSerializerSuite.scala:230
15/05/27 12:36:39.596 dag-scheduler-event-loop INFO DAGScheduler: Got job 3 (count at KryoSerializerSuite.scala:230) with 4 output partitions (allowLocal=false)
15/05/27 12:36:39.596 dag-scheduler-event-loop INFO DAGScheduler: Final stage: ResultStage 3(count at KryoSerializerSuite.scala:230)
15/05/27 12:36:39.596 dag-scheduler-event-loop INFO DAGScheduler: Parents of final stage: List()
15/05/27 12:36:39.597 dag-scheduler-event-loop INFO DAGScheduler: Missing parents: List()
15/05/27 12:36:39.597 dag-scheduler-event-loop INFO DAGScheduler: Submitting ResultStage 3 (ParallelCollectionRDD[5] at parallelize at KryoSerializerSuite.scala:230), which has no missing parents

...

15/05/27 12:36:39.624 pool-1-thread-1-ScalaTest-running-KryoSerializerSuite INFO DAGScheduler: Job 3 finished: count at KryoSerializerSuite.scala:230, took 0.028563 s
15/05/27 12:36:39.625 pool-1-thread-1-ScalaTest-running-KryoSerializerSuite INFO KryoSerializerSuite:

***** FINISHED o.a.s.serializer.KryoSerializerSuite: 'kryo with parallelize for primitive arrays' *****

...
```

Author: Andrew Or <andrew@databricks.com>

Closes #6441 from andrewor14/demarcate-tests and squashes the following commits:

879b060 [Andrew Or] Fix compile after rebase
d622af7 [Andrew Or] Merge branch 'master' of github.com:apache/spark into demarcate-tests
017c8ba [Andrew Or] Merge branch 'master' of github.com:apache/spark into demarcate-tests
7790b6c [Andrew Or] Fix tests after logical merge conflict
c7460c0 [Andrew Or] Merge branch 'master' of github.com:apache/spark into demarcate-tests
c43ffc4 [Andrew Or] Fix tests?
8882581 [Andrew Or] Fix tests
ee22cda [Andrew Or] Fix log message
fa9450e [Andrew Or] Merge branch 'master' of github.com:apache/spark into demarcate-tests
12d1e1b [Andrew Or] Various whitespace changes (minor)
69cbb24 [Andrew Or] Make all test suites extend SparkFunSuite instead of FunSuite
bbce12e [Andrew Or] Fix manual things that cannot be covered through automation
da0b12f [Andrew Or] Add core tests as dependencies in all modules
f7d29ce [Andrew Or] Introduce base abstract class for all test suites
---
 bagel/pom.xml                                 |  7 +++
 .../org/apache/spark/bagel/BagelSuite.scala   |  4 +-
 core/pom.xml                                  |  6 +++
 .../org/apache/spark/AccumulatorSuite.scala   |  3 +-
 .../org/apache/spark/CacheManagerSuite.scala  |  4 +-
 .../org/apache/spark/CheckpointSuite.scala    |  4 +-
 .../apache/spark/ContextCleanerSuite.scala    |  4 +-
 .../org/apache/spark/DistributedSuite.scala   |  3 +-
 .../scala/org/apache/spark/DriverSuite.scala  |  3 +-
 .../ExecutorAllocationManagerSuite.scala      |  8 +++-
 .../scala/org/apache/spark/FailureSuite.scala |  4 +-
 .../org/apache/spark/FileServerSuite.scala    |  3 +-
 .../scala/org/apache/spark/FileSuite.scala    |  3 +-
 .../org/apache/spark/FutureActionSuite.scala  |  8 +++-
 .../apache/spark/HeartbeatReceiverSuite.scala |  3 +-
 .../apache/spark/ImplicitOrderingSuite.scala  |  4 +-
 .../apache/spark/JobCancellationSuite.scala   |  4 +-
 .../apache/spark/MapOutputTrackerSuite.scala  |  3 +-
 .../org/apache/spark/PartitioningSuite.scala  |  4 +-
 .../org/apache/spark/SSLOptionsSuite.scala    |  4 +-
 .../apache/spark/SecurityManagerSuite.scala   |  4 +-
 .../scala/org/apache/spark/ShuffleSuite.scala |  3 +-
 .../org/apache/spark/SparkConfSuite.scala     |  3 +-
 .../apache/spark/SparkContextInfoSuite.scala  |  4 +-
 .../SparkContextSchedulerCreationSuite.scala  |  4 +-
 .../org/apache/spark/SparkContextSuite.scala  |  4 +-
 .../org/apache/spark/SparkFunSuite.scala      | 46 +++++++++++++++++++
 .../org/apache/spark/StatusTrackerSuite.scala |  4 +-
 .../org/apache/spark/ThreadingSuite.scala     |  3 +-
 .../org/apache/spark/UnpersistSuite.scala     |  3 +-
 .../api/python/PythonBroadcastSuite.scala     |  6 +--
 .../spark/api/python/PythonRDDSuite.scala     |  4 +-
 .../spark/api/python/SerDeUtilSuite.scala     |  6 +--
 .../spark/broadcast/BroadcastSuite.scala      |  6 +--
 .../org/apache/spark/deploy/ClientSuite.scala |  5 +-
 .../spark/deploy/JsonProtocolSuite.scala      |  5 +-
 .../spark/deploy/LogUrlsStandaloneSuite.scala |  6 +--
 .../spark/deploy/PythonRunnerSuite.scala      |  5 +-
 .../spark/deploy/SparkSubmitSuite.scala       |  8 +++-
 .../spark/deploy/SparkSubmitUtilsSuite.scala  |  5 +-
 .../history/FsHistoryProviderSuite.scala      |  6 +--
 .../deploy/history/HistoryServerSuite.scala   |  6 +--
 .../spark/deploy/master/MasterSuite.scala     |  6 +--
 .../rest/StandaloneRestSubmitSuite.scala      |  4 +-
 .../deploy/rest/SubmitRestProtocolSuite.scala |  5 +-
 .../deploy/worker/CommandUtilsSuite.scala     |  5 +-
 .../deploy/worker/DriverRunnerTest.scala      |  5 +-
 .../deploy/worker/ExecutorRunnerTest.scala    |  6 +--
 .../deploy/worker/WorkerArgumentsTest.scala   |  5 +-
 .../spark/deploy/worker/WorkerSuite.scala     |  6 +--
 .../deploy/worker/WorkerWatcherSuite.scala    |  5 +-
 .../spark/executor/TaskMetricsSuite.scala     |  4 +-
 .../WholeTextFileRecordReaderSuite.scala      |  5 +-
 .../spark/io/CompressionCodecSuite.scala      |  5 +-
 .../metrics/InputOutputMetricsSuite.scala     |  6 +--
 .../spark/metrics/MetricsConfigSuite.scala    |  6 ++-
 .../spark/metrics/MetricsSystemSuite.scala    |  6 +--
 .../NettyBlockTransferSecuritySuite.scala     |  6 +--
 .../NettyBlockTransferServiceSuite.scala      |  8 +++-
 .../network/nio/ConnectionManagerSuite.scala  |  6 +--
 .../spark/rdd/AsyncRDDActionsSuite.scala      |  6 +--
 .../org/apache/spark/rdd/DoubleRDDSuite.scala |  4 +-
 .../org/apache/spark/rdd/JdbcRDDSuite.scala   |  6 +--
 .../spark/rdd/PairRDDFunctionsSuite.scala     |  6 +--
 .../rdd/ParallelCollectionSplitSuite.scala    |  5 +-
 .../spark/rdd/PartitionPruningRDDSuite.scala  |  6 +--
 .../rdd/PartitionwiseSampledRDDSuite.scala    |  6 +--
 .../org/apache/spark/rdd/PipedRDDSuite.scala  |  3 +-
 .../spark/rdd/RDDOperationScopeSuite.scala    |  6 +--
 .../scala/org/apache/spark/rdd/RDDSuite.scala |  4 +-
 .../org/apache/spark/rdd/SortingSuite.scala   |  5 +-
 .../spark/rdd/ZippedPartitionsSuite.scala     |  5 +-
 .../org/apache/spark/rpc/RpcEnvSuite.scala    |  6 +--
 .../CoarseGrainedSchedulerBackendSuite.scala  |  6 +--
 .../spark/scheduler/DAGSchedulerSuite.scala   |  4 +-
 .../scheduler/EventLoggingListenerSuite.scala |  4 +-
 .../spark/scheduler/MapStatusSuite.scala      |  5 +-
 .../OutputCommitCoordinatorSuite.scala        |  4 +-
 .../apache/spark/scheduler/PoolSuite.scala    |  6 +--
 .../spark/scheduler/ReplayListenerSuite.scala |  6 +--
 .../spark/scheduler/SparkListenerSuite.scala  |  6 +--
 .../SparkListenerWithClusterSuite.scala       |  6 +--
 .../spark/scheduler/TaskContextSuite.scala    |  3 +-
 .../scheduler/TaskResultGetterSuite.scala     |  6 +--
 .../scheduler/TaskSchedulerImplSuite.scala    |  4 +-
 .../spark/scheduler/TaskSetManagerSuite.scala |  4 +-
 .../cluster/mesos/MemoryUtilsSuite.scala      |  5 +-
 .../mesos/MesosSchedulerBackendSuite.scala    |  5 +-
 .../mesos/MesosTaskLaunchDataSuite.scala      |  4 +-
 .../mesos/MesosClusterSchedulerSuite.scala    |  5 +-
 .../serializer/JavaSerializerSuite.scala      |  5 +-
 .../KryoSerializerDistributedSuite.scala      |  5 +-
 .../KryoSerializerResizableOutputSuite.scala  |  6 +--
 .../serializer/KryoSerializerSuite.scala      |  7 ++-
 .../ProactiveClosureSerializationSuite.scala  |  6 +--
 .../SerializationDebuggerSuite.scala          |  6 ++-
 .../SerializerPropertiesSuite.scala           |  6 +--
 .../shuffle/ShuffleMemoryManagerSuite.scala   |  5 +-
 .../hash/HashShuffleManagerSuite.scala        |  6 +--
 .../unsafe/UnsafeShuffleManagerSuite.scala    |  4 +-
 .../status/api/v1/SimpleDateParamSuite.scala  |  6 ++-
 .../apache/spark/storage/BlockIdSuite.scala   |  4 +-
 .../BlockManagerReplicationSuite.scala        |  6 +--
 .../spark/storage/BlockManagerSuite.scala     |  4 +-
 .../storage/BlockObjectWriterSuite.scala      |  6 +--
 .../spark/storage/DiskBlockManagerSuite.scala |  6 +--
 .../spark/storage/FlatmapIteratorSuite.scala  |  5 +-
 .../apache/spark/storage/LocalDirsSuite.scala |  6 +--
 .../ShuffleBlockFetcherIteratorSuite.scala    |  5 +-
 .../storage/StorageStatusListenerSuite.scala  |  5 +-
 .../apache/spark/storage/StorageSuite.scala   |  4 +-
 .../org/apache/spark/ui/UISeleniumSuite.scala |  2 +-
 .../scala/org/apache/spark/ui/UISuite.scala   |  5 +-
 .../ui/jobs/JobProgressListenerSuite.scala    |  3 +-
 .../RDDOperationGraphListenerSuite.scala      |  6 +--
 .../spark/ui/storage/StorageTabSuite.scala    |  6 +--
 .../apache/spark/util/AkkaUtilsSuite.scala    |  3 +-
 .../spark/util/ClosureCleanerSuite.scala      |  6 +--
 .../spark/util/ClosureCleanerSuite2.scala     |  6 +--
 .../spark/util/CompletionIteratorSuite.scala  |  4 +-
 .../apache/spark/util/DistributionSuite.scala |  5 +-
 .../apache/spark/util/EventLoopSuite.scala    |  5 +-
 .../apache/spark/util/FileAppenderSuite.scala |  6 +--
 .../apache/spark/util/JsonProtocolSuite.scala |  3 +-
 .../util/MutableURLClassLoaderSuite.scala     |  6 +--
 .../apache/spark/util/NextIteratorSuite.scala |  5 +-
 .../spark/util/ResetSystemProperties.scala    |  4 +-
 .../spark/util/SizeEstimatorSuite.scala       |  9 +++-
 .../apache/spark/util/ThreadUtilsSuite.scala  |  4 +-
 .../spark/util/TimeStampedHashMapSuite.scala  |  4 +-
 .../org/apache/spark/util/UtilsSuite.scala    |  5 +-
 .../org/apache/spark/util/VectorSuite.scala   |  4 +-
 .../util/collection/AppendOnlyMapSuite.scala  |  4 +-
 .../spark/util/collection/BitSetSuite.scala   |  4 +-
 .../util/collection/ChainedBufferSuite.scala  |  5 +-
 .../util/collection/CompactBufferSuite.scala  |  4 +-
 .../ExternalAppendOnlyMapSuite.scala          |  4 +-
 .../util/collection/ExternalSorterSuite.scala |  4 +-
 .../util/collection/OpenHashMapSuite.scala    |  4 +-
 .../util/collection/OpenHashSetSuite.scala    |  4 +-
 ...PartitionedSerializedPairBufferSuite.scala |  5 +-
 .../PrimitiveKeyOpenHashMapSuite.scala        |  4 +-
 .../collection/PrimitiveVectorSuite.scala     |  5 +-
 .../util/collection/SizeTrackerSuite.scala    |  5 +-
 .../spark/util/collection/SorterSuite.scala   |  5 +-
 .../io/ByteArrayChunkOutputStreamSuite.scala  |  4 +-
 .../util/random/RandomSamplerSuite.scala      |  6 ++-
 .../util/random/SamplingUtilsSuite.scala      |  5 +-
 .../util/random/XORShiftRandomSuite.scala     |  4 +-
 external/flume-sink/pom.xml                   |  7 +++
 .../streaming/flume/sink/SparkSinkSuite.scala |  5 +-
 external/flume/pom.xml                        |  7 +++
 .../flume/FlumePollingStreamSuite.scala       |  6 +--
 .../streaming/flume/FlumeStreamSuite.scala    |  6 +--
 external/kafka/pom.xml                        |  7 +++
 .../kafka/DirectKafkaStreamSuite.scala        |  6 +--
 .../streaming/kafka/KafkaClusterSuite.scala   |  6 ++-
 .../spark/streaming/kafka/KafkaRDDSuite.scala |  4 +-
 .../streaming/kafka/KafkaStreamSuite.scala    |  6 +--
 .../kafka/ReliableKafkaStreamSuite.scala      |  6 +--
 external/mqtt/pom.xml                         |  7 +++
 .../streaming/mqtt/MQTTStreamSuite.scala      |  6 +--
 external/twitter/pom.xml                      |  7 +++
 .../twitter/TwitterStreamSuite.scala          |  6 +--
 external/zeromq/pom.xml                       |  7 +++
 .../streaming/zeromq/ZeroMQStreamSuite.scala  |  4 +-
 graphx/pom.xml                                |  7 +++
 .../apache/spark/graphx/EdgeRDDSuite.scala    |  5 +-
 .../org/apache/spark/graphx/EdgeSuite.scala   |  4 +-
 .../apache/spark/graphx/GraphOpsSuite.scala   |  5 +-
 .../org/apache/spark/graphx/GraphSuite.scala  |  6 +--
 .../org/apache/spark/graphx/PregelSuite.scala |  6 +--
 .../apache/spark/graphx/VertexRDDSuite.scala  |  6 +--
 .../graphx/impl/EdgePartitionSuite.scala      |  6 +--
 .../graphx/impl/VertexPartitionSuite.scala    |  6 +--
 .../graphx/lib/ConnectedComponentsSuite.scala |  6 +--
 .../graphx/lib/LabelPropagationSuite.scala    |  5 +-
 .../spark/graphx/lib/PageRankSuite.scala      |  5 +-
 .../spark/graphx/lib/SVDPlusPlusSuite.scala   |  5 +-
 .../spark/graphx/lib/ShortestPathsSuite.scala |  6 +--
 .../StronglyConnectedComponentsSuite.scala    |  6 +--
 .../spark/graphx/lib/TriangleCountSuite.scala |  5 +-
 .../graphx/util/BytecodeUtilsSuite.scala      |  4 +-
 .../graphx/util/GraphGeneratorsSuite.scala    |  5 +-
 .../spark/ml/util/IdentifiableSuite.scala     |  4 +-
 .../org/apache/spark/ml/PipelineSuite.scala   |  4 +-
 .../ml/attribute/AttributeGroupSuite.scala    |  4 +-
 .../spark/ml/attribute/AttributeSuite.scala   |  5 +-
 .../DecisionTreeClassifierSuite.scala         |  7 ++-
 .../classification/GBTClassifierSuite.scala   |  5 +-
 .../LogisticRegressionSuite.scala             |  5 +-
 .../ml/classification/OneVsRestSuite.scala    |  5 +-
 .../RandomForestClassifierSuite.scala         |  5 +-
 .../evaluation/RegressionEvaluatorSuite.scala |  5 +-
 .../spark/ml/feature/BinarizerSuite.scala     |  5 +-
 .../spark/ml/feature/BucketizerSuite.scala    |  8 ++--
 .../spark/ml/feature/HashingTFSuite.scala     |  5 +-
 .../apache/spark/ml/feature/IDFSuite.scala    |  5 +-
 .../spark/ml/feature/NormalizerSuite.scala    |  5 +-
 .../spark/ml/feature/OneHotEncoderSuite.scala |  5 +-
 .../ml/feature/PolynomialExpansionSuite.scala |  4 +-
 .../spark/ml/feature/StringIndexerSuite.scala |  5 +-
 .../spark/ml/feature/TokenizerSuite.scala     |  7 ++-
 .../ml/feature/VectorAssemblerSuite.scala     |  6 +--
 .../spark/ml/feature/VectorIndexerSuite.scala |  6 +--
 .../spark/ml/feature/Word2VecSuite.scala      |  5 +-
 .../org/apache/spark/ml/impl/TreeTests.scala  |  5 +-
 .../apache/spark/ml/param/ParamsSuite.scala   |  6 +--
 .../ml/param/shared/SharedParamsSuite.scala   |  5 +-
 .../spark/ml/recommendation/ALSSuite.scala    |  5 +-
 .../DecisionTreeRegressorSuite.scala          |  7 ++-
 .../ml/regression/GBTRegressorSuite.scala     |  5 +-
 .../ml/regression/LinearRegressionSuite.scala |  5 +-
 .../RandomForestRegressorSuite.scala          |  7 ++-
 .../spark/ml/tuning/CrossValidatorSuite.scala |  4 +-
 .../ml/tuning/ParamGridBuilderSuite.scala     |  5 +-
 .../api/python/PythonMLLibAPISuite.scala      |  5 +-
 .../LogisticRegressionSuite.scala             |  6 +--
 .../classification/NaiveBayesSuite.scala      |  7 ++-
 .../spark/mllib/classification/SVMSuite.scala |  7 ++-
 .../StreamingLogisticRegressionSuite.scala    |  5 +-
 .../clustering/GaussianMixtureSuite.scala     |  5 +-
 .../spark/mllib/clustering/KMeansSuite.scala  |  9 ++--
 .../spark/mllib/clustering/LDASuite.scala     |  5 +-
 .../PowerIterationClusteringSuite.scala       |  8 ++--
 .../clustering/StreamingKMeansSuite.scala     |  5 +-
 .../evaluation/AreaUnderCurveSuite.scala      |  5 +-
 .../BinaryClassificationMetricsSuite.scala    |  5 +-
 .../evaluation/MulticlassMetricsSuite.scala   |  5 +-
 .../evaluation/MultilabelMetricsSuite.scala   |  5 +-
 .../evaluation/RankingMetricsSuite.scala      |  5 +-
 .../evaluation/RegressionMetricsSuite.scala   |  5 +-
 .../mllib/feature/ChiSqSelectorSuite.scala    |  5 +-
 .../feature/ElementwiseProductSuite.scala     |  5 +-
 .../spark/mllib/feature/HashingTFSuite.scala  |  5 +-
 .../apache/spark/mllib/feature/IDFSuite.scala |  5 +-
 .../spark/mllib/feature/NormalizerSuite.scala |  5 +-
 .../apache/spark/mllib/feature/PCASuite.scala |  5 +-
 .../mllib/feature/StandardScalerSuite.scala   |  5 +-
 .../spark/mllib/feature/Word2VecSuite.scala   |  5 +-
 .../spark/mllib/fpm/FPGrowthSuite.scala       |  5 +-
 .../apache/spark/mllib/fpm/FPTreeSuite.scala  |  5 +-
 .../impl/PeriodicGraphCheckpointerSuite.scala |  6 +--
 .../apache/spark/mllib/linalg/BLASSuite.scala |  5 +-
 .../linalg/BreezeMatrixConversionSuite.scala  |  6 +--
 .../linalg/BreezeVectorConversionSuite.scala  |  6 +--
 .../spark/mllib/linalg/MatricesSuite.scala    |  4 +-
 .../spark/mllib/linalg/VectorsSuite.scala     |  5 +-
 .../linalg/distributed/BlockMatrixSuite.scala |  5 +-
 .../distributed/CoordinateMatrixSuite.scala   |  5 +-
 .../distributed/IndexedRowMatrixSuite.scala   |  5 +-
 .../linalg/distributed/RowMatrixSuite.scala   |  6 +--
 .../optimization/GradientDescentSuite.scala   |  7 +--
 .../spark/mllib/optimization/LBFGSSuite.scala |  7 +--
 .../spark/mllib/optimization/NNLSSuite.scala  |  5 +-
 ...ryClassificationPMMLModelExportSuite.scala |  4 +-
 ...eneralizedLinearPMMLModelExportSuite.scala |  4 +-
 .../export/KMeansPMMLModelExportSuite.scala   |  4 +-
 .../export/PMMLModelExportFactorySuite.scala  |  5 +-
 .../random/RandomDataGeneratorSuite.scala     |  5 +-
 .../spark/mllib/random/RandomRDDsSuite.scala  |  5 +-
 .../mllib/rdd/MLPairRDDFunctionsSuite.scala   |  5 +-
 .../spark/mllib/rdd/RDDFunctionsSuite.scala   |  5 +-
 .../spark/mllib/recommendation/ALSSuite.scala |  4 +-
 .../MatrixFactorizationModelSuite.scala       |  5 +-
 .../regression/IsotonicRegressionSuite.scala  |  5 +-
 .../mllib/regression/LabeledPointSuite.scala  |  5 +-
 .../spark/mllib/regression/LassoSuite.scala   |  7 ++-
 .../regression/LinearRegressionSuite.scala    |  7 ++-
 .../regression/RidgeRegressionSuite.scala     |  6 +--
 .../StreamingLinearRegressionSuite.scala      |  5 +-
 .../spark/mllib/stat/CorrelationSuite.scala   |  5 +-
 .../mllib/stat/HypothesisTestSuite.scala      |  6 +--
 .../spark/mllib/stat/KernelDensitySuite.scala |  4 +-
 .../MultivariateOnlineSummarizerSuite.scala   |  5 +-
 .../MultivariateGaussianSuite.scala           |  5 +-
 .../spark/mllib/tree/DecisionTreeSuite.scala  |  7 ++-
 .../tree/GradientBoostedTreesSuite.scala      |  5 +-
 .../spark/mllib/tree/ImpuritySuite.scala      |  5 +-
 .../spark/mllib/tree/RandomForestSuite.scala  |  5 +-
 .../mllib/tree/impl/BaggedPointSuite.scala    |  5 +-
 .../spark/mllib/util/MLUtilsSuite.scala       |  5 +-
 .../spark/mllib/util/NumericParserSuite.scala |  6 +--
 .../spark/mllib/util/TestingUtilsSuite.scala  |  4 +-
 repl/pom.xml                                  |  7 +++
 .../org/apache/spark/repl/ReplSuite.scala     |  5 +-
 .../org/apache/spark/repl/ReplSuite.scala     |  5 +-
 .../spark/repl/ExecutorClassLoaderSuite.scala |  3 +-
 sql/catalyst/pom.xml                          |  7 +++
 .../sql/catalyst/DistributionSuite.scala      |  5 +-
 .../sql/catalyst/ScalaReflectionSuite.scala   |  5 +-
 .../spark/sql/catalyst/SqlParserSuite.scala   |  4 +-
 .../sql/catalyst/analysis/AnalysisSuite.scala |  5 +-
 .../analysis/DecimalPrecisionSuite.scala      |  5 +-
 .../expressions/AttributeSetSuite.scala       |  5 +-
 .../ExpressionEvaluationSuite.scala           |  4 +-
 .../UnsafeFixedWidthAggregationMapSuite.scala |  8 +++-
 .../expressions/UnsafeRowConverterSuite.scala |  5 +-
 .../spark/sql/catalyst/plans/PlanTest.scala   |  5 +-
 .../sql/catalyst/plans/SameResultSuite.scala  |  5 +-
 .../catalyst/trees/RuleExecutorSuite.scala    |  5 +-
 .../sql/catalyst/trees/TreeNodeSuite.scala    |  5 +-
 .../sql/catalyst/util/MetadataSuite.scala     |  4 +-
 .../spark/sql/types/DataTypeParserSuite.scala |  4 +-
 .../spark/sql/types/DataTypeSuite.scala       |  5 +-
 .../spark/sql/types/UTF8StringSuite.scala     |  4 +-
 .../sql/types/decimal/DecimalSuite.scala      |  5 +-
 .../apache/spark/sql/DataFrameStatSuite.scala |  4 +-
 .../spark/sql/MathExpressionsSuite.scala      |  2 +-
 .../scala/org/apache/spark/sql/RowSuite.scala |  4 +-
 .../org/apache/spark/sql/SQLConfSuite.scala   |  5 +-
 .../apache/spark/sql/SQLContextSuite.scala    |  5 +-
 .../sql/ScalaReflectionRelationSuite.scala    |  5 +-
 .../apache/spark/sql/SerializationSuite.scala |  6 +--
 .../spark/sql/columnar/ColumnStatsSuite.scala |  5 +-
 .../spark/sql/columnar/ColumnTypeSuite.scala  |  5 +-
 .../NullableColumnAccessorSuite.scala         |  5 +-
 .../columnar/NullableColumnBuilderSuite.scala |  5 +-
 .../columnar/PartitionBatchPruningSuite.scala |  5 +-
 .../compression/BooleanBitSetSuite.scala      |  5 +-
 .../compression/DictionaryEncodingSuite.scala |  5 +-
 .../compression/IntegralDeltaSuite.scala      |  5 +-
 .../compression/RunLengthEncodingSuite.scala  |  5 +-
 .../spark/sql/execution/PlannerSuite.scala    |  5 +-
 .../execution/SparkSqlSerializer2Suite.scala  |  6 +--
 .../sql/execution/debug/DebuggingSuite.scala  |  5 +-
 .../execution/joins/HashedRelationSuite.scala |  5 +-
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala |  5 +-
 .../spark/sql/jdbc/JDBCWriteSuite.scala       |  5 +-
 .../sql/parquet/ParquetSchemaSuite.scala      |  4 +-
 .../sql/sources/ResolvedDataSourceSuite.scala |  4 +-
 sql/hive-thriftserver/pom.xml                 |  7 +++
 .../sql/hive/thriftserver/CliSuite.scala      |  6 +--
 .../HiveThriftServer2Suites.scala             |  6 +--
 sql/hive/pom.xml                              |  7 +++
 .../spark/sql/hive/HiveInspectorSuite.scala   |  4 +-
 .../sql/hive/HiveMetastoreCatalogSuite.scala  |  4 +-
 .../apache/spark/sql/hive/HiveQlSuite.scala   |  5 +-
 .../spark/sql/hive/SerializationSuite.scala   |  6 +--
 .../spark/sql/hive/client/VersionsSuite.scala |  5 +-
 .../hive/execution/ConcurrentHiveSuite.scala  |  6 +--
 .../hive/execution/HiveComparisonTest.scala   |  6 +--
 .../hive/orc/OrcPartitionDiscoverySuite.scala |  5 +-
 .../spark/sql/hive/orc/OrcQuerySuite.scala    |  5 +-
 .../sql/sources/hadoopFsRelationSuites.scala  |  5 +-
 streaming/pom.xml                             |  7 +++
 .../spark/streaming/DStreamClosureSuite.scala |  6 +--
 .../spark/streaming/DStreamScopeSuite.scala   |  6 +--
 .../streaming/ReceivedBlockHandlerSuite.scala |  8 +++-
 .../streaming/ReceivedBlockTrackerSuite.scala |  6 +--
 .../streaming/StreamingContextSuite.scala     |  6 +--
 .../spark/streaming/TestSuiteBase.scala       |  6 +--
 .../spark/streaming/UISeleniumSuite.scala     |  2 +-
 .../WriteAheadLogBackedBlockRDDSuite.scala    |  6 +--
 .../scheduler/InputInfoTrackerSuite.scala     |  6 +--
 .../spark/streaming/ui/UIUtilsSuite.scala     |  5 +-
 .../util/RateLimitedOutputStreamSuite.scala   |  4 +-
 .../streaming/util/WriteAheadLogSuite.scala   |  6 +--
 yarn/pom.xml                                  |  7 +++
 .../ClientDistributedCacheManagerSuite.scala  |  5 +-
 .../spark/deploy/yarn/ClientSuite.scala       |  6 +--
 .../deploy/yarn/YarnAllocatorSuite.scala      |  6 +--
 .../spark/deploy/yarn/YarnClusterSuite.scala  |  6 +--
 .../yarn/YarnSparkHadoopUtilSuite.scala       |  6 +--
 364 files changed, 953 insertions(+), 968 deletions(-)
 create mode 100644 core/src/test/scala/org/apache/spark/SparkFunSuite.scala

diff --git a/bagel/pom.xml b/bagel/pom.xml
index 1f3dec91314f2..132cd433d78a2 100644
--- a/bagel/pom.xml
+++ b/bagel/pom.xml
@@ -40,6 +40,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.scalacheck</groupId>
       <artifactId>scalacheck_${scala.binary.version}</artifactId>
diff --git a/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala b/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala
index ccb262a4ee02a..fb10d734ac74b 100644
--- a/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala
+++ b/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.bagel
 
-import org.scalatest.{BeforeAndAfter, FunSuite, Assertions}
+import org.scalatest.{BeforeAndAfter, Assertions}
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.time.SpanSugar._
 
@@ -27,7 +27,7 @@ import org.apache.spark.storage.StorageLevel
 class TestVertex(val active: Boolean, val age: Int) extends Vertex with Serializable
 class TestMessage(val targetId: String) extends Message[String] with Serializable
 
-class BagelSuite extends FunSuite with Assertions with BeforeAndAfter with Timeouts {
+class BagelSuite extends SparkFunSuite with Assertions with BeforeAndAfter with Timeouts {
 
   var sc: SparkContext = _
 
diff --git a/core/pom.xml b/core/pom.xml
index e58efe495e36d..5c02be831ce06 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -338,6 +338,12 @@
     <dependency>
       <groupId>org.seleniumhq.selenium</groupId>
       <artifactId>selenium-java</artifactId>
+      <exclusions>
+        <exclusion>
+          <groupId>com.google.guava</groupId>
+          <artifactId>guava</artifactId>
+        </exclusion>
+      </exclusions>
       <scope>test</scope>
     </dependency>
     <!-- Added for selenium: -->
diff --git a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala
index 746a40a21bf9e..e942d6579b2fd 100644
--- a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala
@@ -20,11 +20,10 @@ package org.apache.spark
 import scala.collection.mutable
 import scala.ref.WeakReference
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
 
-class AccumulatorSuite extends FunSuite with Matchers with LocalSparkContext {
+class AccumulatorSuite extends SparkFunSuite with Matchers with LocalSparkContext {
 
 
   implicit def setAccum[A]: AccumulableParam[mutable.Set[A], A] =
diff --git a/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala b/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala
index 668ddf9f5f0a9..af81e46a657d3 100644
--- a/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark
 
 import org.mockito.Mockito._
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 import org.scalatest.mock.MockitoSugar
 
 import org.apache.spark.executor.DataReadMethod
@@ -26,7 +26,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.storage._
 
 // TODO: Test the CacheManager's thread-safety aspects
-class CacheManagerSuite extends FunSuite with LocalSparkContext with BeforeAndAfter
+class CacheManagerSuite extends SparkFunSuite with LocalSparkContext with BeforeAndAfter
   with MockitoSugar {
 
   var blockManager: BlockManager = _
diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
index 91d8fdedbe0f3..d1761a48babbc 100644
--- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
+++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
@@ -21,13 +21,11 @@ import java.io.File
 
 import scala.reflect.ClassTag
 
-import org.scalatest.FunSuite
-
 import org.apache.spark.rdd._
 import org.apache.spark.storage.{BlockId, StorageLevel, TestBlockId}
 import org.apache.spark.util.Utils
 
-class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
+class CheckpointSuite extends SparkFunSuite with LocalSparkContext with Logging {
   var checkpointDir: File = _
   val partitioner = new HashPartitioner(2)
 
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 4a48f6580c78e..501fe186bfd7c 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -23,7 +23,7 @@ import scala.collection.mutable.{HashSet, SynchronizedSet}
 import scala.language.existentials
 import scala.util.Random
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 import org.scalatest.concurrent.{PatienceConfiguration, Eventually}
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.time.SpanSugar._
@@ -44,7 +44,7 @@ import org.apache.spark.storage.ShuffleIndexBlockId
  * config options, in particular, a different shuffle manager class
  */
 abstract class ContextCleanerSuiteBase(val shuffleManager: Class[_] = classOf[HashShuffleManager])
-  extends FunSuite with BeforeAndAfter with LocalSparkContext
+  extends SparkFunSuite with BeforeAndAfter with LocalSparkContext
 {
   implicit val defaultTimeout = timeout(10000 millis)
   val conf = new SparkConf()
diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala
index 96a9c207ad022..9c191ed52206d 100644
--- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala
+++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark
 
-import org.scalatest.FunSuite
 import org.scalatest.concurrent.Timeouts._
 import org.scalatest.Matchers
 import org.scalatest.time.{Millis, Span}
@@ -28,7 +27,7 @@ class NotSerializableClass
 class NotSerializableExn(val notSer: NotSerializableClass) extends Throwable() {}
 
 
-class DistributedSuite extends FunSuite with Matchers with LocalSparkContext {
+class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContext {
 
   val clusterUrl = "local-cluster[2,1,512]"
 
diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala
index c42dfbc82ada4..b2262033ca238 100644
--- a/core/src/test/scala/org/apache/spark/DriverSuite.scala
+++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala
@@ -19,14 +19,13 @@ package org.apache.spark
 
 import java.io.File
 
-import org.scalatest.FunSuite
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.prop.TableDrivenPropertyChecks._
 import org.scalatest.time.SpanSugar._
 
 import org.apache.spark.util.Utils
 
-class DriverSuite extends FunSuite with Timeouts {
+class DriverSuite extends SparkFunSuite with Timeouts {
 
   ignore("driver should exit after finishing without cleanup (SPARK-530)") {
     val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
index 84f787ee3715d..1c2b681f0b843 100644
--- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark
 
 import scala.collection.mutable
 
-import org.scalatest.{BeforeAndAfter, FunSuite, PrivateMethodTester}
+import org.scalatest.{BeforeAndAfter, PrivateMethodTester}
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.scheduler._
 import org.apache.spark.scheduler.cluster.ExecutorInfo
@@ -28,7 +28,11 @@ import org.apache.spark.util.ManualClock
 /**
  * Test add and remove behavior of ExecutorAllocationManager.
  */
-class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext with BeforeAndAfter {
+class ExecutorAllocationManagerSuite
+  extends SparkFunSuite
+  with LocalSparkContext
+  with BeforeAndAfter {
+
   import ExecutorAllocationManager._
   import ExecutorAllocationManagerSuite._
 
diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala
index cade1fda2c7be..b18067e68f5a1 100644
--- a/core/src/test/scala/org/apache/spark/FailureSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark
 
-import org.scalatest.FunSuite
-
 import org.apache.spark.util.NonSerializable
 
 import java.io.NotSerializableException
@@ -38,7 +36,7 @@ object FailureSuiteState {
   }
 }
 
-class FailureSuite extends FunSuite with LocalSparkContext {
+class FailureSuite extends SparkFunSuite with LocalSparkContext {
 
   // Run a 3-task map job in which task 1 deterministically fails once, and check
   // whether the job completes successfully and we ran 4 tasks in total.
diff --git a/core/src/test/scala/org/apache/spark/FileServerSuite.scala b/core/src/test/scala/org/apache/spark/FileServerSuite.scala
index bff2d10b9946c..6e65b0a8f6c76 100644
--- a/core/src/test/scala/org/apache/spark/FileServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FileServerSuite.scala
@@ -24,13 +24,12 @@ import javax.net.ssl.SSLException
 
 import com.google.common.io.{ByteStreams, Files}
 import org.apache.commons.lang3.RandomUtils
-import org.scalatest.FunSuite
 
 import org.apache.spark.util.Utils
 
 import SSLSampleConfigs._
 
-class FileServerSuite extends FunSuite with LocalSparkContext {
+class FileServerSuite extends SparkFunSuite with LocalSparkContext {
 
   @transient var tmpDir: File = _
   @transient var tmpFile: File = _
diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala
index d67de8692df62..1d8fade90f398 100644
--- a/core/src/test/scala/org/apache/spark/FileSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FileSuite.scala
@@ -30,12 +30,11 @@ import org.apache.hadoop.mapred.{JobConf, FileAlreadyExistsException, FileSplit,
 import org.apache.hadoop.mapreduce.Job
 import org.apache.hadoop.mapreduce.lib.input.{FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat}
 import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
-import org.scalatest.FunSuite
 
 import org.apache.spark.rdd.{NewHadoopRDD, HadoopRDD}
 import org.apache.spark.util.Utils
 
-class FileSuite extends FunSuite with LocalSparkContext {
+class FileSuite extends SparkFunSuite with LocalSparkContext {
   var tempDir: File = _
 
   override def beforeEach() {
diff --git a/core/src/test/scala/org/apache/spark/FutureActionSuite.scala b/core/src/test/scala/org/apache/spark/FutureActionSuite.scala
index f5cdb01ec9504..1102aea96b548 100644
--- a/core/src/test/scala/org/apache/spark/FutureActionSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FutureActionSuite.scala
@@ -20,10 +20,14 @@ package org.apache.spark
 import scala.concurrent.Await
 import scala.concurrent.duration.Duration
 
-import org.scalatest.{BeforeAndAfter, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfter, Matchers}
 
 
-class FutureActionSuite extends FunSuite with BeforeAndAfter with Matchers with LocalSparkContext {
+class FutureActionSuite
+  extends SparkFunSuite
+  with BeforeAndAfter
+  with Matchers
+  with LocalSparkContext {
 
   before {
     sc = new SparkContext("local", "FutureActionSuite")
diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
index b789912e9ebef..911b3bddd1836 100644
--- a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
+++ b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
@@ -22,7 +22,6 @@ import scala.language.postfixOps
 
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.storage.BlockManagerId
-import org.scalatest.FunSuite
 import org.mockito.Mockito.{mock, spy, verify, when}
 import org.mockito.Matchers
 import org.mockito.Matchers._
@@ -31,7 +30,7 @@ import org.apache.spark.scheduler.TaskScheduler
 import org.apache.spark.util.RpcUtils
 import org.scalatest.concurrent.Eventually._
 
-class HeartbeatReceiverSuite extends FunSuite with LocalSparkContext {
+class HeartbeatReceiverSuite extends SparkFunSuite with LocalSparkContext {
 
   test("HeartbeatReceiver") {
     sc = spy(new SparkContext("local[2]", "test"))
diff --git a/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala b/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
index 69314deda1f03..e47173f8a8b03 100644
--- a/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark
 
-import org.scalatest.FunSuite
-
 import org.apache.spark.rdd.RDD
 
-class ImplicitOrderingSuite extends FunSuite with LocalSparkContext {
+class ImplicitOrderingSuite extends SparkFunSuite with LocalSparkContext {
   // Tests that PairRDDFunctions grabs an implicit Ordering in various cases where it should.
   test("basic inference of Orderings"){
     sc = new SparkContext("local", "test")
diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala
index ae17fc60e4a43..340a9e327107e 100644
--- a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala
@@ -24,7 +24,7 @@ import scala.concurrent.ExecutionContext.Implicits.global
 import scala.concurrent.duration._
 import scala.concurrent.future
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 import org.scalatest.Matchers
 
 import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
@@ -34,7 +34,7 @@ import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
  * (e.g. count) as well as multi-job action (e.g. take). We test the local and cluster schedulers
  * in both FIFO and fair scheduling modes.
  */
-class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter
+class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAfter
   with LocalSparkContext {
 
   override def afterEach() {
diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
index 6ed057a7cab97..1fab69678d040 100644
--- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
@@ -19,14 +19,13 @@ package org.apache.spark
 
 import org.mockito.Mockito._
 import org.mockito.Matchers.{any, isA}
-import org.scalatest.FunSuite
 
 import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcCallContext, RpcEnv}
 import org.apache.spark.scheduler.{CompressedMapStatus, MapStatus}
 import org.apache.spark.shuffle.FetchFailedException
 import org.apache.spark.storage.BlockManagerId
 
-class MapOutputTrackerSuite extends FunSuite {
+class MapOutputTrackerSuite extends SparkFunSuite {
   private val conf = new SparkConf
 
   def createRpcEnv(name: String, host: String = "localhost", port: Int = 0,
diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
index 47e3bf6e1ac41..3316f561a4949 100644
--- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
+++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -20,12 +20,12 @@ package org.apache.spark
 import scala.collection.mutable.ArrayBuffer
 import scala.math.abs
 
-import org.scalatest.{FunSuite, PrivateMethodTester}
+import org.scalatest.PrivateMethodTester
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.StatCounter
 
-class PartitioningSuite extends FunSuite with SharedSparkContext with PrivateMethodTester {
+class PartitioningSuite extends SparkFunSuite with SharedSparkContext with PrivateMethodTester {
 
   test("HashPartitioner equality") {
     val p2 = new HashPartitioner(2)
diff --git a/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala b/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala
index 93f46ef11c0e2..376481ba541fa 100644
--- a/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala
@@ -21,9 +21,9 @@ import java.io.File
 
 import com.google.common.io.Files
 import org.apache.spark.util.Utils
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
-class SSLOptionsSuite extends FunSuite with BeforeAndAfterAll {
+class SSLOptionsSuite extends SparkFunSuite with BeforeAndAfterAll {
 
   test("test resolving property file as spark conf ") {
     val keyStorePath = new File(this.getClass.getResource("/keystore").toURI).getAbsolutePath
diff --git a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala
index 61571be44252a..e9b64aa82a17a 100644
--- a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala
@@ -19,11 +19,9 @@ package org.apache.spark
 
 import java.io.File
 
-import org.scalatest.FunSuite
-
 import org.apache.spark.util.Utils
 
-class SecurityManagerSuite extends FunSuite {
+class SecurityManagerSuite extends SparkFunSuite {
 
   test("set security with conf") {
     val conf = new SparkConf
diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
index d7180516029d5..91f4ab360857e 100644
--- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
 import org.apache.spark.ShuffleSuite.NonJavaSerializableClass
@@ -26,7 +25,7 @@ import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.storage.{ShuffleDataBlockId, ShuffleBlockId}
 import org.apache.spark.util.MutablePair
 
-abstract class ShuffleSuite extends FunSuite with Matchers with LocalSparkContext {
+abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkContext {
 
   val conf = new SparkConf(loadDefaults = false)
 
diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
index fafc9d47503b7..9fbaeb33f97cd 100644
--- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala
@@ -23,13 +23,12 @@ import scala.concurrent.duration._
 import scala.language.postfixOps
 import scala.util.{Try, Random}
 
-import org.scalatest.FunSuite
 import org.apache.spark.network.util.ByteUnit
 import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
 import org.apache.spark.util.{RpcUtils, ResetSystemProperties}
 import com.esotericsoftware.kryo.Kryo
 
-class SparkConfSuite extends FunSuite with LocalSparkContext with ResetSystemProperties {
+class SparkConfSuite extends SparkFunSuite with LocalSparkContext with ResetSystemProperties {
   test("Test byteString conversion") {
     val conf = new SparkConf()
     // Simply exercise the API, we don't need a complete conversion test since that's handled in
diff --git a/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala
index e6ab538d77bcc..2bdbd70c638a5 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark
 
-import org.scalatest.{Assertions, FunSuite}
+import org.scalatest.Assertions
 import org.apache.spark.storage.StorageLevel
 
-class SparkContextInfoSuite extends FunSuite with LocalSparkContext {
+class SparkContextInfoSuite extends SparkFunSuite with LocalSparkContext {
   test("getPersistentRDDs only returns RDDs that are marked as cached") {
     sc = new SparkContext("local", "test")
     assert(sc.getPersistentRDDs.isEmpty === true)
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
index 9343f4fff89da..f89e3d0a49920 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark
 
-import org.scalatest.{FunSuite, PrivateMethodTester}
+import org.scalatest.PrivateMethodTester
 
 import org.apache.spark.scheduler.{SchedulerBackend, TaskScheduler, TaskSchedulerImpl}
 import org.apache.spark.scheduler.cluster.{SimrSchedulerBackend, SparkDeploySchedulerBackend}
@@ -25,7 +25,7 @@ import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, Me
 import org.apache.spark.scheduler.local.LocalBackend
 
 class SparkContextSchedulerCreationSuite
-  extends FunSuite with LocalSparkContext with PrivateMethodTester with Logging {
+  extends SparkFunSuite with LocalSparkContext with PrivateMethodTester with Logging {
 
   def createTaskScheduler(master: String): TaskSchedulerImpl =
     createTaskScheduler(master, new SparkConf())
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
index 31ef5cd75bd4a..93426822f704e 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
@@ -23,8 +23,6 @@ import java.util.concurrent.TimeUnit
 import com.google.common.base.Charsets._
 import com.google.common.io.Files
 
-import org.scalatest.FunSuite
-
 import org.apache.hadoop.io.{BytesWritable, LongWritable, Text}
 import org.apache.hadoop.mapred.TextInputFormat
 import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat}
@@ -33,7 +31,7 @@ import org.apache.spark.util.Utils
 import scala.concurrent.Await
 import scala.concurrent.duration.Duration
 
-class SparkContextSuite extends FunSuite with LocalSparkContext {
+class SparkContextSuite extends SparkFunSuite with LocalSparkContext {
 
   test("Only one SparkContext may be active at a time") {
     // Regression test for SPARK-4180
diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
new file mode 100644
index 0000000000000..0327dfad6ea51
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import org.scalatest.{FunSuite, Outcome}
+
+/**
+ * Base abstract class for all unit tests in Spark for handling common functionality.
+ */
+private[spark] abstract class SparkFunSuite extends FunSuite with Logging {
+
+  /**
+   * Log the suite name and the test name before and after each test.
+   *
+   * Subclasses should never override this method. If they wish to run
+   * custom code before and after each test, they should should mix in
+   * the {{org.scalatest.BeforeAndAfter}} trait instead.
+   */
+  final protected override def withFixture(test: NoArgTest): Outcome = {
+    val testName = test.text
+    val suiteName = this.getClass.getName
+    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
+    try {
+      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
+      test()
+    } finally {
+      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
+    }
+  }
+
+}
diff --git a/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala b/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala
index 084eb237d70d1..46516e8d25298 100644
--- a/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala
@@ -21,12 +21,12 @@ import scala.concurrent.duration._
 import scala.language.implicitConversions
 import scala.language.postfixOps
 
-import org.scalatest.{Matchers, FunSuite}
+import org.scalatest.Matchers
 import org.scalatest.concurrent.Eventually._
 
 import org.apache.spark.JobExecutionStatus._
 
-class StatusTrackerSuite extends FunSuite with Matchers with LocalSparkContext {
+class StatusTrackerSuite extends SparkFunSuite with Matchers with LocalSparkContext {
 
   test("basic status API usage") {
     sc = new SparkContext("local", "test", new SparkConf(false))
diff --git a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala
index 10917c866cc7d..6580139df6c60 100644
--- a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala
@@ -22,7 +22,6 @@ import java.util.concurrent.atomic.AtomicBoolean
 import java.util.concurrent.atomic.AtomicInteger
 
 import org.apache.spark.scheduler._
-import org.scalatest.FunSuite
 
 /**
  * Holds state shared across task threads in some ThreadingSuite tests.
@@ -37,7 +36,7 @@ object ThreadingSuiteState {
   }
 }
 
-class ThreadingSuite extends FunSuite with LocalSparkContext {
+class ThreadingSuite extends SparkFunSuite with LocalSparkContext {
 
   test("accessing SparkContext form a different thread") {
     sc = new SparkContext("local", "test")
diff --git a/core/src/test/scala/org/apache/spark/UnpersistSuite.scala b/core/src/test/scala/org/apache/spark/UnpersistSuite.scala
index 42ff059e018a3..f7a13ab3996d8 100644
--- a/core/src/test/scala/org/apache/spark/UnpersistSuite.scala
+++ b/core/src/test/scala/org/apache/spark/UnpersistSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark
 
-import org.scalatest.FunSuite
 import org.scalatest.concurrent.Timeouts._
 import org.scalatest.time.{Millis, Span}
 
-class UnpersistSuite extends FunSuite with LocalSparkContext {
+class UnpersistSuite extends SparkFunSuite with LocalSparkContext {
   test("unpersist RDD") {
     sc = new SparkContext("local", "test")
     val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache()
diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala
index 8959a843dbd7d..135c56bf5bc9d 100644
--- a/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala
+++ b/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala
@@ -21,15 +21,15 @@ import scala.io.Source
 
 import java.io.{PrintWriter, File}
 
-import org.scalatest.{Matchers, FunSuite}
+import org.scalatest.Matchers
 
-import org.apache.spark.{SharedSparkContext, SparkConf}
+import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.util.Utils
 
 // This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
 // a PythonBroadcast:
-class PythonBroadcastSuite extends FunSuite with Matchers with SharedSparkContext {
+class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
   test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
     val tempDir = Utils.createTempDir()
     val broadcastedString = "Hello, world!"
diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala
index c63d834f9048b..41f2a5c972b6b 100644
--- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala
@@ -19,9 +19,9 @@ package org.apache.spark.api.python
 
 import java.io.{ByteArrayOutputStream, DataOutputStream}
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class PythonRDDSuite extends FunSuite {
+class PythonRDDSuite extends SparkFunSuite {
 
   test("Writing large strings to the worker") {
     val input: List[String] = List("a"*100000)
diff --git a/core/src/test/scala/org/apache/spark/api/python/SerDeUtilSuite.scala b/core/src/test/scala/org/apache/spark/api/python/SerDeUtilSuite.scala
index f8c39326145e1..267a79fa63782 100644
--- a/core/src/test/scala/org/apache/spark/api/python/SerDeUtilSuite.scala
+++ b/core/src/test/scala/org/apache/spark/api/python/SerDeUtilSuite.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark.api.python
 
-import org.scalatest.FunSuite
+import org.apache.spark.{SharedSparkContext, SparkFunSuite}
 
-import org.apache.spark.SharedSparkContext
-
-class SerDeUtilSuite extends FunSuite with SharedSparkContext {
+class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {
 
   test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
     val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
index c38e306b6ac40..c05e8bb6538ba 100644
--- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
+++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
@@ -20,10 +20,10 @@ package org.apache.spark.broadcast
 import scala.concurrent.duration._
 import scala.util.Random
 
-import org.scalatest.{Assertions, FunSuite}
+import org.scalatest.Assertions
 import org.scalatest.concurrent.Eventually._
 
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkEnv}
+import org.apache.spark._
 import org.apache.spark.io.SnappyCompressionCodec
 import org.apache.spark.rdd.RDD
 import org.apache.spark.serializer.JavaSerializer
@@ -45,7 +45,7 @@ class DummyBroadcastClass(rdd: RDD[Int]) extends Serializable {
   }
 }
 
-class BroadcastSuite extends FunSuite with LocalSparkContext {
+class BroadcastSuite extends SparkFunSuite with LocalSparkContext {
 
   private val httpConf = broadcastConf("HttpBroadcastFactory")
   private val torrentConf = broadcastConf("TorrentBroadcastFactory")
diff --git a/core/src/test/scala/org/apache/spark/deploy/ClientSuite.scala b/core/src/test/scala/org/apache/spark/deploy/ClientSuite.scala
index 745f9eeee7536..6a99dbca64f4b 100644
--- a/core/src/test/scala/org/apache/spark/deploy/ClientSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/ClientSuite.scala
@@ -17,10 +17,11 @@
 
 package org.apache.spark.deploy
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
-class ClientSuite extends FunSuite with Matchers {
+import org.apache.spark.SparkFunSuite
+
+class ClientSuite extends SparkFunSuite with Matchers {
   test("correctly validates driver jar URL's") {
     ClientArguments.isValidJarUrl("http://someHost:8080/foo.jar") should be (true)
     ClientArguments.isValidJarUrl("https://someHost:8080/foo.jar") should be (true)
diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
index e04a79284175c..08529e0ef2806 100644
--- a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
@@ -23,14 +23,13 @@ import java.util.Date
 import com.fasterxml.jackson.core.JsonParseException
 import org.json4s._
 import org.json4s.jackson.JsonMethods
-import org.scalatest.FunSuite
 
 import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, WorkerStateResponse}
 import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, RecoveryState, WorkerInfo}
 import org.apache.spark.deploy.worker.{DriverRunner, ExecutorRunner}
-import org.apache.spark.{JsonTestUtils, SecurityManager, SparkConf}
+import org.apache.spark.{JsonTestUtils, SecurityManager, SparkConf, SparkFunSuite}
 
-class JsonProtocolSuite extends FunSuite with JsonTestUtils {
+class JsonProtocolSuite extends SparkFunSuite with JsonTestUtils {
 
   test("writeApplicationInfo") {
     val output = JsonProtocol.writeApplicationInfo(createAppInfo())
diff --git a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
index c93d16f8a1586..c215b0582889f 100644
--- a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
@@ -23,13 +23,11 @@ import scala.collection.JavaConversions._
 import scala.collection.mutable
 import scala.io.Source
 
-import org.scalatest.FunSuite
-
 import org.apache.spark.scheduler.cluster.ExecutorInfo
 import org.apache.spark.scheduler.{SparkListenerExecutorAdded, SparkListener}
-import org.apache.spark.{SparkConf, SparkContext, LocalSparkContext}
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
 
-class LogUrlsStandaloneSuite extends FunSuite with LocalSparkContext {
+class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext {
 
   /** Length of time to wait while draining listener events. */
   private val WAIT_TIMEOUT_MILLIS = 10000
diff --git a/core/src/test/scala/org/apache/spark/deploy/PythonRunnerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/PythonRunnerSuite.scala
index 80f2cc02516fe..473a2d7b2a258 100644
--- a/core/src/test/scala/org/apache/spark/deploy/PythonRunnerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/PythonRunnerSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.deploy
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.Utils
 
-class PythonRunnerSuite extends FunSuite {
+class PythonRunnerSuite extends SparkFunSuite {
 
   // Test formatting a single path to be added to the PYTHONPATH
   test("format path") {
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index ea9227a7e9af5..46369457f000a 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -23,7 +23,6 @@ import scala.collection.mutable.ArrayBuffer
 
 import com.google.common.base.Charsets.UTF_8
 import com.google.common.io.ByteStreams
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.time.SpanSugar._
@@ -35,7 +34,12 @@ import org.apache.spark.util.{ResetSystemProperties, Utils}
 
 // Note: this suite mixes in ResetSystemProperties because SparkSubmit.main() sets a bunch
 // of properties that neeed to be cleared after tests.
-class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties with Timeouts {
+class SparkSubmitSuite
+  extends SparkFunSuite
+  with Matchers
+  with ResetSystemProperties
+  with Timeouts {
+
   def beforeAll() {
     System.setProperty("spark.testing", "true")
   }
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index 088ca3cb93b49..8fda5c8b472c9 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -20,15 +20,16 @@ package org.apache.spark.deploy
 import java.io.{File, PrintStream, OutputStream}
 
 import scala.collection.mutable.ArrayBuffer
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
 import org.apache.ivy.core.module.descriptor.MDArtifact
 import org.apache.ivy.core.settings.IvySettings
 import org.apache.ivy.plugins.resolver.IBiblioResolver
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
 
-class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll {
+class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
 
   private val noOpOutputStream = new OutputStream {
     def write(b: Int) = {}
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
index a0a0afa48833e..0f6933df9e6bc 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
@@ -25,15 +25,15 @@ import scala.io.Source
 
 import org.apache.hadoop.fs.Path
 import org.json4s.jackson.JsonMethods._
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 import org.scalatest.Matchers
 
-import org.apache.spark.{Logging, SparkConf}
+import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
 import org.apache.spark.io._
 import org.apache.spark.scheduler._
 import org.apache.spark.util.{JsonProtocol, ManualClock, Utils}
 
-class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers with Logging {
+class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging {
 
   private var testDir: File = null
 
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
index e10dd4cf837aa..14f2d1a5894b8 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
@@ -22,10 +22,10 @@ import javax.servlet.http.{HttpServletRequest, HttpServletResponse}
 
 import org.apache.commons.io.{FileUtils, IOUtils}
 import org.mockito.Mockito.when
-import org.scalatest.{BeforeAndAfter, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfter, Matchers}
 import org.scalatest.mock.MockitoSugar
 
-import org.apache.spark.{JsonTestUtils, SecurityManager, SparkConf}
+import org.apache.spark.{JsonTestUtils, SecurityManager, SparkConf, SparkFunSuite}
 import org.apache.spark.ui.SparkUI
 
 /**
@@ -39,7 +39,7 @@ import org.apache.spark.ui.SparkUI
  * expectations.  However, in general this should be done with extreme caution, as the metrics
  * are considered part of Spark's public api.
  */
-class HistoryServerSuite extends FunSuite with BeforeAndAfter with Matchers with MockitoSugar
+class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers with MockitoSugar
   with JsonTestUtils {
 
   private val logDir = new File("src/test/resources/spark-events")
diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
index f97e5ff6db31d..014e87bb40254 100644
--- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
@@ -27,14 +27,14 @@ import scala.language.postfixOps
 import akka.actor.Address
 import org.json4s._
 import org.json4s.jackson.JsonMethods._
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 import org.scalatest.concurrent.Eventually
 import other.supplier.{CustomPersistenceEngine, CustomRecoveryModeFactory}
 
-import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
 import org.apache.spark.deploy._
 
-class MasterSuite extends FunSuite with Matchers with Eventually {
+class MasterSuite extends SparkFunSuite with Matchers with Eventually {
 
   test("toAkkaUrl") {
     val conf = new SparkConf(loadDefaults = false)
diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala
index f4d548d9e7720..197f68e7ec5ed 100644
--- a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala
@@ -25,7 +25,7 @@ import scala.collection.mutable
 
 import akka.actor.{Actor, ActorRef, ActorSystem, Props}
 import com.google.common.base.Charsets
-import org.scalatest.{BeforeAndAfterEach, FunSuite}
+import org.scalatest.BeforeAndAfterEach
 import org.json4s.JsonAST._
 import org.json4s.jackson.JsonMethods._
 
@@ -38,7 +38,7 @@ import org.apache.spark.deploy.master.DriverState._
 /**
  * Tests for the REST application submission protocol used in standalone cluster mode.
  */
-class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterEach {
+class StandaloneRestSubmitSuite extends SparkFunSuite with BeforeAndAfterEach {
   private var actorSystem: Option[ActorSystem] = None
   private var server: Option[RestSubmissionServer] = None
 
diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala
index 61071ee17256c..115ac0534a1b4 100644
--- a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala
@@ -21,14 +21,13 @@ import java.lang.Boolean
 import java.lang.Integer
 
 import org.json4s.jackson.JsonMethods._
-import org.scalatest.FunSuite
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 
 /**
  * Tests for the REST application submission protocol.
  */
-class SubmitRestProtocolSuite extends FunSuite {
+class SubmitRestProtocolSuite extends SparkFunSuite {
 
   test("validate") {
     val request = new DummyRequest
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala
index 1c27d83cf876c..5b3930c0b0132 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala
@@ -17,11 +17,12 @@
 
 package org.apache.spark.deploy.worker
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.deploy.Command
 import org.apache.spark.util.Utils
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 
-class CommandUtilsSuite extends FunSuite with Matchers {
+class CommandUtilsSuite extends SparkFunSuite with Matchers {
 
   test("set libraryPath correctly") {
     val appId = "12345-worker321-9876"
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala
index 2159fd8c16c6f..6258c18d177fd 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala
@@ -23,13 +23,12 @@ import org.mockito.Mockito._
 import org.mockito.Matchers._
 import org.mockito.invocation.InvocationOnMock
 import org.mockito.stubbing.Answer
-import org.scalatest.FunSuite
 
-import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
 import org.apache.spark.deploy.{Command, DriverDescription}
 import org.apache.spark.util.Clock
 
-class DriverRunnerTest extends FunSuite {
+class DriverRunnerTest extends SparkFunSuite {
   private def createDriverRunner() = {
     val command = new Command("mainClass", Seq(), Map(), Seq(), Seq(), Seq())
     val driverDescription = new DriverDescription("jarUrl", 512, 1, true, command)
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
index a8b9df227c996..3da992788962b 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
@@ -21,12 +21,10 @@ import java.io.File
 
 import scala.collection.JavaConversions._
 
-import org.scalatest.FunSuite
-
 import org.apache.spark.deploy.{ApplicationDescription, Command, ExecutorState}
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 
-class ExecutorRunnerTest extends FunSuite {
+class ExecutorRunnerTest extends SparkFunSuite {
   test("command includes appId") {
     val appId = "12345-worker321-9876"
     val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerArgumentsTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerArgumentsTest.scala
index e432b8e94654a..15f7ca4a6dacc 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerArgumentsTest.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerArgumentsTest.scala
@@ -18,11 +18,10 @@
 
 package org.apache.spark.deploy.worker
 
-import org.apache.spark.SparkConf
-import org.scalatest.FunSuite
+import org.apache.spark.{SparkConf, SparkFunSuite}
 
 
-class WorkerArgumentsTest extends FunSuite {
+class WorkerArgumentsTest extends SparkFunSuite {
 
   test("Memory can't be set to 0 when cmd line args leave off M or G") {
     val conf = new SparkConf
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
index 93a779d5ce6f2..0f4d3b28d09df 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
@@ -17,12 +17,12 @@
 
 package org.apache.spark.deploy.worker
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.deploy.Command
 
-import org.scalatest.{Matchers, FunSuite}
+import org.scalatest.Matchers
 
-class WorkerSuite extends FunSuite with Matchers {
+class WorkerSuite extends SparkFunSuite with Matchers {
 
   def cmd(javaOpts: String*): Command = {
     Command("", Seq.empty, Map.empty, Seq.empty, Seq.empty, Seq(javaOpts : _*))
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala
index 6a6f29dd613cd..ac18f04a11475 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala
@@ -18,12 +18,11 @@
 package org.apache.spark.deploy.worker
 
 import akka.actor.AddressFromURIString
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.SecurityManager
 import org.apache.spark.rpc.{RpcAddress, RpcEnv}
-import org.scalatest.FunSuite
 
-class WorkerWatcherSuite extends FunSuite {
+class WorkerWatcherSuite extends SparkFunSuite {
   test("WorkerWatcher shuts down on valid disassociation") {
     val conf = new SparkConf()
     val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf))
diff --git a/core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala b/core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala
index 326e203afe136..8275fd87764cd 100644
--- a/core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.executor
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class TaskMetricsSuite extends FunSuite {
+class TaskMetricsSuite extends SparkFunSuite {
   test("[SPARK-5701] updateShuffleReadMetrics: ShuffleReadMetrics not added when no shuffle deps") {
     val taskMetrics = new TaskMetrics()
     taskMetrics.updateShuffleReadMetrics()
diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala
index 2e58c159a2ed8..63947df3d43a2 100644
--- a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala
@@ -24,11 +24,10 @@ import java.io.FileOutputStream
 import scala.collection.immutable.IndexedSeq
 
 import org.scalatest.BeforeAndAfterAll
-import org.scalatest.FunSuite
 
 import org.apache.hadoop.io.Text
 
-import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 import org.apache.spark.util.Utils
 import org.apache.hadoop.io.compress.{DefaultCodec, CompressionCodecFactory, GzipCodec}
 
@@ -37,7 +36,7 @@ import org.apache.hadoop.io.compress.{DefaultCodec, CompressionCodecFactory, Gzi
  * [[org.apache.spark.input.WholeTextFileRecordReader WholeTextFileRecordReader]]. A temporary
  * directory is created as fake input. Temporal storage would be deleted in the end.
  */
-class WholeTextFileRecordReaderSuite extends FunSuite with BeforeAndAfterAll {
+class WholeTextFileRecordReaderSuite extends SparkFunSuite with BeforeAndAfterAll {
   private var sc: SparkContext = _
   private var factory: CompressionCodecFactory = _
 
diff --git a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
index cf6a143537889..cbdb33c89d0fb 100644
--- a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
+++ b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
@@ -20,11 +20,10 @@ package org.apache.spark.io
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
 
 import com.google.common.io.ByteStreams
-import org.scalatest.FunSuite
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 
-class CompressionCodecSuite extends FunSuite {
+class CompressionCodecSuite extends SparkFunSuite {
   val conf = new SparkConf(false)
 
   def testCodec(codec: CompressionCodec) {
diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
index 60dba3b2d6719..19f1af0dcd461 100644
--- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
@@ -36,14 +36,14 @@ import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat => NewCombi
 import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
 import org.apache.hadoop.mapreduce.{TaskAttemptContext, InputSplit => NewInputSplit,
   RecordReader => NewRecordReader}
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.SharedSparkContext
+import org.apache.spark.{SharedSparkContext, SparkFunSuite}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
 import org.apache.spark.util.Utils
 
-class InputOutputMetricsSuite extends FunSuite with SharedSparkContext
+class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext
   with BeforeAndAfter {
 
   @transient var tmpDir: File = _
diff --git a/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala b/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala
index 100ac77dec1f7..a901a069d9bfe 100644
--- a/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala
+++ b/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala
@@ -17,9 +17,11 @@
 
 package org.apache.spark.metrics
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
-class MetricsConfigSuite extends FunSuite with BeforeAndAfter {
+import org.apache.spark.SparkFunSuite
+
+class MetricsConfigSuite extends SparkFunSuite with BeforeAndAfter {
   var filePath: String = _
 
   before {
diff --git a/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala b/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala
index bbdc9568a6ddb..9c389c76bf3bd 100644
--- a/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala
+++ b/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.metrics
 
-import org.scalatest.{BeforeAndAfter, FunSuite, PrivateMethodTester}
+import org.scalatest.{BeforeAndAfter, PrivateMethodTester}
 
-import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
 import org.apache.spark.deploy.master.MasterSource
 import org.apache.spark.metrics.source.Source
 
@@ -27,7 +27,7 @@ import com.codahale.metrics.MetricRegistry
 
 import scala.collection.mutable.ArrayBuffer
 
-class MetricsSystemSuite extends FunSuite with BeforeAndAfter with PrivateMethodTester{
+class MetricsSystemSuite extends SparkFunSuite with BeforeAndAfter with PrivateMethodTester{
   var filePath: String = _
   var conf: SparkConf = null
   var securityMgr: SecurityManager = null
diff --git a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferSecuritySuite.scala b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferSecuritySuite.scala
index 46d2e5173acae..3940527fb874e 100644
--- a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferSecuritySuite.scala
+++ b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferSecuritySuite.scala
@@ -31,12 +31,12 @@ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
 import org.apache.spark.network.shuffle.BlockFetchingListener
 import org.apache.spark.network.{BlockDataManager, BlockTransferService}
 import org.apache.spark.storage.{BlockId, ShuffleBlockId}
-import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
 import org.mockito.Mockito._
 import org.scalatest.mock.MockitoSugar
-import org.scalatest.{FunSuite, ShouldMatchers}
+import org.scalatest.ShouldMatchers
 
-class NettyBlockTransferSecuritySuite extends FunSuite with MockitoSugar with ShouldMatchers {
+class NettyBlockTransferSecuritySuite extends SparkFunSuite with MockitoSugar with ShouldMatchers {
   test("security default off") {
     val conf = new SparkConf()
       .set("spark.app.id", "app-id")
diff --git a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala
index a41f8b7ce5ce0..6f8e8a7ac6033 100644
--- a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala
+++ b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala
@@ -18,11 +18,15 @@
 package org.apache.spark.network.netty
 
 import org.apache.spark.network.BlockDataManager
-import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
 import org.mockito.Mockito.mock
 import org.scalatest._
 
-class NettyBlockTransferServiceSuite extends FunSuite with BeforeAndAfterEach with ShouldMatchers {
+class NettyBlockTransferServiceSuite
+  extends SparkFunSuite
+  with BeforeAndAfterEach
+  with ShouldMatchers {
+
   private var service0: NettyBlockTransferService = _
   private var service1: NettyBlockTransferService = _
 
diff --git a/core/src/test/scala/org/apache/spark/network/nio/ConnectionManagerSuite.scala b/core/src/test/scala/org/apache/spark/network/nio/ConnectionManagerSuite.scala
index 02424c59d6831..5e364cc0edeb2 100644
--- a/core/src/test/scala/org/apache/spark/network/nio/ConnectionManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/network/nio/ConnectionManagerSuite.scala
@@ -24,15 +24,13 @@ import scala.concurrent.duration._
 import scala.concurrent.{Await, TimeoutException}
 import scala.language.postfixOps
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
 import org.apache.spark.util.Utils
 
 /**
   * Test the ConnectionManager with various security settings.
   */
-class ConnectionManagerSuite extends FunSuite {
+class ConnectionManagerSuite extends SparkFunSuite {
 
   test("security default off") {
     val conf = new SparkConf
diff --git a/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala
index f2b0ea1063a72..ec99f2a1bad66 100644
--- a/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala
@@ -23,13 +23,13 @@ import scala.concurrent.{Await, TimeoutException}
 import scala.concurrent.duration.Duration
 import scala.concurrent.ExecutionContext.Implicits.global
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.time.SpanSugar._
 
-import org.apache.spark.{SparkContext, SparkException, LocalSparkContext}
+import org.apache.spark.{LocalSparkContext, SparkContext, SparkException, SparkFunSuite}
 
-class AsyncRDDActionsSuite extends FunSuite with BeforeAndAfterAll with Timeouts {
+class AsyncRDDActionsSuite extends SparkFunSuite with BeforeAndAfterAll with Timeouts {
 
   @transient private var sc: SparkContext = _
 
diff --git a/core/src/test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala
index 01039b9449daf..4e72b89bfcc40 100644
--- a/core/src/test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark.rdd
 
-import org.scalatest.FunSuite
-
 import org.apache.spark._
 
-class DoubleRDDSuite extends FunSuite with SharedSparkContext {
+class DoubleRDDSuite extends SparkFunSuite with SharedSparkContext {
   test("sum") {
     assert(sc.parallelize(Seq.empty[Double]).sum() === 0.0)
     assert(sc.parallelize(Seq(1.0)).sum() === 1.0)
diff --git a/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
index be8467354b222..a8466ed8c1dc2 100644
--- a/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
@@ -19,11 +19,11 @@ package org.apache.spark.rdd
 
 import java.sql._
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.{LocalSparkContext, SparkContext}
+import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite}
 
-class JdbcRDDSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {
+class JdbcRDDSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext {
 
   before {
     Class.forName("org.apache.derby.jdbc.EmbeddedDriver")
diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
index 6564232986cfa..dfa102f432a02 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
@@ -28,12 +28,10 @@ import org.apache.hadoop.conf.{Configurable, Configuration}
 import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, OutputCommitter => NewOutputCommitter,
 OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter,
 TaskAttemptContext => NewTaskAttempContext}
-import org.apache.spark.{Partitioner, SharedSparkContext}
+import org.apache.spark.{Partitioner, SharedSparkContext, SparkFunSuite}
 import org.apache.spark.util.Utils
 
-import org.scalatest.FunSuite
-
-class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext {
+class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext {
   test("aggregateByKey") {
     val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2)
 
diff --git a/core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala b/core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala
index 1880364581c1a..e7cc1617cdf1c 100644
--- a/core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala
@@ -22,10 +22,11 @@ import scala.collection.immutable.NumericRange
 import org.scalacheck.Arbitrary._
 import org.scalacheck.Gen
 import org.scalacheck.Prop._
-import org.scalatest.FunSuite
 import org.scalatest.prop.Checkers
 
-class ParallelCollectionSplitSuite extends FunSuite with Checkers {
+import org.apache.spark.SparkFunSuite
+
+class ParallelCollectionSplitSuite extends SparkFunSuite with Checkers {
   test("one element per slice") {
     val data = Array(1, 2, 3)
     val slices = ParallelCollectionRDD.slice(data, 3)
diff --git a/core/src/test/scala/org/apache/spark/rdd/PartitionPruningRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PartitionPruningRDDSuite.scala
index 465068c6cbb16..b1544a6106110 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PartitionPruningRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PartitionPruningRDDSuite.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark.rdd
 
-import org.scalatest.FunSuite
+import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}
 
-import org.apache.spark.{Partition, SharedSparkContext, TaskContext}
-
-class PartitionPruningRDDSuite extends FunSuite with SharedSparkContext {
+class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {
 
   test("Pruned Partitions inherit locality prefs correctly") {
 
diff --git a/core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala
index 0d1369c19c69e..132a5fa9a80fb 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala
@@ -17,9 +17,7 @@
 
 package org.apache.spark.rdd
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SharedSparkContext
+import org.apache.spark.{SharedSparkContext, SparkFunSuite}
 import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}
 
 /** a sampler that outputs its seed */
@@ -38,7 +36,7 @@ class MockSampler extends RandomSampler[Long, Long] {
   override def clone: MockSampler = new MockSampler
 }
 
-class PartitionwiseSampledRDDSuite extends FunSuite with SharedSparkContext {
+class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {
 
   test("seed distribution") {
     val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
diff --git a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala
index 85eb2a1d07ba4..32f04d54eff94 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala
@@ -22,7 +22,6 @@ import java.io.File
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.io.{LongWritable, Text}
 import org.apache.hadoop.mapred.{FileSplit, JobConf, TextInputFormat}
-import org.scalatest.FunSuite
 
 import scala.collection.Map
 import scala.language.postfixOps
@@ -32,7 +31,7 @@ import scala.util.Try
 import org.apache.spark._
 import org.apache.spark.util.Utils
 
-class PipedRDDSuite extends FunSuite with SharedSparkContext {
+class PipedRDDSuite extends SparkFunSuite with SharedSparkContext {
 
   test("basic pipe") {
     if (testCommandAvailable("cat")) {
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDOperationScopeSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDOperationScopeSuite.scala
index 4434ed858c60c..f65349e3e3585 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDOperationScopeSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDOperationScopeSuite.scala
@@ -17,14 +17,14 @@
 
 package org.apache.spark.rdd
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.{TaskContext, Partition, SparkContext}
+import org.apache.spark.{Partition, SparkContext, SparkFunSuite, TaskContext}
 
 /**
  * Tests whether scopes are passed from the RDD operation to the RDDs correctly.
  */
-class RDDOperationScopeSuite extends FunSuite with BeforeAndAfter {
+class RDDOperationScopeSuite extends SparkFunSuite with BeforeAndAfter {
   private var sc: SparkContext = null
   private val scope1 = new RDDOperationScope("scope1")
   private val scope2 = new RDDOperationScope("scope2", Some(scope1))
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index 8079d5dcaea81..f6da9f98ad253 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -25,14 +25,12 @@ import scala.collection.mutable.{ArrayBuffer, HashMap}
 import scala.collection.JavaConverters._
 import scala.reflect.ClassTag
 
-import org.scalatest.FunSuite
-
 import org.apache.spark._
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.rdd.RDDSuiteUtils._
 import org.apache.spark.util.Utils
 
-class RDDSuite extends FunSuite with SharedSparkContext {
+class RDDSuite extends SparkFunSuite with SharedSparkContext {
 
   test("basic operations") {
     val nums = sc.makeRDD(Array(1, 2, 3, 4), 2)
diff --git a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala
index 54fc914722b46..a7de9cabe7cc9 100644
--- a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.rdd
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
-import org.apache.spark.{Logging, SharedSparkContext}
+import org.apache.spark.{Logging, SharedSparkContext, SparkFunSuite}
 
-class SortingSuite extends FunSuite with SharedSparkContext with Matchers with Logging {
+class SortingSuite extends SparkFunSuite with SharedSparkContext with Matchers with Logging {
 
   test("sortByKey") {
     val pairs = sc.parallelize(Array((1, 0), (2, 0), (0, 0), (3, 0)), 2)
diff --git a/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala
index 72596e86865b2..5d7b973fbd9ac 100644
--- a/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.rdd
 
-import org.apache.spark.SharedSparkContext
-import org.scalatest.FunSuite
+import org.apache.spark.{SharedSparkContext, SparkFunSuite}
 
 object ZippedPartitionsSuite {
   def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
@@ -26,7 +25,7 @@ object ZippedPartitionsSuite {
   }
 }
 
-class ZippedPartitionsSuite extends FunSuite with SharedSparkContext {
+class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
   test("print sizes") {
     val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
     val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
diff --git a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
index 21eb71d9acfbd..1f0aa759b08da 100644
--- a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
@@ -24,15 +24,15 @@ import scala.concurrent.Await
 import scala.concurrent.duration._
 import scala.language.postfixOps
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 import org.scalatest.concurrent.Eventually._
 
-import org.apache.spark.{SparkException, SparkConf}
+import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
 
 /**
  * Common tests for an RpcEnv implementation.
  */
-abstract class RpcEnvSuite extends FunSuite with BeforeAndAfterAll {
+abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll {
 
   var env: RpcEnv = _
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
index 3821166386fa6..34145691153ce 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
@@ -17,12 +17,10 @@
 
 package org.apache.spark.scheduler
 
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkException, SparkContext}
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite}
 import org.apache.spark.util.{SerializableBuffer, AkkaUtils}
 
-import org.scalatest.FunSuite
-
-class CoarseGrainedSchedulerBackendSuite extends FunSuite with LocalSparkContext {
+class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext {
 
   test("serialized task larger than akka frame size") {
     val conf = new SparkConf
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index eea7a600841cc..bfcf918e06162 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -21,7 +21,7 @@ import scala.collection.mutable.{ArrayBuffer, HashSet, HashMap, Map}
 import scala.language.reflectiveCalls
 import scala.util.control.NonFatal
 
-import org.scalatest.{BeforeAndAfter, FunSuiteLike}
+import org.scalatest.BeforeAndAfter
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.time.SpanSugar._
 
@@ -68,7 +68,7 @@ class MyRDD(
 class DAGSchedulerSuiteDummyException extends Exception
 
 class DAGSchedulerSuite
-  extends FunSuiteLike with BeforeAndAfter with LocalSparkContext with Timeouts {
+  extends SparkFunSuite with BeforeAndAfter with LocalSparkContext with Timeouts {
 
   val conf = new SparkConf
   /** Set of TaskSets the DAGScheduler has requested executed. */
diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
index b52a8d11d147d..f681f21b6205e 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
@@ -25,7 +25,7 @@ import scala.io.Source
 
 import org.apache.hadoop.fs.Path
 import org.json4s.jackson.JsonMethods._
-import org.scalatest.{FunSuiteLike, BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
 import org.apache.spark._
 import org.apache.spark.deploy.SparkHadoopUtil
@@ -39,7 +39,7 @@ import org.apache.spark.util.{JsonProtocol, Utils}
  * logging events, whether the parsing of the file names is correct, and whether the logged events
  * can be read and deserialized into actual SparkListenerEvents.
  */
-class EventLoggingListenerSuite extends FunSuite with LocalSparkContext with BeforeAndAfter
+class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext with BeforeAndAfter
   with Logging {
   import EventLoggingListenerSuite._
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala
index 950c6dc58e332..b8e466fab4506 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala
@@ -18,14 +18,13 @@
 package org.apache.spark.scheduler
 
 import org.apache.spark.storage.BlockManagerId
-import org.scalatest.FunSuite
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.JavaSerializer
 
 import scala.util.Random
 
-class MapStatusSuite extends FunSuite {
+class MapStatusSuite extends SparkFunSuite {
 
   test("compressSize") {
     assert(MapStatus.compressSize(0L) === 0)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
index 7078a7a12232a..a9036da9cc93d 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
@@ -24,7 +24,7 @@ import org.mockito.Matchers
 import org.mockito.Mockito._
 import org.mockito.invocation.InvocationOnMock
 import org.mockito.stubbing.Answer
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
 import org.apache.hadoop.mapred.{TaskAttemptID, JobConf, TaskAttemptContext, OutputCommitter}
 
@@ -64,7 +64,7 @@ import scala.language.postfixOps
  * increments would be captured even though the commit in both tasks was executed
  * erroneously.
  */
-class OutputCommitCoordinatorSuite extends FunSuite with BeforeAndAfter {
+class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter {
 
   var outputCommitCoordinator: OutputCommitCoordinator = null
   var tempDir: File = null
diff --git a/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
index 456451b676bed..467796d7c24b0 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
@@ -19,15 +19,13 @@ package org.apache.spark.scheduler
 
 import java.util.Properties
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext}
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
 
 /**
  * Tests that pools and the associated scheduling algorithms for FIFO and fair scheduling work
  * correctly.
  */
-class PoolSuite extends FunSuite with LocalSparkContext {
+class PoolSuite extends SparkFunSuite with LocalSparkContext {
 
   def createTaskSetManager(stageId: Int, numTasks: Int, taskScheduler: TaskSchedulerImpl)
     : TaskSetManager = {
diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
index dabe4574b6456..ff3fa95ec32ae 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
@@ -21,10 +21,10 @@ import java.io.{File, PrintWriter}
 import java.net.URI
 
 import org.json4s.jackson.JsonMethods._
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
 import org.apache.spark.{SparkConf, SparkContext, SPARK_VERSION}
-import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.util.{JsonProtocol, Utils}
@@ -32,7 +32,7 @@ import org.apache.spark.util.{JsonProtocol, Utils}
 /**
  * Test whether ReplayListenerBus replays events from logs correctly.
  */
-class ReplayListenerSuite extends FunSuite with BeforeAndAfter {
+class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter {
   private val fileSystem = Utils.getHadoopFileSystem("/",
     SparkHadoopUtil.get.newConfiguration(new SparkConf()))
   private var testDir: File = _
diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
index 825c616c0c3e0..06fb909bf5419 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
@@ -22,13 +22,13 @@ import java.util.concurrent.Semaphore
 import scala.collection.mutable
 import scala.collection.JavaConversions._
 
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.util.ResetSystemProperties
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext}
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
 
-class SparkListenerSuite extends FunSuite with LocalSparkContext with Matchers
+class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Matchers
   with ResetSystemProperties {
 
   /** Length of time to wait while draining listener events. */
diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
index 623a687c359a2..c7f179e1483a5 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
@@ -18,16 +18,16 @@
 package org.apache.spark.scheduler
 
 import org.apache.spark.scheduler.cluster.ExecutorInfo
-import org.apache.spark.{SparkContext, LocalSparkContext}
+import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite}
 
-import org.scalatest.{FunSuite, BeforeAndAfter, BeforeAndAfterAll}
+import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
 
 import scala.collection.mutable
 
 /**
  * Unit tests for SparkListener that require a local cluster.
  */
-class SparkListenerWithClusterSuite extends FunSuite with LocalSparkContext
+class SparkListenerWithClusterSuite extends SparkFunSuite with LocalSparkContext
   with BeforeAndAfter with BeforeAndAfterAll {
 
   /** Length of time to wait while draining listener events. */
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
index 83ae8701243e5..7c1adc1aef1b6 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
@@ -20,7 +20,6 @@ package org.apache.spark.scheduler
 import org.mockito.Mockito._
 import org.mockito.Matchers.any
 
-import org.scalatest.FunSuite
 import org.scalatest.BeforeAndAfter
 
 import org.apache.spark._
@@ -28,7 +27,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.util.{TaskCompletionListenerException, TaskCompletionListener}
 
 
-class TaskContextSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {
+class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext {
 
   test("calls TaskCompletionListener after failure") {
     TaskContextSuite.completed = false
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala
index e3a3803e6483a..815caa79ff529 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala
@@ -23,10 +23,10 @@ import scala.concurrent.duration._
 import scala.language.postfixOps
 import scala.util.control.NonFatal
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 import org.scalatest.concurrent.Eventually._
 
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv}
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, SparkFunSuite}
 import org.apache.spark.storage.TaskResultBlockId
 
 /**
@@ -71,7 +71,7 @@ class ResultDeletingTaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedule
 /**
  * Tests related to handling task results (both direct and indirect).
  */
-class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {
+class TaskResultGetterSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext {
 
   // Set the Akka frame size to be as small as possible (it must be an integer, so 1 is as small
   // as we can make it) so the tests don't take too long.
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
index ffa4381969b68..a6d5232feb8de 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.scheduler
 
-import org.scalatest.FunSuite
-
 import org.apache.spark._
 
 class FakeSchedulerBackend extends SchedulerBackend {
@@ -28,7 +26,7 @@ class FakeSchedulerBackend extends SchedulerBackend {
   def defaultParallelism(): Int = 1
 }
 
-class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Logging {
+class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with Logging {
 
   test("Scheduler does not always schedule tasks on the same workers") {
     sc = new SparkContext("local", "TaskSchedulerImplSuite")
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
index 6198cea46ddf8..0060f3396dcde 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
@@ -22,8 +22,6 @@ import java.util.Random
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable
 
-import org.scalatest.FunSuite
-
 import org.apache.spark._
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.util.{ManualClock, Utils}
@@ -146,7 +144,7 @@ class LargeTask(stageId: Int) extends Task[Array[Byte]](stageId, 0) {
   override def preferredLocations: Seq[TaskLocation] = Seq[TaskLocation]()
 }
 
-class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
+class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logging {
   import TaskLocality.{ANY, PROCESS_LOCAL, NO_PREF, NODE_LOCAL, RACK_LOCAL}
 
   private val conf = new SparkConf
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
index 3fa0115e68259..d565132a06789 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
@@ -18,12 +18,11 @@
 package org.apache.spark.scheduler.cluster.mesos
 
 import org.mockito.Mockito._
-import org.scalatest.FunSuite
 import org.scalatest.mock.MockitoSugar
 
-import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 
-class MemoryUtilsSuite extends FunSuite with MockitoSugar {
+class MemoryUtilsSuite extends SparkFunSuite with MockitoSugar {
   test("MesosMemoryUtils should always override memoryOverhead when it's set") {
     val sparkConf = new SparkConf
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
index ab863f3d8d672..6f4ff0814b8da 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
@@ -30,16 +30,15 @@ import org.apache.mesos.SchedulerDriver
 import org.mockito.Matchers._
 import org.mockito.Mockito._
 import org.mockito.{ArgumentCaptor, Matchers}
-import org.scalatest.FunSuite
 import org.scalatest.mock.MockitoSugar
 
 import org.apache.spark.executor.MesosExecutorBackend
 import org.apache.spark.scheduler.cluster.ExecutorInfo
 import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerExecutorAdded,
   TaskDescription, TaskSchedulerImpl, WorkerOffer}
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext}
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
 
-class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with MockitoSugar {
+class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar {
 
   test("check spark-class location correctly") {
     val conf = new SparkConf
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
index eebcba40f8a1c..5a81bb335fdb7 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
@@ -19,9 +19,9 @@ package org.apache.spark.scheduler.cluster.mesos
 
 import java.nio.ByteBuffer
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class MesosTaskLaunchDataSuite extends FunSuite {
+class MesosTaskLaunchDataSuite extends SparkFunSuite {
   test("serialize and deserialize data must be same") {
     val serializedTask = ByteBuffer.allocate(40)
     (Range(100, 110).map(serializedTask.putInt(_)))
diff --git a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosClusterSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosClusterSchedulerSuite.scala
index f28e29e9b8d8e..f5cef1caaf1ac 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosClusterSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosClusterSchedulerSuite.scala
@@ -19,16 +19,15 @@ package org.apache.spark.scheduler.mesos
 
 import java.util.Date
 
-import org.scalatest.FunSuite
 import org.scalatest.mock.MockitoSugar
 
 import org.apache.spark.deploy.Command
 import org.apache.spark.deploy.mesos.MesosDriverDescription
 import org.apache.spark.scheduler.cluster.mesos._
-import org.apache.spark.{LocalSparkContext, SparkConf}
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkFunSuite}
 
 
-class MesosClusterSchedulerSuite extends FunSuite with LocalSparkContext with MockitoSugar {
+class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar {
 
   private val command = new Command("mainClass", Seq("arg"), null, null, null, null)
 
diff --git a/core/src/test/scala/org/apache/spark/serializer/JavaSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/JavaSerializerSuite.scala
index ed4d8ce632e16..329a2b6dad831 100644
--- a/core/src/test/scala/org/apache/spark/serializer/JavaSerializerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/JavaSerializerSuite.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.serializer
 
-import org.apache.spark.SparkConf
-import org.scalatest.FunSuite
+import org.apache.spark.{SparkConf, SparkFunSuite}
 
-class JavaSerializerSuite extends FunSuite {
+class JavaSerializerSuite extends SparkFunSuite {
   test("JavaSerializer instances are serializable") {
     val serializer = new JavaSerializer(new SparkConf())
     val instance = serializer.newInstance()
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
index 054a4c64897a9..63a8480c9b57b 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
@@ -20,12 +20,11 @@ package org.apache.spark.serializer
 import org.apache.spark.util.Utils
 
 import com.esotericsoftware.kryo.Kryo
-import org.scalatest.FunSuite
 
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, TestUtils}
+import org.apache.spark._
 import org.apache.spark.serializer.KryoDistributedTest._
 
-class KryoSerializerDistributedSuite extends FunSuite {
+class KryoSerializerDistributedSuite extends SparkFunSuite {
 
   test("kryo objects are serialised consistently in different processes") {
     val conf = new SparkConf(false)
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerResizableOutputSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerResizableOutputSuite.scala
index da98d09184735..a9b209ccfc76e 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerResizableOutputSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerResizableOutputSuite.scala
@@ -17,15 +17,13 @@
 
 package org.apache.spark.serializer
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.SparkContext
 import org.apache.spark.LocalSparkContext
 import org.apache.spark.SparkException
 
 
-class KryoSerializerResizableOutputSuite extends FunSuite {
+class KryoSerializerResizableOutputSuite extends SparkFunSuite {
 
   // trial and error showed this will not serialize with 1mb buffer
   val x = (1 to 400000).toArray
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
index 14c0172fa96ab..c32fe232cc27c 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
@@ -23,14 +23,13 @@ import scala.collection.mutable
 import scala.reflect.ClassTag
 
 import com.esotericsoftware.kryo.Kryo
-import org.scalatest.FunSuite
 
-import org.apache.spark.{SharedSparkContext, SparkConf}
+import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
 import org.apache.spark.scheduler.HighlyCompressedMapStatus
 import org.apache.spark.serializer.KryoTest._
 import org.apache.spark.storage.BlockManagerId
 
-class KryoSerializerSuite extends FunSuite with SharedSparkContext {
+class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext {
   conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
   conf.set("spark.kryo.registrator", classOf[MyRegistrator].getName)
 
@@ -361,7 +360,7 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext {
   }
 }
 
-class KryoSerializerAutoResetDisabledSuite extends FunSuite with SharedSparkContext {
+class KryoSerializerAutoResetDisabledSuite extends SparkFunSuite with SharedSparkContext {
   conf.set("spark.serializer", classOf[KryoSerializer].getName)
   conf.set("spark.kryo.registrator", classOf[RegistratorWithoutAutoReset].getName)
   conf.set("spark.kryo.referenceTracking", "true")
diff --git a/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala b/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
index 673948d84d82b..77d66864f755e 100644
--- a/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
@@ -17,9 +17,7 @@
 
 package org.apache.spark.serializer
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.{SharedSparkContext, SparkException}
+import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
 import org.apache.spark.rdd.RDD
 
 /* A trivial (but unserializable) container for trivial functions */
@@ -29,7 +27,7 @@ class UnserializableClass {
   def pred[T](x: T): Boolean = x.toString.length % 2 == 0
 }
 
-class ProactiveClosureSerializationSuite extends FunSuite with SharedSparkContext {
+class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {
 
   def fixture: (RDD[String], UnserializableClass) = {
     (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
diff --git a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala
index e62828c4fbac6..2707bb53bc383 100644
--- a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala
@@ -19,10 +19,12 @@ package org.apache.spark.serializer
 
 import java.io.{ObjectOutput, ObjectInput}
 
-import org.scalatest.{BeforeAndAfterEach, FunSuite}
+import org.scalatest.BeforeAndAfterEach
 
+import org.apache.spark.SparkFunSuite
 
-class SerializationDebuggerSuite extends FunSuite with BeforeAndAfterEach {
+
+class SerializationDebuggerSuite extends SparkFunSuite with BeforeAndAfterEach {
 
   import SerializationDebugger.find
 
diff --git a/core/src/test/scala/org/apache/spark/serializer/SerializerPropertiesSuite.scala b/core/src/test/scala/org/apache/spark/serializer/SerializerPropertiesSuite.scala
index bb34033fe9e7e..4ce3b941bea55 100644
--- a/core/src/test/scala/org/apache/spark/serializer/SerializerPropertiesSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/SerializerPropertiesSuite.scala
@@ -21,9 +21,9 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
 
 import scala.util.Random
 
-import org.scalatest.{Assertions, FunSuite}
+import org.scalatest.Assertions
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.KryoTest.RegistratorWithoutAutoReset
 
 /**
@@ -31,7 +31,7 @@ import org.apache.spark.serializer.KryoTest.RegistratorWithoutAutoReset
  * describe properties of the serialized stream, such as
  * [[Serializer.supportsRelocationOfSerializedObjects]].
  */
-class SerializerPropertiesSuite extends FunSuite {
+class SerializerPropertiesSuite extends SparkFunSuite {
 
   import SerializerPropertiesSuite._
 
diff --git a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala
index e0e646f0a3652..96778c9ebafb1 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.shuffle
 
-import org.scalatest.FunSuite
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.time.SpanSugar._
 import java.util.concurrent.atomic.AtomicBoolean
 import java.util.concurrent.CountDownLatch
 
-class ShuffleMemoryManagerSuite extends FunSuite with Timeouts {
+import org.apache.spark.SparkFunSuite
+
+class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
   /** Launch a thread with the given body block and return it. */
   private def startThread(name: String)(body: => Unit): Thread = {
     val thread = new Thread("ShuffleMemorySuite " + name) {
diff --git a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleManagerSuite.scala
index 0537bf66ad020..491dc3659e184 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleManagerSuite.scala
@@ -21,16 +21,14 @@ import java.io.{File, FileWriter}
 
 import scala.language.reflectiveCalls
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.{SparkEnv, SparkContext, LocalSparkContext, SparkConf}
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, SparkFunSuite}
 import org.apache.spark.executor.ShuffleWriteMetrics
 import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
 import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.shuffle.FileShuffleBlockResolver
 import org.apache.spark.storage.{ShuffleBlockId, FileSegment}
 
-class HashShuffleManagerSuite extends FunSuite with LocalSparkContext {
+class HashShuffleManagerSuite extends SparkFunSuite with LocalSparkContext {
   private val testConf = new SparkConf(false)
 
   private def checkSegments(expected: FileSegment, buffer: ManagedBuffer) {
diff --git a/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManagerSuite.scala
index 49a04a2a45280..a73e94e05575e 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManagerSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.shuffle.unsafe
 import org.mockito.Mockito._
 import org.mockito.invocation.InvocationOnMock
 import org.mockito.stubbing.Answer
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 
 import org.apache.spark._
 import org.apache.spark.serializer.{JavaSerializer, KryoSerializer, Serializer}
@@ -29,7 +29,7 @@ import org.apache.spark.serializer.{JavaSerializer, KryoSerializer, Serializer}
  * Tests for the fallback logic in UnsafeShuffleManager. Actual tests of shuffling data are
  * performed in other suites.
  */
-class UnsafeShuffleManagerSuite extends FunSuite with Matchers {
+class UnsafeShuffleManagerSuite extends SparkFunSuite with Matchers {
 
   import UnsafeShuffleManager.canUseUnsafeShuffle
 
diff --git a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
index 183043bc05233..63b0e77629dde 100644
--- a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
+++ b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala
@@ -18,9 +18,11 @@ package org.apache.spark.status.api.v1
 
 import javax.ws.rs.WebApplicationException
 
-import org.scalatest.{Matchers, FunSuite}
+import org.scalatest.Matchers
 
-class SimpleDateParamSuite extends FunSuite with Matchers {
+import org.apache.spark.SparkFunSuite
+
+class SimpleDateParamSuite extends SparkFunSuite with Matchers {
 
   test("date parsing") {
     new SimpleDateParam("2015-02-20T23:21:17.190GMT").timestamp should be (1424474477190L)
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala
index b647e8a6728ec..89ed031b6fcd1 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.storage
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class BlockIdSuite extends FunSuite {
+class BlockIdSuite extends SparkFunSuite {
   def assertSame(id1: BlockId, id2: BlockId) {
     assert(id1.name === id2.name)
     assert(id1.hashCode === id2.hashCode)
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala
index f647200402ecb..0f5ba46f69c2f 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala
@@ -23,11 +23,11 @@ import scala.language.implicitConversions
 import scala.language.postfixOps
 
 import org.mockito.Mockito.{mock, when}
-import org.scalatest.{BeforeAndAfter, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfter, Matchers}
 import org.scalatest.concurrent.Eventually._
 
 import org.apache.spark.rpc.RpcEnv
-import org.apache.spark.{MapOutputTrackerMaster, SparkConf, SparkContext, SecurityManager}
+import org.apache.spark._
 import org.apache.spark.network.BlockTransferService
 import org.apache.spark.network.nio.NioBlockTransferService
 import org.apache.spark.scheduler.LiveListenerBus
@@ -36,7 +36,7 @@ import org.apache.spark.shuffle.hash.HashShuffleManager
 import org.apache.spark.storage.StorageLevel._
 
 /** Testsuite that tests block replication in BlockManager */
-class BlockManagerReplicationSuite extends FunSuite with Matchers with BeforeAndAfter {
+class BlockManagerReplicationSuite extends SparkFunSuite with Matchers with BeforeAndAfter {
 
   private val conf = new SparkConf(false)
   var rpcEnv: RpcEnv = null
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
index 151955ef7f435..bcee901f5dd5f 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
@@ -31,7 +31,7 @@ import org.scalatest.concurrent.Eventually._
 import org.scalatest.concurrent.Timeouts._
 
 import org.apache.spark.rpc.RpcEnv
-import org.apache.spark.{MapOutputTrackerMaster, SparkConf, SparkContext, SecurityManager}
+import org.apache.spark._
 import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.network.nio.NioBlockTransferService
 import org.apache.spark.scheduler.LiveListenerBus
@@ -41,7 +41,7 @@ import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat
 import org.apache.spark.util._
 
 
-class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfterEach
+class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach
   with PrivateMethodTester with ResetSystemProperties {
 
   private val conf = new SparkConf(false)
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala
index 43ef469c1fd48..ad43a3e5fdc88 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala
@@ -18,14 +18,12 @@ package org.apache.spark.storage
 
 import java.io.File
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.executor.ShuffleWriteMetrics
 import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.util.Utils
 
-class BlockObjectWriterSuite extends FunSuite {
+class BlockObjectWriterSuite extends SparkFunSuite {
   test("verify write metrics") {
     val file = new File(Utils.createTempDir(), "somefile")
     val writeMetrics = new ShuffleWriteMetrics()
diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala
index bc5c74c126b74..688f56f4665f3 100644
--- a/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala
@@ -22,12 +22,12 @@ import java.io.{File, FileWriter}
 import scala.language.reflectiveCalls
 
 import org.mockito.Mockito.{mock, when}
-import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
+import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.util.Utils
 
-class DiskBlockManagerSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
+class DiskBlockManagerSuite extends SparkFunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
   private val testConf = new SparkConf(false)
   private var rootDir0: File = _
   private var rootDir1: File = _
diff --git a/core/src/test/scala/org/apache/spark/storage/FlatmapIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/FlatmapIteratorSuite.scala
index 47341b74e9c0f..b21c91f75d5c7 100644
--- a/core/src/test/scala/org/apache/spark/storage/FlatmapIteratorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/FlatmapIteratorSuite.scala
@@ -16,11 +16,10 @@
  */
 package org.apache.spark.storage
 
-import org.scalatest.FunSuite
-import org.apache.spark.{SharedSparkContext, SparkConf, LocalSparkContext, SparkContext}
+import org.apache.spark._
 
 
-class FlatmapIteratorSuite extends FunSuite with LocalSparkContext {
+class FlatmapIteratorSuite extends SparkFunSuite with LocalSparkContext {
   /* Tests the ability of Spark to deal with user provided iterators from flatMap
    * calls, that may generate more data then available memory. In any
    * memory based persistance Spark will unroll the iterator into an ArrayBuffer
diff --git a/core/src/test/scala/org/apache/spark/storage/LocalDirsSuite.scala b/core/src/test/scala/org/apache/spark/storage/LocalDirsSuite.scala
index b47157f8331cc..ac6fec56bbf4f 100644
--- a/core/src/test/scala/org/apache/spark/storage/LocalDirsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/LocalDirsSuite.scala
@@ -20,15 +20,15 @@ package org.apache.spark.storage
 import java.io.File
 
 import org.apache.spark.util.Utils
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 
 
 /**
  * Tests for the spark.local.dir and SPARK_LOCAL_DIRS configuration options.
  */
-class LocalDirsSuite extends FunSuite with BeforeAndAfter {
+class LocalDirsSuite extends SparkFunSuite with BeforeAndAfter {
 
   before {
     Utils.clearLocalRootDirs()
diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
index 2080c432d77db..2a7fe67ad8585 100644
--- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
@@ -26,15 +26,14 @@ import org.mockito.Matchers.{any, eq => meq}
 import org.mockito.Mockito._
 import org.mockito.invocation.InvocationOnMock
 import org.mockito.stubbing.Answer
-import org.scalatest.FunSuite
 
-import org.apache.spark.{SparkConf, TaskContextImpl}
+import org.apache.spark.{SparkConf, SparkFunSuite, TaskContextImpl}
 import org.apache.spark.network._
 import org.apache.spark.network.buffer.ManagedBuffer
 import org.apache.spark.network.shuffle.BlockFetchingListener
 import org.apache.spark.serializer.TestSerializer
 
-class ShuffleBlockFetcherIteratorSuite extends FunSuite {
+class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite {
   // Some of the tests are quite tricky because we are testing the cleanup behavior
   // in the presence of faults.
 
diff --git a/core/src/test/scala/org/apache/spark/storage/StorageStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/storage/StorageStatusListenerSuite.scala
index 3a45875391e29..1a199beb3558f 100644
--- a/core/src/test/scala/org/apache/spark/storage/StorageStatusListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/StorageStatusListenerSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.storage
 
-import org.scalatest.FunSuite
-import org.apache.spark.Success
+import org.apache.spark.{SparkFunSuite, Success}
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.scheduler._
 
 /**
  * Test the behavior of StorageStatusListener in response to all relevant events.
  */
-class StorageStatusListenerSuite extends FunSuite {
+class StorageStatusListenerSuite extends SparkFunSuite {
   private val bm1 = BlockManagerId("big", "dog", 1)
   private val bm2 = BlockManagerId("fat", "duck", 2)
   private val taskInfo1 = new TaskInfo(0, 0, 0, 0, "big", "dog", TaskLocality.ANY, false)
diff --git a/core/src/test/scala/org/apache/spark/storage/StorageSuite.scala b/core/src/test/scala/org/apache/spark/storage/StorageSuite.scala
index 17193ddbfd894..1d5a813a4d336 100644
--- a/core/src/test/scala/org/apache/spark/storage/StorageSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/StorageSuite.scala
@@ -17,12 +17,12 @@
 
 package org.apache.spark.storage
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
 /**
  * Test various functionalities in StorageUtils and StorageStatus.
  */
-class StorageSuite extends FunSuite {
+class StorageSuite extends SparkFunSuite {
   private val memAndDisk = StorageLevel.MEMORY_AND_DISK
 
   // For testing add, update, and remove (for non-RDD blocks)
diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
index a727a43f44dfc..33712f1bfa782 100644
--- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
@@ -42,7 +42,7 @@ import org.apache.spark.status.api.v1.{JacksonMessageWriter, StageStatus}
 /**
  * Selenium tests for the Spark Web UI.
  */
-class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with BeforeAndAfterAll {
+class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with BeforeAndAfterAll {
 
   implicit var webDriver: WebDriver = _
   implicit val formats = DefaultFormats
diff --git a/core/src/test/scala/org/apache/spark/ui/UISuite.scala b/core/src/test/scala/org/apache/spark/ui/UISuite.scala
index 77a038dc1720d..8f9502b5673d1 100644
--- a/core/src/test/scala/org/apache/spark/ui/UISuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/UISuite.scala
@@ -23,14 +23,13 @@ import scala.io.Source
 import scala.util.{Failure, Success, Try}
 
 import org.eclipse.jetty.servlet.ServletContextHandler
-import org.scalatest.FunSuite
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.time.SpanSugar._
 
 import org.apache.spark.LocalSparkContext._
-import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 
-class UISuite extends FunSuite {
+class UISuite extends SparkFunSuite {
 
   /**
    * Create a test SparkContext with the SparkUI enabled.
diff --git a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala
index 967dd0821ebd0..56f7b9cf1f358 100644
--- a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala
@@ -19,7 +19,6 @@ package org.apache.spark.ui.jobs
 
 import java.util.Properties
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
 import org.apache.spark._
@@ -28,7 +27,7 @@ import org.apache.spark.executor._
 import org.apache.spark.scheduler._
 import org.apache.spark.util.Utils
 
-class JobProgressListenerSuite extends FunSuite with LocalSparkContext with Matchers {
+class JobProgressListenerSuite extends SparkFunSuite with LocalSparkContext with Matchers {
 
   val jobSubmissionTime = 1421191042750L
   val jobCompletionTime = 1421191296660L
diff --git a/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphListenerSuite.scala
index c1126f3af52e6..86b078851851f 100644
--- a/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphListenerSuite.scala
@@ -17,9 +17,7 @@
 
 package org.apache.spark.ui.scope
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.scheduler._
 import org.apache.spark.scheduler.SparkListenerStageSubmitted
 import org.apache.spark.scheduler.SparkListenerStageCompleted
@@ -28,7 +26,7 @@ import org.apache.spark.scheduler.SparkListenerJobStart
 /**
  * Tests that this listener populates and cleans up its data structures properly.
  */
-class RDDOperationGraphListenerSuite extends FunSuite {
+class RDDOperationGraphListenerSuite extends SparkFunSuite {
   private var jobIdCounter = 0
   private var stageIdCounter = 0
   private val maxRetainedJobs = 10
diff --git a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
index 8778042e34657..37e2670de9685 100644
--- a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
@@ -17,8 +17,8 @@
 
 package org.apache.spark.ui.storage
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
-import org.apache.spark.Success
+import org.scalatest.BeforeAndAfter
+import org.apache.spark.{SparkFunSuite, Success}
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.scheduler._
 import org.apache.spark.storage._
@@ -26,7 +26,7 @@ import org.apache.spark.storage._
 /**
  * Test various functionality in the StorageListener that supports the StorageTab.
  */
-class StorageTabSuite extends FunSuite with BeforeAndAfter {
+class StorageTabSuite extends SparkFunSuite with BeforeAndAfter {
   private var bus: LiveListenerBus = _
   private var storageStatusListener: StorageStatusListener = _
   private var storageListener: StorageListener = _
diff --git a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
index ccdb3f571429d..6c40685484ed4 100644
--- a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
@@ -20,7 +20,6 @@ package org.apache.spark.util
 import java.util.concurrent.TimeoutException
 
 import akka.actor.ActorNotFound
-import org.scalatest.FunSuite
 
 import org.apache.spark._
 import org.apache.spark.rpc.RpcEnv
@@ -32,7 +31,7 @@ import org.apache.spark.SSLSampleConfigs._
 /**
   * Test the AkkaUtils with various security settings.
   */
-class AkkaUtilsSuite extends FunSuite with LocalSparkContext with ResetSystemProperties {
+class AkkaUtilsSuite extends SparkFunSuite with LocalSparkContext with ResetSystemProperties {
 
   test("remote fetch security bad password") {
     val conf = new SparkConf
diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
index 7b165fe28bdd3..a97a842f434fb 100644
--- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
@@ -20,14 +20,12 @@ package org.apache.spark.util
 import java.io.NotSerializableException
 import java.util.Random
 
-import org.scalatest.FunSuite
-
 import org.apache.spark.LocalSparkContext._
-import org.apache.spark.{TaskContext, SparkContext, SparkException}
+import org.apache.spark.{SparkContext, SparkException, SparkFunSuite, TaskContext}
 import org.apache.spark.partial.CountEvaluator
 import org.apache.spark.rdd.RDD
 
-class ClosureCleanerSuite extends FunSuite {
+class ClosureCleanerSuite extends SparkFunSuite {
   test("closures inside an object") {
     assert(TestObject.run() === 30) // 6 + 7 + 8 + 9
   }
diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite2.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite2.scala
index 59456790e89f0..3147c937769d2 100644
--- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite2.scala
+++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite2.scala
@@ -21,16 +21,16 @@ import java.io.NotSerializableException
 
 import scala.collection.mutable
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite, PrivateMethodTester}
+import org.scalatest.{BeforeAndAfterAll, PrivateMethodTester}
 
-import org.apache.spark.{SparkContext, SparkException}
+import org.apache.spark.{SparkContext, SparkException, SparkFunSuite}
 import org.apache.spark.serializer.SerializerInstance
 
 /**
  * Another test suite for the closure cleaner that is finer-grained.
  * For tests involving end-to-end Spark jobs, see {{ClosureCleanerSuite}}.
  */
-class ClosureCleanerSuite2 extends FunSuite with BeforeAndAfterAll with PrivateMethodTester {
+class ClosureCleanerSuite2 extends SparkFunSuite with BeforeAndAfterAll with PrivateMethodTester {
 
   // Start a SparkContext so that the closure serializer is accessible
   // We do not actually use this explicitly otherwise
diff --git a/core/src/test/scala/org/apache/spark/util/CompletionIteratorSuite.scala b/core/src/test/scala/org/apache/spark/util/CompletionIteratorSuite.scala
index 3755d43e25ea8..688fcd9f9aaba 100644
--- a/core/src/test/scala/org/apache/spark/util/CompletionIteratorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/CompletionIteratorSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.util
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class CompletionIteratorSuite extends FunSuite {
+class CompletionIteratorSuite extends SparkFunSuite {
   test("basic test") {
     var numTimesCompleted = 0
     val iter = List(1, 2, 3).iterator
diff --git a/core/src/test/scala/org/apache/spark/util/DistributionSuite.scala b/core/src/test/scala/org/apache/spark/util/DistributionSuite.scala
index 090d48ec921a1..cdd6555697c23 100644
--- a/core/src/test/scala/org/apache/spark/util/DistributionSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/DistributionSuite.scala
@@ -17,14 +17,15 @@
 
 package org.apache.spark.util
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
+
 /**
  *
  */
 
-class DistributionSuite extends FunSuite with Matchers {
+class DistributionSuite extends SparkFunSuite with Matchers {
   test("summary") {
     val d = new Distribution((1 to 100).toArray.map{_.toDouble})
     val stats = d.statCounter
diff --git a/core/src/test/scala/org/apache/spark/util/EventLoopSuite.scala b/core/src/test/scala/org/apache/spark/util/EventLoopSuite.scala
index 47b535206c949..b207d497f33c2 100644
--- a/core/src/test/scala/org/apache/spark/util/EventLoopSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/EventLoopSuite.scala
@@ -25,9 +25,10 @@ import scala.language.postfixOps
 
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.concurrent.Timeouts
-import org.scalatest.FunSuite
 
-class EventLoopSuite extends FunSuite with Timeouts {
+import org.apache.spark.SparkFunSuite
+
+class EventLoopSuite extends SparkFunSuite with Timeouts {
 
   test("EventLoop") {
     val buffer = new mutable.ArrayBuffer[Int] with mutable.SynchronizedBuffer[Int]
diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala
index c05317534cddf..2b76ae1f8a24b 100644
--- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala
@@ -22,15 +22,15 @@ import java.io._
 import scala.collection.mutable.HashSet
 import scala.reflect._
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
 import com.google.common.base.Charsets.UTF_8
 import com.google.common.io.Files
 
-import org.apache.spark.{Logging, SparkConf}
+import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
 import org.apache.spark.util.logging.{RollingFileAppender, SizeBasedRollingPolicy, TimeBasedRollingPolicy, FileAppender}
 
-class FileAppenderSuite extends FunSuite with BeforeAndAfter with Logging {
+class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging {
 
   val testFile = new File(Utils.createTempDir(), "FileAppenderSuite-test").getAbsoluteFile
 
diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
index 0d9126f23ccc5..e0ef9c70a5fc3 100644
--- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
@@ -25,7 +25,6 @@ import org.apache.spark.shuffle.MetadataFetchFailedException
 import scala.collection.Map
 
 import org.json4s.jackson.JsonMethods._
-import org.scalatest.FunSuite
 
 import org.apache.spark._
 import org.apache.spark.executor._
@@ -33,7 +32,7 @@ import org.apache.spark.rdd.RDDOperationScope
 import org.apache.spark.scheduler._
 import org.apache.spark.storage._
 
-class JsonProtocolSuite extends FunSuite {
+class JsonProtocolSuite extends SparkFunSuite {
 
   val jobSubmissionTime = 1421191042750L
   val jobCompletionTime = 1421191296660L
diff --git a/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala b/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala
index 87de90bb0dfb0..42125547436cb 100644
--- a/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala
@@ -19,11 +19,9 @@ package org.apache.spark.util
 
 import java.net.URLClassLoader
 
-import org.scalatest.FunSuite
+import org.apache.spark.{SparkContext, SparkException, SparkFunSuite, TestUtils}
 
-import org.apache.spark.{SparkContext, SparkException, TestUtils}
-
-class MutableURLClassLoaderSuite extends FunSuite {
+class MutableURLClassLoaderSuite extends SparkFunSuite {
 
   val urls2 = List(TestUtils.createJarWithClasses(
       classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"),
diff --git a/core/src/test/scala/org/apache/spark/util/NextIteratorSuite.scala b/core/src/test/scala/org/apache/spark/util/NextIteratorSuite.scala
index 403dcb03bd6e5..4b7164d8acbce 100644
--- a/core/src/test/scala/org/apache/spark/util/NextIteratorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/NextIteratorSuite.scala
@@ -21,10 +21,11 @@ import java.util.NoSuchElementException
 
 import scala.collection.mutable.Buffer
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
-class NextIteratorSuite extends FunSuite with Matchers {
+import org.apache.spark.SparkFunSuite
+
+class NextIteratorSuite extends SparkFunSuite with Matchers {
   test("one iteration") {
     val i = new StubIterator(Buffer(1))
     i.hasNext should be (true)
diff --git a/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala b/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala
index bad1aa99952cf..c58db5e606f7c 100644
--- a/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala
+++ b/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala
@@ -22,12 +22,14 @@ import java.util.Properties
 import org.apache.commons.lang3.SerializationUtils
 import org.scalatest.{BeforeAndAfterEach, Suite}
 
+import org.apache.spark.SparkFunSuite
+
 /**
  * Mixin for automatically resetting system properties that are modified in ScalaTest tests.
  * This resets the properties after each individual test.
  *
  * The order in which fixtures are mixed in affects the order in which they are invoked by tests.
- * If we have a suite `MySuite extends FunSuite with Foo with Bar`, then
+ * If we have a suite `MySuite extends SparkFunSuite with Foo with Bar`, then
  * Bar's `super` is Foo, so Bar's beforeEach() will and afterEach() methods will be invoked first
  * by the rest runner.
  *
diff --git a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala
index 04f0f3749d6b9..20550178fb1bd 100644
--- a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala
@@ -19,7 +19,9 @@ package org.apache.spark.util
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.{BeforeAndAfterEach, BeforeAndAfterAll, FunSuite, PrivateMethodTester}
+import org.scalatest.{BeforeAndAfterEach, BeforeAndAfterAll, PrivateMethodTester}
+
+import org.apache.spark.SparkFunSuite
 
 class DummyClass1 {}
 
@@ -59,7 +61,10 @@ class DummyString(val arr: Array[Char]) {
 }
 
 class SizeEstimatorSuite
-  extends FunSuite with BeforeAndAfterEach with PrivateMethodTester with ResetSystemProperties {
+  extends SparkFunSuite
+  with BeforeAndAfterEach
+  with PrivateMethodTester
+  with ResetSystemProperties {
 
   override def beforeEach() {
     // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case
diff --git a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala
index 751d3df9cc8f7..8c51e6b14b7fc 100644
--- a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala
@@ -23,9 +23,9 @@ import java.util.concurrent.{CountDownLatch, TimeUnit}
 import scala.concurrent.{Await, Future}
 import scala.concurrent.duration._
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class ThreadUtilsSuite extends FunSuite {
+class ThreadUtilsSuite extends SparkFunSuite {
 
   test("newDaemonSingleThreadExecutor") {
     val executor = ThreadUtils.newDaemonSingleThreadExecutor("this-is-a-thread-name")
diff --git a/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala
index 8b72fe665c214..9b3169026cda3 100644
--- a/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala
@@ -23,9 +23,9 @@ import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 import scala.util.Random
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class TimeStampedHashMapSuite extends FunSuite {
+class TimeStampedHashMapSuite extends SparkFunSuite {
 
   // Test the testMap function - a Scala HashMap should obviously pass
   testMap(new mutable.HashMap[String, String]())
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index afa5cdc819746..a867cf83dc3f1 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -29,16 +29,15 @@ import scala.util.Random
 
 import com.google.common.base.Charsets.UTF_8
 import com.google.common.io.Files
-import org.scalatest.FunSuite
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.network.util.ByteUnit
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.SparkConf
 
-class UtilsSuite extends FunSuite with ResetSystemProperties with Logging {
+class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
 
   test("timeConversion") {
     // Test -1
diff --git a/core/src/test/scala/org/apache/spark/util/VectorSuite.scala b/core/src/test/scala/org/apache/spark/util/VectorSuite.scala
index ce2968728a996..11194cd22a419 100644
--- a/core/src/test/scala/org/apache/spark/util/VectorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/VectorSuite.scala
@@ -19,13 +19,13 @@ package org.apache.spark.util
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
 /**
  * Tests org.apache.spark.util.Vector functionality
  */
 @deprecated("suppress compile time deprecation warning", "1.0.0")
-class VectorSuite extends FunSuite {
+class VectorSuite extends SparkFunSuite {
 
   def verifyVector(vector: Vector, expectedLength: Int): Unit = {
     assert(vector.length == expectedLength)
diff --git a/core/src/test/scala/org/apache/spark/util/collection/AppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/AppendOnlyMapSuite.scala
index cb99d14b27af4..a2a6d703860f2 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/AppendOnlyMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/AppendOnlyMapSuite.scala
@@ -21,9 +21,9 @@ import java.util.Comparator
 
 import scala.collection.mutable.HashSet
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class AppendOnlyMapSuite extends FunSuite {
+class AppendOnlyMapSuite extends SparkFunSuite {
   test("initialization") {
     val goodMap1 = new AppendOnlyMap[Int, Int](1)
     assert(goodMap1.size === 0)
diff --git a/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
index ffc206991906a..69dbfa9cd7141 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.util.collection
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class BitSetSuite extends FunSuite {
+class BitSetSuite extends SparkFunSuite {
 
   test("basic set and get") {
     val setBits = Seq(0, 9, 1, 10, 90, 96)
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ChainedBufferSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ChainedBufferSuite.scala
index c0c38cd4ac4ad..05306f408847d 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ChainedBufferSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ChainedBufferSuite.scala
@@ -19,10 +19,11 @@ package org.apache.spark.util.collection
 
 import java.nio.ByteBuffer
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers._
 
-class ChainedBufferSuite extends FunSuite {
+import org.apache.spark.SparkFunSuite
+
+class ChainedBufferSuite extends SparkFunSuite {
   test("write and read at start") {
     // write from start of source array
     val buffer = new ChainedBuffer(8)
diff --git a/core/src/test/scala/org/apache/spark/util/collection/CompactBufferSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/CompactBufferSuite.scala
index 6c956d93dc80d..bc5479991a99d 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/CompactBufferSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/CompactBufferSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.util.collection
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class CompactBufferSuite extends FunSuite {
+class CompactBufferSuite extends SparkFunSuite {
   test("empty buffer") {
     val b = new CompactBuffer[Int]
     assert(b.size === 0)
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
index dff8f3ddc816f..79eba61a87251 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
@@ -19,12 +19,10 @@ package org.apache.spark.util.collection
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.FunSuite
-
 import org.apache.spark._
 import org.apache.spark.io.CompressionCodec
 
-class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
+class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext {
   private val allCompressionCodecs = CompressionCodec.ALL_COMPRESSION_CODECS
   private def createCombiner[T](i: T) = ArrayBuffer[T](i)
   private def mergeValue[T](buffer: ArrayBuffer[T], i: T): ArrayBuffer[T] = buffer += i
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
index 7a98723bc6472..9039dbef1fb71 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
@@ -19,14 +19,14 @@ package org.apache.spark.util.collection
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.{FunSuite, PrivateMethodTester}
+import org.scalatest.PrivateMethodTester
 
 import scala.util.Random
 
 import org.apache.spark._
 import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
 
-class ExternalSorterSuite extends FunSuite with LocalSparkContext with PrivateMethodTester {
+class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with PrivateMethodTester {
   private def createSparkConf(loadDefaults: Boolean, kryo: Boolean): SparkConf = {
     val conf = new SparkConf(loadDefaults)
     if (kryo) {
diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
index ef890d2ba60f3..94e011799921b 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
@@ -19,12 +19,12 @@ package org.apache.spark.util.collection
 
 import scala.collection.mutable.HashSet
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.SizeEstimator
 
-class OpenHashMapSuite extends FunSuite with Matchers {
+class OpenHashMapSuite extends SparkFunSuite with Matchers {
 
   test("size for specialized, primitive value (int)") {
     val capacity = 1024
diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
index 68a03e3a0970f..2607a543dd614 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
@@ -17,12 +17,12 @@
 
 package org.apache.spark.util.collection
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.SizeEstimator
 
-class OpenHashSetSuite extends FunSuite with Matchers {
+class OpenHashSetSuite extends SparkFunSuite with Matchers {
 
   test("size for specialized, primitive int") {
     val loadFactor = 0.7
diff --git a/core/src/test/scala/org/apache/spark/util/collection/PartitionedSerializedPairBufferSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/PartitionedSerializedPairBufferSuite.scala
index b5a2d9ef720c1..6d2459d48d326 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/PartitionedSerializedPairBufferSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/PartitionedSerializedPairBufferSuite.scala
@@ -21,14 +21,13 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream}
 
 import com.google.common.io.ByteStreams
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers._
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.storage.{FileSegment, BlockObjectWriter}
 
-class PartitionedSerializedPairBufferSuite extends FunSuite {
+class PartitionedSerializedPairBufferSuite extends SparkFunSuite {
   test("OrderedInputStream single record") {
     val serializerInstance = new KryoSerializer(new SparkConf()).newInstance
 
diff --git a/core/src/test/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMapSuite.scala
index caf378fec8b3e..462bc2f29f9f8 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMapSuite.scala
@@ -19,12 +19,12 @@ package org.apache.spark.util.collection
 
 import scala.collection.mutable.HashSet
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.SizeEstimator
 
-class PrimitiveKeyOpenHashMapSuite extends FunSuite with Matchers {
+class PrimitiveKeyOpenHashMapSuite extends SparkFunSuite with Matchers {
 
   test("size for specialized, primitive key, value (int, int)") {
     val capacity = 1024
diff --git a/core/src/test/scala/org/apache/spark/util/collection/PrimitiveVectorSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/PrimitiveVectorSuite.scala
index 970dade628fe4..ae0eebc26f01b 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/PrimitiveVectorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/PrimitiveVectorSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.util.collection
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.SizeEstimator
 
-class PrimitiveVectorSuite extends FunSuite {
+class PrimitiveVectorSuite extends SparkFunSuite {
 
   test("primitive value") {
     val vector = new PrimitiveVector[Int]
diff --git a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala
index 1f33967249654..5a5919fca2469 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala
@@ -20,11 +20,10 @@ package org.apache.spark.util.collection
 import scala.reflect.ClassTag
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.SizeEstimator
 
-class SizeTrackerSuite extends FunSuite {
+class SizeTrackerSuite extends SparkFunSuite {
   val NORMAL_ERROR = 0.20
   val HIGH_ERROR = 0.30
 
diff --git a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
index e0d6cc16bde05..72fd6daba8de0 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
@@ -20,11 +20,10 @@ package org.apache.spark.util.collection
 import java.lang.{Float => JFloat, Integer => JInteger}
 import java.util.{Arrays, Comparator}
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.random.XORShiftRandom
 
-class SorterSuite extends FunSuite {
+class SorterSuite extends SparkFunSuite {
 
   test("equivalent to Arrays.sort") {
     val rand = new XORShiftRandom(123)
diff --git a/core/src/test/scala/org/apache/spark/util/io/ByteArrayChunkOutputStreamSuite.scala b/core/src/test/scala/org/apache/spark/util/io/ByteArrayChunkOutputStreamSuite.scala
index f855831b8e367..361ec95654f47 100644
--- a/core/src/test/scala/org/apache/spark/util/io/ByteArrayChunkOutputStreamSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/io/ByteArrayChunkOutputStreamSuite.scala
@@ -19,10 +19,10 @@ package org.apache.spark.util.io
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
 
-class ByteArrayChunkOutputStreamSuite extends FunSuite {
+class ByteArrayChunkOutputStreamSuite extends SparkFunSuite {
 
   test("empty output") {
     val o = new ByteArrayChunkOutputStream(1024)
diff --git a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
index 20944b62473c5..2f1e6a39f4554 100644
--- a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
@@ -21,9 +21,11 @@ import java.util.Random
 import scala.collection.mutable.ArrayBuffer
 import org.apache.commons.math3.distribution.PoissonDistribution
 
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 
-class RandomSamplerSuite extends FunSuite with Matchers {
+import org.apache.spark.SparkFunSuite
+
+class RandomSamplerSuite extends SparkFunSuite with Matchers {
   /**
    * My statistical testing methodology is to run a Kolmogorov-Smirnov (KS) test
    * between the random samplers and simple reference samplers (known to work correctly).
diff --git a/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
index 73a9d029b0248..667a4db6f7bb6 100644
--- a/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
@@ -20,9 +20,10 @@ package org.apache.spark.util.random
 import scala.util.Random
 
 import org.apache.commons.math3.distribution.{BinomialDistribution, PoissonDistribution}
-import org.scalatest.FunSuite
 
-class SamplingUtilsSuite extends FunSuite {
+import org.apache.spark.SparkFunSuite
+
+class SamplingUtilsSuite extends SparkFunSuite {
 
   test("reservoirSampleAndCount") {
     val input = Seq.fill(100)(Random.nextInt())
diff --git a/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala b/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
index 03f5f2d1b8528..6ca484ccd0c06 100644
--- a/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
@@ -17,16 +17,16 @@
 
 package org.apache.spark.util.random
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
 import org.apache.commons.math3.stat.inference.ChiSquareTest
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.Utils.times
 
 import scala.language.reflectiveCalls
 
-class XORShiftRandomSuite extends FunSuite with Matchers {
+class XORShiftRandomSuite extends SparkFunSuite with Matchers {
 
   def fixture: Object {val seed: Long; val hundMil: Int; val xorRand: XORShiftRandom} = new {
     val seed = 1L
diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index 1f3e619d97a24..bb2ec96715942 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -35,6 +35,13 @@
   <url>http://spark.apache.org/</url>
 
   <dependencies>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-lang3</artifactId>
diff --git a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
index 650b2fbe1c142..e9fbcb9db6b78 100644
--- a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
+++ b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
@@ -31,9 +31,10 @@ import org.apache.flume.Context
 import org.apache.flume.channel.MemoryChannel
 import org.apache.flume.event.EventBuilder
 import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
-import org.scalatest.FunSuite
 
-class SparkSinkSuite extends FunSuite {
+import org.apache.spark.SparkFunSuite
+
+class SparkSinkSuite extends SparkFunSuite {
   val eventsPerBatch = 1000
   val channelCapacity = 5000
 
diff --git a/external/flume/pom.xml b/external/flume/pom.xml
index 8df7edbdcad33..a345c03582ad6 100644
--- a/external/flume/pom.xml
+++ b/external/flume/pom.xml
@@ -41,6 +41,13 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-streaming-flume-sink_${scala.binary.version}</artifactId>
diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
index 93afe50c2134f..d772b9ca9b570 100644
--- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
+++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
@@ -31,16 +31,16 @@ import org.apache.flume.conf.Configurables
 import org.apache.flume.event.EventBuilder
 import org.scalatest.concurrent.Eventually._
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.{SparkConf, Logging}
+import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.dstream.ReceiverInputDStream
 import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext}
 import org.apache.spark.streaming.flume.sink._
 import org.apache.spark.util.{ManualClock, Utils}
 
-class FlumePollingStreamSuite extends FunSuite with BeforeAndAfter with Logging {
+class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {
 
   val batchCount = 5
   val eventsPerBatch = 100
diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
index 39e6754c81dbf..3d9daeb6e4363 100644
--- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
+++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
@@ -35,15 +35,15 @@ import org.jboss.netty.channel.ChannelPipeline
 import org.jboss.netty.channel.socket.SocketChannel
 import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
 import org.jboss.netty.handler.codec.compression._
-import org.scalatest.{BeforeAndAfter, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfter, Matchers}
 import org.scalatest.concurrent.Eventually._
 
-import org.apache.spark.{Logging, SparkConf}
+import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream}
 import org.apache.spark.util.Utils
 
-class FlumeStreamSuite extends FunSuite with BeforeAndAfter with Matchers with Logging {
+class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging {
   val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite")
 
   var ssc: StreamingContext = null
diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml
index 243ce6eaca658..5734d55bf4784 100644
--- a/external/kafka/pom.xml
+++ b/external/kafka/pom.xml
@@ -41,6 +41,13 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.kafka</groupId>
       <artifactId>kafka_${scala.binary.version}</artifactId>
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
index b6d314dfc7783..47bbfb605850a 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
@@ -28,10 +28,10 @@ import scala.language.postfixOps
 import kafka.common.TopicAndPartition
 import kafka.message.MessageAndMetadata
 import kafka.serializer.StringDecoder
-import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
+import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
 import org.scalatest.concurrent.Eventually
 
-import org.apache.spark.{Logging, SparkConf, SparkContext}
+import org.apache.spark.{Logging, SparkConf, SparkContext, SparkFunSuite}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.streaming.{Milliseconds, StreamingContext, Time}
 import org.apache.spark.streaming.dstream.DStream
@@ -39,7 +39,7 @@ import org.apache.spark.streaming.scheduler._
 import org.apache.spark.util.Utils
 
 class DirectKafkaStreamSuite
-  extends FunSuite
+  extends SparkFunSuite
   with BeforeAndAfter
   with BeforeAndAfterAll
   with Eventually
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala
index 7fb841b79cb65..d66830cbacdee 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala
@@ -20,9 +20,11 @@ package org.apache.spark.streaming.kafka
 import scala.util.Random
 
 import kafka.common.TopicAndPartition
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
-class KafkaClusterSuite extends FunSuite with BeforeAndAfterAll {
+import org.apache.spark.SparkFunSuite
+
+class KafkaClusterSuite extends SparkFunSuite with BeforeAndAfterAll {
   private val topic = "kcsuitetopic" + Random.nextInt(10000)
   private val topicAndPartition = TopicAndPartition(topic, 0)
   private var kc: KafkaCluster = null
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
index 3c875cb766513..054487269a935 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
@@ -22,11 +22,11 @@ import scala.util.Random
 import kafka.serializer.StringDecoder
 import kafka.common.TopicAndPartition
 import kafka.message.MessageAndMetadata
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark._
 
-class KafkaRDDSuite extends FunSuite with BeforeAndAfterAll {
+class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll {
 
   private var kafkaTestUtils: KafkaTestUtils = _
 
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala
index 24699dfc33adb..8ee2cc660f849 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala
@@ -23,14 +23,14 @@ import scala.language.postfixOps
 import scala.util.Random
 
 import kafka.serializer.StringDecoder
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 import org.scalatest.concurrent.Eventually
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.{Milliseconds, StreamingContext}
 
-class KafkaStreamSuite extends FunSuite with Eventually with BeforeAndAfterAll {
+class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
   private var ssc: StreamingContext = _
   private var kafkaTestUtils: KafkaTestUtils = _
 
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala
index 38548dd73b82c..80e2df62de3fe 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala
@@ -26,15 +26,15 @@ import scala.util.Random
 
 import kafka.serializer.StringDecoder
 import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
-import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
+import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
 import org.scalatest.concurrent.Eventually
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.{Milliseconds, StreamingContext}
 import org.apache.spark.util.Utils
 
-class ReliableKafkaStreamSuite extends FunSuite
+class ReliableKafkaStreamSuite extends SparkFunSuite
     with BeforeAndAfterAll with BeforeAndAfter with Eventually {
 
   private val sparkConf = new SparkConf()
diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml
index 98f95a9a64fa0..7d102e10ab60f 100644
--- a/external/mqtt/pom.xml
+++ b/external/mqtt/pom.xml
@@ -41,6 +41,13 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.eclipse.paho</groupId>
       <artifactId>org.eclipse.paho.client.mqttv3</artifactId>
diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala
index a19a72c58a705..c4bf5aa7869bb 100644
--- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala
+++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala
@@ -29,7 +29,7 @@ import org.apache.commons.lang3.RandomUtils
 import org.eclipse.paho.client.mqttv3._
 import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 import org.scalatest.concurrent.Eventually
 
 import org.apache.spark.streaming.{Milliseconds, StreamingContext}
@@ -37,10 +37,10 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.dstream.ReceiverInputDStream
 import org.apache.spark.streaming.scheduler.StreamingListener
 import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.util.Utils
 
-class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter {
+class MQTTStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter {
 
   private val batchDuration = Milliseconds(500)
   private val master = "local[2]"
diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml
index 8b6a8959ac4cf..d28e3e1846d70 100644
--- a/external/twitter/pom.xml
+++ b/external/twitter/pom.xml
@@ -41,6 +41,13 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.twitter4j</groupId>
       <artifactId>twitter4j-stream</artifactId>
diff --git a/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala b/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala
index 9ee57d7581d85..d9acb568879fe 100644
--- a/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala
+++ b/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala
@@ -18,16 +18,16 @@
 package org.apache.spark.streaming.twitter
 
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 import twitter4j.Status
 import twitter4j.auth.{NullAuthorization, Authorization}
 
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.streaming.{Seconds, StreamingContext}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.dstream.ReceiverInputDStream
 
-class TwitterStreamSuite extends FunSuite with BeforeAndAfter with Logging {
+class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {
 
   val batchDuration = Seconds(1)
 
diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml
index a50d378b34335..9998c11c85171 100644
--- a/external/zeromq/pom.xml
+++ b/external/zeromq/pom.xml
@@ -41,6 +41,13 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>${akka.group}</groupId>
       <artifactId>akka-zeromq_${scala.binary.version}</artifactId>
diff --git a/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala b/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala
index a7566e733d891..35d2e62c68480 100644
--- a/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala
+++ b/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala
@@ -20,13 +20,13 @@ package org.apache.spark.streaming.zeromq
 import akka.actor.SupervisorStrategy
 import akka.util.ByteString
 import akka.zeromq.Subscribe
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.{Seconds, StreamingContext}
 import org.apache.spark.streaming.dstream.ReceiverInputDStream
 
-class ZeroMQStreamSuite extends FunSuite {
+class ZeroMQStreamSuite extends SparkFunSuite {
 
   val batchDuration = Seconds(1)
 
diff --git a/graphx/pom.xml b/graphx/pom.xml
index d38a3aa8256b7..28b41228feb3d 100644
--- a/graphx/pom.xml
+++ b/graphx/pom.xml
@@ -40,6 +40,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/EdgeRDDSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/EdgeRDDSuite.scala
index eb1dbe52c2fda..f1ecc9e2219d1 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/EdgeRDDSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/EdgeRDDSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.graphx
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.storage.StorageLevel
 
-class EdgeRDDSuite extends FunSuite with LocalSparkContext {
+class EdgeRDDSuite extends SparkFunSuite with LocalSparkContext {
 
   test("cache, getStorageLevel") {
     // test to see if getStorageLevel returns correct value after caching
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/EdgeSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/EdgeSuite.scala
index 5a2c73b414279..7629128010193 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/EdgeSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/EdgeSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.graphx
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class EdgeSuite extends FunSuite {
+class EdgeSuite extends SparkFunSuite {
   test ("compare") {
     // decending order
     val testEdges: Array[Edge[Int]] = Array(
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
index 68fe83739e399..57a8b95dd12e9 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.graphx
 
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.graphx.Graph._
 import org.apache.spark.graphx.impl.EdgePartition
 import org.apache.spark.rdd._
-import org.scalatest.FunSuite
 
-class GraphOpsSuite extends FunSuite with LocalSparkContext {
+class GraphOpsSuite extends SparkFunSuite with LocalSparkContext {
 
   test("joinVertices") {
     withSpark { sc =>
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala
index 2b1d8e47326f8..1f5e27d5508b8 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala
@@ -17,16 +17,14 @@
 
 package org.apache.spark.graphx
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.graphx.Graph._
 import org.apache.spark.graphx.PartitionStrategy._
 import org.apache.spark.rdd._
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
 
-class GraphSuite extends FunSuite with LocalSparkContext {
+class GraphSuite extends SparkFunSuite with LocalSparkContext {
 
   def starGraph(sc: SparkContext, n: Int): Graph[String, Int] = {
     Graph.fromEdgeTuples(sc.parallelize((1 to n).map(x => (0: VertexId, x: VertexId)), 3), "v")
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/PregelSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/PregelSuite.scala
index 490b94429ea1f..8afa2d403b53f 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/PregelSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/PregelSuite.scala
@@ -17,12 +17,10 @@
 
 package org.apache.spark.graphx
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.rdd._
 
-class PregelSuite extends FunSuite with LocalSparkContext {
+class PregelSuite extends SparkFunSuite with LocalSparkContext {
 
   test("1 iteration") {
     withSpark { sc =>
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/VertexRDDSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/VertexRDDSuite.scala
index d0a7198d691d7..f1aa685a79c98 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/VertexRDDSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/VertexRDDSuite.scala
@@ -17,13 +17,11 @@
 
 package org.apache.spark.graphx
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.{HashPartitioner, SparkContext}
+import org.apache.spark.{HashPartitioner, SparkContext, SparkFunSuite}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
-class VertexRDDSuite extends FunSuite with LocalSparkContext {
+class VertexRDDSuite extends SparkFunSuite with LocalSparkContext {
 
   private def vertices(sc: SparkContext, n: Int) = {
     VertexRDD(sc.parallelize((0 to n).map(x => (x.toLong, x)), 5))
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/impl/EdgePartitionSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/impl/EdgePartitionSuite.scala
index 515f3a9cd02eb..7435647c6d9ee 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/impl/EdgePartitionSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/impl/EdgePartitionSuite.scala
@@ -20,15 +20,13 @@ package org.apache.spark.graphx.impl
 import scala.reflect.ClassTag
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.serializer.KryoSerializer
 
 import org.apache.spark.graphx._
 
-class EdgePartitionSuite extends FunSuite {
+class EdgePartitionSuite extends SparkFunSuite {
 
   def makeEdgePartition[A: ClassTag](xs: Iterable[(Int, Int, A)]): EdgePartition[A, Int] = {
     val builder = new EdgePartitionBuilder[A, Int]
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/impl/VertexPartitionSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/impl/VertexPartitionSuite.scala
index fe8304c1cdc32..1203f8959f506 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/impl/VertexPartitionSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/impl/VertexPartitionSuite.scala
@@ -17,15 +17,13 @@
 
 package org.apache.spark.graphx.impl
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.serializer.KryoSerializer
 
 import org.apache.spark.graphx._
 
-class VertexPartitionSuite extends FunSuite {
+class VertexPartitionSuite extends SparkFunSuite {
 
   test("isDefined, filter") {
     val vp = VertexPartition(Iterator((0L, 1), (1L, 1))).filter { (vid, attr) => vid == 0 }
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala
index accccfc232cd3..c965a6eb8df13 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala
@@ -17,16 +17,14 @@
 
 package org.apache.spark.graphx.lib
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.SparkContext._
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.util.GraphGenerators
 import org.apache.spark.rdd._
 
 
-class ConnectedComponentsSuite extends FunSuite with LocalSparkContext {
+class ConnectedComponentsSuite extends SparkFunSuite with LocalSparkContext {
 
   test("Grid Connected Components") {
     withSpark { sc =>
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/LabelPropagationSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/LabelPropagationSuite.scala
index 61fd0c4605568..808877f0590f8 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/LabelPropagationSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/LabelPropagationSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.graphx.lib
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx._
 
-class LabelPropagationSuite extends FunSuite with LocalSparkContext {
+class LabelPropagationSuite extends SparkFunSuite with LocalSparkContext {
   test("Label Propagation") {
     withSpark { sc =>
       // Construct a graph with two cliques connected by a single edge
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala
index 39c6ace912b00..45f1e3011035e 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.graphx.lib
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.util.GraphGenerators
 
@@ -57,7 +56,7 @@ object GridPageRank {
 }
 
 
-class PageRankSuite extends FunSuite with LocalSparkContext {
+class PageRankSuite extends SparkFunSuite with LocalSparkContext {
 
   def compareRanks(a: VertexRDD[Double], b: VertexRDD[Double]): Double = {
     a.leftJoin(b) { case (id, a, bOpt) => (a - bOpt.getOrElse(0.0)) * (a - bOpt.getOrElse(0.0)) }
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala
index 7bd6b7f3c4ab2..2991438f5e57e 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.graphx.lib
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx._
 
 
-class SVDPlusPlusSuite extends FunSuite with LocalSparkContext {
+class SVDPlusPlusSuite extends SparkFunSuite with LocalSparkContext {
 
   test("Test SVD++ with mean square error on training set") {
     withSpark { sc =>
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/ShortestPathsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/ShortestPathsSuite.scala
index f2c38e79c452c..d7eaa70ce6407 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/ShortestPathsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/ShortestPathsSuite.scala
@@ -17,16 +17,14 @@
 
 package org.apache.spark.graphx.lib
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.SparkContext._
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.lib._
 import org.apache.spark.graphx.util.GraphGenerators
 import org.apache.spark.rdd._
 
-class ShortestPathsSuite extends FunSuite with LocalSparkContext {
+class ShortestPathsSuite extends SparkFunSuite with LocalSparkContext {
 
   test("Shortest Path Computations") {
     withSpark { sc =>
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/StronglyConnectedComponentsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/StronglyConnectedComponentsSuite.scala
index 1f658c371ffcf..d6b03208180db 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/StronglyConnectedComponentsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/StronglyConnectedComponentsSuite.scala
@@ -17,16 +17,14 @@
 
 package org.apache.spark.graphx.lib
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.SparkContext._
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.util.GraphGenerators
 import org.apache.spark.rdd._
 
 
-class StronglyConnectedComponentsSuite extends FunSuite with LocalSparkContext {
+class StronglyConnectedComponentsSuite extends SparkFunSuite with LocalSparkContext {
 
   test("Island Strongly Connected Components") {
     withSpark { sc =>
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala
index 79bf4e6cd18ee..c47552cf3a3bd 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.graphx.lib
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.PartitionStrategy.RandomVertexCut
 
 
-class TriangleCountSuite extends FunSuite with LocalSparkContext {
+class TriangleCountSuite extends SparkFunSuite with LocalSparkContext {
 
   test("Count a single triangle") {
     withSpark { sc =>
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala
index f3b3738db0dad..186d0cc2a977b 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.graphx.util
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
 
-class BytecodeUtilsSuite extends FunSuite {
+class BytecodeUtilsSuite extends SparkFunSuite {
 
   import BytecodeUtilsSuite.TestClass
 
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/util/GraphGeneratorsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/util/GraphGeneratorsSuite.scala
index 8d9c8ddccbb3c..32e0c841c6997 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/util/GraphGeneratorsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/util/GraphGeneratorsSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.graphx.util
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx.LocalSparkContext
 
-class GraphGeneratorsSuite extends FunSuite with LocalSparkContext {
+class GraphGeneratorsSuite extends SparkFunSuite with LocalSparkContext {
 
   test("GraphGenerators.generateRandomEdges") {
     val src = 5
diff --git a/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala b/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala
index 67c262d0f9d8d..928301523fba9 100644
--- a/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala
+++ b/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.ml.util
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class IdentifiableSuite extends FunSuite {
+class IdentifiableSuite extends SparkFunSuite {
 
   import IdentifiableSuite.Test
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
index 2b04a3034782e..05bf58e63abaf 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
@@ -19,13 +19,13 @@ package org.apache.spark.ml
 
 import org.mockito.Matchers.{any, eq => meq}
 import org.mockito.Mockito.when
-import org.scalatest.FunSuite
 import org.scalatest.mock.MockitoSugar.mock
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.sql.DataFrame
 
-class PipelineSuite extends FunSuite {
+class PipelineSuite extends SparkFunSuite {
 
   abstract class MyModel extends Model[MyModel]
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala
index 17ddd335deb6d..512cffb1acb66 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.ml.attribute
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class AttributeGroupSuite extends FunSuite {
+class AttributeGroupSuite extends SparkFunSuite {
 
   test("attribute group") {
     val attrs = Array(
diff --git a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala
index ec9b717e41ce8..72b575d022547 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.ml.attribute
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types._
 
-class AttributeSuite extends FunSuite {
+class AttributeSuite extends SparkFunSuite {
 
   test("default numeric attribute") {
     val attr: NumericAttribute = NumericAttribute.defaultAttr
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
index 3fdc66be8a314..40554f6ef94a8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.classification
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.impl.TreeTests
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -29,7 +28,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
 
-class DecisionTreeClassifierSuite extends FunSuite with MLlibTestSparkContext {
+class DecisionTreeClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import DecisionTreeClassifierSuite.compareAPIs
 
@@ -251,7 +250,7 @@ class DecisionTreeClassifierSuite extends FunSuite with MLlibTestSparkContext {
   */
 }
 
-private[ml] object DecisionTreeClassifierSuite extends FunSuite {
+private[ml] object DecisionTreeClassifierSuite extends SparkFunSuite {
 
   /**
    * Train 2 decision trees on the given dataset, one using the old API and one using the new API.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index ea86867f1161a..09327051621e0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.classification
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.impl.TreeTests
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees => OldGBT}
@@ -31,7 +30,7 @@ import org.apache.spark.sql.DataFrame
 /**
  * Test suite for [[GBTClassifier]].
  */
-class GBTClassifierSuite extends FunSuite with MLlibTestSparkContext {
+class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import GBTClassifierSuite.compareAPIs
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 9f77d5f3efc55..a755cac3ea76e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.ml.classification
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.classification.LogisticRegressionSuite._
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.{DataFrame, Row}
 
-class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
+class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var dataset: DataFrame = _
   @transient var binaryDataset: DataFrame = _
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 770b56890fa45..f439f3261f06f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.classification
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.util.MetadataUtils
 import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
@@ -30,7 +29,7 @@ import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
-class OneVsRestSuite extends FunSuite with MLlibTestSparkContext {
+class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var dataset: DataFrame = _
   @transient var rdd: RDD[LabeledPoint] = _
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
index cdbbacab8e0e3..f699d0c374d2f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.classification
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.impl.TreeTests
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -32,7 +31,7 @@ import org.apache.spark.sql.DataFrame
 /**
  * Test suite for [[RandomForestClassifier]].
  */
-class RandomForestClassifierSuite extends FunSuite with MLlibTestSparkContext {
+class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import RandomForestClassifierSuite.compareAPIs
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index 3ea7aad5274f2..9da0618abd23c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.ml.evaluation
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.regression.LinearRegression
 import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
 
-class RegressionEvaluatorSuite extends FunSuite with MLlibTestSparkContext {
+class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("Regression Evaluator: default params") {
     /**
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
index 8f6c6b39dc93b..d4631518e0f5b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, Row}
 
-class BinarizerSuite extends FunSuite with MLlibTestSparkContext {
+class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var data: Array[Double] = _
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
index 0391bd8427c2c..507a8a7db24c7 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
@@ -19,15 +19,13 @@ package org.apache.spark.ml.feature
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.{DataFrame, Row}
 
-class BucketizerSuite extends FunSuite with MLlibTestSparkContext {
+class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("Bucket continuous features, without -inf,inf") {
     // Check a set of valid feature values.
@@ -110,7 +108,7 @@ class BucketizerSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-private object BucketizerSuite extends FunSuite {
+private object BucketizerSuite extends SparkFunSuite {
   /** Brute force search for buckets.  Bucket i is defined by the range [split(i), split(i+1)). */
   def linearSearchForBuckets(splits: Array[Double], feature: Double): Double = {
     require(feature >= splits.head)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
index 2e4beb0bfff63..7b2d70e644005 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.attribute.AttributeGroup
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
@@ -26,7 +25,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
-class HashingTFSuite extends FunSuite with MLlibTestSparkContext {
+class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("params") {
     val hashingTF = new HashingTF
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
index f85e85471617a..d83772e8be755 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.Row
 
-class IDFSuite extends FunSuite with MLlibTestSparkContext {
+class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = {
     dataSet.map {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala
index 9d09f24709e23..9f03470b7f328 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 
 
-class NormalizerSuite extends FunSuite with MLlibTestSparkContext {
+class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var data: Array[Vector] = _
   @transient var dataFrame: DataFrame = _
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
index 9018d0024d5f0..2e5036a844562 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions.col
 
-class OneHotEncoderSuite extends FunSuite with MLlibTestSparkContext {
+class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   def stringIndexed(): DataFrame = {
     val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
index aa230ca073d5b..feca866cd711d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
@@ -17,15 +17,15 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
 import org.scalatest.exceptions.TestFailedException
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.Row
 
-class PolynomialExpansionSuite extends FunSuite with MLlibTestSparkContext {
+class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("Polynomial expansion with default parameter") {
     val data = Array(
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
index 89c2fe45573aa..cbf1e8ddcb48a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class StringIndexerSuite extends FunSuite with MLlibTestSparkContext {
+class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("StringIndexer") {
     val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index eabda089d0988..ac279cb3215c2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -19,15 +19,14 @@ package org.apache.spark.ml.feature
 
 import scala.beans.BeanInfo
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, Row}
 
 @BeanInfo
 case class TokenizerTestData(rawText: String, wantedTokens: Array[String])
 
-class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
+class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
   import org.apache.spark.ml.feature.RegexTokenizerSuite._
 
   test("RegexTokenizer") {
@@ -60,7 +59,7 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-object RegexTokenizerSuite extends FunSuite {
+object RegexTokenizerSuite extends SparkFunSuite {
 
   def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
     t.transform(dataset)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
index 43534e89928b1..489abb5af7130 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
@@ -17,16 +17,14 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.functions.col
 
-class VectorAssemblerSuite extends FunSuite with MLlibTestSparkContext {
+class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("assemble") {
     import org.apache.spark.ml.feature.VectorAssembler.assemble
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
index b11b029c6343e..06affc7305cf5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
@@ -19,16 +19,14 @@ package org.apache.spark.ml.feature
 
 import scala.beans.{BeanInfo, BeanProperty}
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.ml.attribute._
 import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
-class VectorIndexerSuite extends FunSuite with MLlibTestSparkContext {
+class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import VectorIndexerSuite.FeatureData
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index df446d0c22015..94ebc3aebfa37 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.ml.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.{Row, SQLContext}
 
-class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
+class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("Word2Vec") {
     val sqlContext = new SQLContext(sc)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/impl/TreeTests.scala b/mllib/src/test/scala/org/apache/spark/ml/impl/TreeTests.scala
index 1505ad872536b..778abcba22c10 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/impl/TreeTests.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/impl/TreeTests.scala
@@ -19,8 +19,7 @@ package org.apache.spark.ml.impl
 
 import scala.collection.JavaConverters._
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
 import org.apache.spark.ml.tree._
@@ -29,7 +28,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{SQLContext, DataFrame}
 
 
-private[ml] object TreeTests extends FunSuite {
+private[ml] object TreeTests extends SparkFunSuite {
 
   /**
    * Convert the given data to a DataFrame, and set the features and label metadata.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index 04f2af4727ea4..f80e7749098a5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.ml.param
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class ParamsSuite extends FunSuite {
+class ParamsSuite extends SparkFunSuite {
 
   test("param") {
     val solver = new TestParams()
@@ -202,7 +202,7 @@ class ParamsSuite extends FunSuite {
   }
 }
 
-object ParamsSuite extends FunSuite {
+object ParamsSuite extends SparkFunSuite {
 
   /**
    * Checks common requirements for [[Params.params]]: 1) number of params; 2) params are ordered
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/shared/SharedParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/shared/SharedParamsSuite.scala
index ca18fa1ad3c15..eb5408d3fee7c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/shared/SharedParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/shared/SharedParamsSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.ml.param.shared
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.param.Params
 
-class SharedParamsSuite extends FunSuite {
+class SharedParamsSuite extends SparkFunSuite {
 
   test("outputCol") {
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
index 9a35555e52b90..2e5cfe7027eb6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
@@ -25,9 +25,8 @@ import scala.collection.mutable.ArrayBuffer
 import scala.language.existentials
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
-import org.scalatest.FunSuite
 
-import org.apache.spark.{Logging, SparkException}
+import org.apache.spark.{Logging, SparkException, SparkFunSuite}
 import org.apache.spark.ml.recommendation.ALS._
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLlibTestSparkContext
@@ -36,7 +35,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{Row, SQLContext}
 import org.apache.spark.util.Utils
 
-class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
+class ALSSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
 
   private var tempDir: File = _
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
index 1196a772dfdd4..1182b89a8e3aa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.regression
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.impl.TreeTests
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree,
@@ -28,7 +27,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
 
-class DecisionTreeRegressorSuite extends FunSuite with MLlibTestSparkContext {
+class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import DecisionTreeRegressorSuite.compareAPIs
 
@@ -69,7 +68,7 @@ class DecisionTreeRegressorSuite extends FunSuite with MLlibTestSparkContext {
   // TODO: test("model save/load")   SPARK-6725
 }
 
-private[ml] object DecisionTreeRegressorSuite extends FunSuite {
+private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite {
 
   /**
    * Train 2 decision trees on the given dataset, one using the old API and one using the new API.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index 40e7e3273e965..f8a1469fee313 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.regression
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.impl.TreeTests
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees => OldGBT}
@@ -31,7 +30,7 @@ import org.apache.spark.sql.DataFrame
 /**
  * Test suite for [[GBTRegressor]].
  */
-class GBTRegressorSuite extends FunSuite with MLlibTestSparkContext {
+class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import GBTRegressorSuite.compareAPIs
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 50a78631fa6d6..732e2c42be144 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.ml.regression
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.DenseVector
 import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.{DataFrame, Row}
 
-class LinearRegressionSuite extends FunSuite with MLlibTestSparkContext {
+class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var dataset: DataFrame = _
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
index 3efffbb763b78..78911560945a2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.ml.regression
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.impl.TreeTests
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest}
@@ -31,7 +30,7 @@ import org.apache.spark.sql.DataFrame
 /**
  * Test suite for [[RandomForestRegressor]].
  */
-class RandomForestRegressorSuite extends FunSuite with MLlibTestSparkContext {
+class RandomForestRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import RandomForestRegressorSuite.compareAPIs
 
@@ -98,7 +97,7 @@ class RandomForestRegressorSuite extends FunSuite with MLlibTestSparkContext {
   */
 }
 
-private object RandomForestRegressorSuite extends FunSuite {
+private object RandomForestRegressorSuite extends SparkFunSuite {
 
   /**
    * Train 2 models on the given dataset, one using the old API and one using the new API.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 60d8bfe38fb13..5ba469c7b10a0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.tuning
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.classification.LogisticRegression
@@ -29,7 +29,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spark.sql.types.StructType
 
-class CrossValidatorSuite extends FunSuite with MLlibTestSparkContext {
+class CrossValidatorSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var dataset: DataFrame = _
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamGridBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamGridBuilderSuite.scala
index 20aa100112bfe..810b70049ec15 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamGridBuilderSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamGridBuilderSuite.scala
@@ -19,11 +19,10 @@ package org.apache.spark.ml.tuning
 
 import scala.collection.mutable
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.param.{ParamMap, TestParams}
 
-class ParamGridBuilderSuite extends FunSuite {
+class ParamGridBuilderSuite extends SparkFunSuite {
 
   val solver = new TestParams()
   import solver.{inputCol, maxIter}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala
index 3d362b5ee53ea..59944416d96a6 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.mllib.api.python
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.recommendation.Rating
 
-class PythonMLLibAPISuite extends FunSuite {
+class PythonMLLibAPISuite extends SparkFunSuite {
 
   SerDe.initialize()
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
index b1014ab7c6203..e8f3d0c4db20a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -21,9 +21,9 @@ import scala.collection.JavaConversions._
 import scala.util.Random
 import scala.util.control.Breaks._
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
@@ -169,7 +169,7 @@ object LogisticRegressionSuite {
 }
 
 
-class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext with Matchers {
+class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers {
   def validatePrediction(
       predictions: Seq[Double],
       input: Seq[LabeledPoint],
@@ -541,7 +541,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext with M
 
 }
 
-class LogisticRegressionClusterSuite extends FunSuite with LocalClusterSparkContext {
+class LogisticRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small in both training and prediction using SGD optimizer") {
     val m = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index ea40b41bbbe5e..f7fc8730606af 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -21,9 +21,8 @@ import scala.util.Random
 
 import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum}
 import breeze.stats.distributions.{Multinomial => BrzMultinomial}
-import org.scalatest.FunSuite
 
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
@@ -86,7 +85,7 @@ object NaiveBayesSuite {
     pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), Multinomial)
 }
 
-class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
+class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import NaiveBayes.{Multinomial, Bernoulli}
 
@@ -286,7 +285,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-class NaiveBayesClusterSuite extends FunSuite with LocalClusterSparkContext {
+class NaiveBayesClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small in both training and prediction") {
     val m = 10
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
index 90f9cec6855bf..b1d78cba9e3dc 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
@@ -21,9 +21,8 @@ import scala.collection.JavaConversions._
 import scala.util.Random
 
 import org.jblas.DoubleMatrix
-import org.scalatest.FunSuite
 
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
@@ -62,7 +61,7 @@ object SVMSuite {
 
 }
 
-class SVMSuite extends FunSuite with MLlibTestSparkContext {
+class SVMSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) {
     val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
@@ -229,7 +228,7 @@ class SVMSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-class SVMClusterSuite extends FunSuite with LocalClusterSparkContext {
+class SVMClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small in both training and prediction") {
     val m = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
index 5683b55e8500a..e98b61e13e21f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
@@ -19,15 +19,14 @@ package org.apache.spark.mllib.classification
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.TestSuiteBase
 
-class StreamingLogisticRegressionSuite extends FunSuite with TestSuiteBase {
+class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase {
 
   // use longer wait time to ensure job completion
   override def maxWaitTimeMillis: Int = 30000
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
index f356ffa3e3a26..a3b085e441491 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.mllib.clustering
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{Vectors, Matrices}
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
-class GaussianMixtureSuite extends FunSuite with MLlibTestSparkContext {
+class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("single cluster") {
     val data = sc.parallelize(Array(
       Vectors.dense(6.0, 9.0),
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
index 877e6dc699523..0dbbd7127444f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
@@ -19,14 +19,13 @@ package org.apache.spark.mllib.clustering
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
-class KMeansSuite extends FunSuite with MLlibTestSparkContext {
+class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import org.apache.spark.mllib.clustering.KMeans.{K_MEANS_PARALLEL, RANDOM}
 
@@ -281,7 +280,7 @@ class KMeansSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-object KMeansSuite extends FunSuite {
+object KMeansSuite extends SparkFunSuite {
   def createModel(dim: Int, k: Int, isSparse: Boolean): KMeansModel = {
     val singlePoint = isSparse match {
       case true =>
@@ -305,7 +304,7 @@ object KMeansSuite extends FunSuite {
   }
 }
 
-class KMeansClusterSuite extends FunSuite with LocalClusterSparkContext {
+class KMeansClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small in both training and prediction") {
     val m = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index d5b7d96335744..406affa25539d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -19,13 +19,12 @@ package org.apache.spark.mllib.clustering
 
 import breeze.linalg.{DenseMatrix => BDM}
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{Vector, DenseMatrix, Matrix, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class LDASuite extends FunSuite with MLlibTestSparkContext {
+class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import LDASuite._
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
index 556842f3129a3..3903712879928 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
@@ -20,15 +20,13 @@ package org.apache.spark.mllib.clustering
 import scala.collection.mutable
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.graphx.{Edge, Graph}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
-class PowerIterationClusteringSuite extends FunSuite with MLlibTestSparkContext {
+class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import org.apache.spark.mllib.clustering.PowerIterationClustering._
 
@@ -130,7 +128,7 @@ class PowerIterationClusteringSuite extends FunSuite with MLlibTestSparkContext
   }
 }
 
-object PowerIterationClusteringSuite extends FunSuite {
+object PowerIterationClusteringSuite extends SparkFunSuite {
   def createModel(sc: SparkContext, k: Int, nPoints: Int): PowerIterationClusteringModel = {
     val assignments = sc.parallelize(
       (0 until nPoints).map(p => PowerIterationClustering.Assignment(p, Random.nextInt(k))))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
index 13f9b17c027a4..ac01622b8a089 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.mllib.clustering
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.streaming.TestSuiteBase
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.util.random.XORShiftRandom
 
-class StreamingKMeansSuite extends FunSuite with TestSuiteBase {
+class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase {
 
   override def maxWaitTimeMillis: Int = 30000
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/AreaUnderCurveSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/AreaUnderCurveSuite.scala
index 79847633ff0dc..87ccc7eda44ea 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/AreaUnderCurveSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/AreaUnderCurveSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.evaluation
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class AreaUnderCurveSuite extends FunSuite with MLlibTestSparkContext {
+class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("auc computation") {
     val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
     val auc = 4.0
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetricsSuite.scala
index e0224f960cc43..99d52fabc5309 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetricsSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.evaluation
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class BinaryClassificationMetricsSuite extends FunSuite with MLlibTestSparkContext {
+class BinaryClassificationMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   private def areWithinEpsilon(x: (Double, Double)): Boolean = x._1 ~= (x._2) absTol 1E-5
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala
index 7dc4f3cfbc4e4..d55bc8c3ec09f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.evaluation
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Matrices
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class MulticlassMetricsSuite extends FunSuite with MLlibTestSparkContext {
+class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("Multiclass evaluation metrics") {
     /*
      * Confusion matrix for 3-class classification with total 9 instances:
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
index 2537dd62c92f2..f3b19aeb42f84 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.evaluation
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
 
-class MultilabelMetricsSuite extends FunSuite with MLlibTestSparkContext {
+class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("Multilabel evaluation metrics") {
     /*
     * Documents true labels (5x class0, 3x class1, 4x class2):
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala
index 609eed983ff4e..c0924a213a844 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.evaluation
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class RankingMetricsSuite extends FunSuite with MLlibTestSparkContext {
+class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("Ranking metrics: map, ndcg") {
     val predictionAndLabels = sc.parallelize(
       Seq(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
index 3aa732474ec2e..9de2bdb6d7246 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.evaluation
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class RegressionMetricsSuite extends FunSuite with MLlibTestSparkContext {
+class RegressionMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("regression metrics") {
     val predictionAndObservations = sc.parallelize(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
index 747f5914598ec..889727fb55823 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.mllib.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class ChiSqSelectorSuite extends FunSuite with MLlibTestSparkContext {
+class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   /*
    *  Contingency tables
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala
index f3a482abda873..ccbf8a91cdd37 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.mllib.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext {
+class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("elementwise (hadamard) product should properly apply vector to dense data set") {
     val denseData = Array(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/HashingTFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/HashingTFSuite.scala
index 0c4dfb7b97c7f..cf279c02334e9 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/HashingTFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/HashingTFSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class HashingTFSuite extends FunSuite with MLlibTestSparkContext {
+class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("hashing tf on a single doc") {
     val hashingTF = new HashingTF(1000)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
index 0a5cad7caf8e4..21163633051e5 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.mllib.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class IDFSuite extends FunSuite with MLlibTestSparkContext {
+class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("idf") {
     val n = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/NormalizerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/NormalizerSuite.scala
index 5c4af2b99e68b..34122d6ed2e95 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/NormalizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/NormalizerSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.mllib.feature
 
-import org.scalatest.FunSuite
-
 import breeze.linalg.{norm => brzNorm}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class NormalizerSuite extends FunSuite with MLlibTestSparkContext {
+class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   val data = Array(
     Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
index 758af588f1c69..e57f49191378f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.mllib.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.linalg.distributed.RowMatrix
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class PCASuite extends FunSuite with MLlibTestSparkContext {
+class PCASuite extends SparkFunSuite with MLlibTestSparkContext {
 
   private val data = Array(
     Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala
index 1eb991869de40..6ab2fa6770123 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.mllib.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer}
 import org.apache.spark.rdd.RDD
 
-class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
+class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   // When the input data is all constant, the variance is zero. The standardization against
   // zero variance is not well-defined, but we decide to just set it into zero here.
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
index 98a98a7599bcb..b6818369208d7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.mllib.feature
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
-class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
+class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   // TODO: add more tests
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
index bd5b9cc3afa10..66ae3543ecc4e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
@@ -16,11 +16,10 @@
  */
 package org.apache.spark.mllib.fpm
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class FPGrowthSuite extends FunSuite with MLlibTestSparkContext {
+class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
 
 
   test("FP-Growth using String type") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPTreeSuite.scala
index 04017f67c311d..a56d7b3579213 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPTreeSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPTreeSuite.scala
@@ -19,11 +19,10 @@ package org.apache.spark.mllib.fpm
 
 import scala.language.existentials
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class FPTreeSuite extends FunSuite with MLlibTestSparkContext {
+class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("add transaction") {
     val tree = new FPTree[String]
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointerSuite.scala
index 699f009f0f2ec..d34888af2d73b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointerSuite.scala
@@ -17,18 +17,16 @@
 
 package org.apache.spark.mllib.impl
 
-import org.scalatest.FunSuite
-
 import org.apache.hadoop.fs.{FileSystem, Path}
 
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.graphx.{Edge, Graph}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
 
 
-class PeriodicGraphCheckpointerSuite extends FunSuite with MLlibTestSparkContext {
+class PeriodicGraphCheckpointerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   import PeriodicGraphCheckpointerSuite._
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
index 64ecd12ea7ded..bcc2e657f3fd4 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.linalg
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.mllib.linalg.BLAS._
 
-class BLASSuite extends FunSuite {
+class BLASSuite extends SparkFunSuite {
 
   test("copy") {
     val sx = Vectors.sparse(4, Array(0, 2), Array(1.0, -2.0))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeMatrixConversionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeMatrixConversionSuite.scala
index 2031032373971..dc04258e41d27 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeMatrixConversionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeMatrixConversionSuite.scala
@@ -17,11 +17,11 @@
 
 package org.apache.spark.mllib.linalg
 
-import org.scalatest.FunSuite
-
 import breeze.linalg.{DenseMatrix => BDM, CSCMatrix => BSM}
 
-class BreezeMatrixConversionSuite extends FunSuite {
+import org.apache.spark.SparkFunSuite
+
+class BreezeMatrixConversionSuite extends SparkFunSuite {
   test("dense matrix to breeze") {
     val mat = Matrices.dense(3, 2, Array(0.0, 1.0, 2.0, 3.0, 4.0, 5.0))
     val breeze = mat.toBreeze.asInstanceOf[BDM[Double]]
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala
index 8abdac72902c6..3772c9235ad3a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala
@@ -17,14 +17,14 @@
 
 package org.apache.spark.mllib.linalg
 
-import org.scalatest.FunSuite
-
 import breeze.linalg.{DenseVector => BDV, SparseVector => BSV}
 
+import org.apache.spark.SparkFunSuite
+
 /**
  * Test Breeze vector conversions.
  */
-class BreezeVectorConversionSuite extends FunSuite {
+class BreezeVectorConversionSuite extends SparkFunSuite {
 
   val arr = Array(0.1, 0.2, 0.3, 0.4)
   val n = 20
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
index 86119ec38101e..8dbb70f5d1c4c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
@@ -20,13 +20,13 @@ package org.apache.spark.mllib.linalg
 import java.util.Random
 
 import org.mockito.Mockito.when
-import org.scalatest.FunSuite
 import org.scalatest.mock.MockitoSugar._
 import scala.collection.mutable.{Map => MutableMap}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.TestingUtils._
 
-class MatricesSuite extends FunSuite {
+class MatricesSuite extends SparkFunSuite {
   test("dense matrix construction") {
     val m = 3
     val n = 2
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index 24755e9ff46fc..c6d29dcdb0f2b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -20,12 +20,11 @@ package org.apache.spark.mllib.linalg
 import scala.util.Random
 
 import breeze.linalg.{DenseMatrix => BDM, squaredDistance => breezeSquaredDistance}
-import org.scalatest.FunSuite
 
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.mllib.util.TestingUtils._
 
-class VectorsSuite extends FunSuite {
+class VectorsSuite extends SparkFunSuite {
 
   val arr = Array(0.1, 0.0, 0.3, 0.4)
   val n = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
index a58336175899c..93fe04c139b9a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
@@ -20,14 +20,13 @@ package org.apache.spark.mllib.linalg.distributed
 import java.{util => ju}
 
 import breeze.linalg.{DenseMatrix => BDM}
-import org.scalatest.FunSuite
 
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.mllib.linalg.{SparseMatrix, DenseMatrix, Matrices, Matrix}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
+class BlockMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   val m = 5
   val n = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala
index 04b36a9ef9990..f3728cd036a3f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.mllib.linalg.distributed
 
-import org.scalatest.FunSuite
-
 import breeze.linalg.{DenseMatrix => BDM}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.linalg.Vectors
 
-class CoordinateMatrixSuite extends FunSuite with MLlibTestSparkContext {
+class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   val m = 5
   val n = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
index 2ab53cc13db71..4a7b99a976f0a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.mllib.linalg.distributed
 
-import org.scalatest.FunSuite
-
 import breeze.linalg.{diag => brzDiag, DenseMatrix => BDM, DenseVector => BDV}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.linalg.{Matrices, Vectors}
 
-class IndexedRowMatrixSuite extends FunSuite with MLlibTestSparkContext {
+class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   val m = 4
   val n = 3
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
index 27bb19f472e1e..b6cb53d0c743e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
@@ -20,12 +20,12 @@ package org.apache.spark.mllib.linalg.distributed
 import scala.util.Random
 
 import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, norm => brzNorm, svd => brzSvd}
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{Matrices, Vectors, Vector}
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
 
-class RowMatrixSuite extends FunSuite with MLlibTestSparkContext {
+class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   val m = 4
   val n = 3
@@ -240,7 +240,7 @@ class RowMatrixSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-class RowMatrixClusterSuite extends FunSuite with LocalClusterSparkContext {
+class RowMatrixClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   var mat: RowMatrix = _
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
index e110506d579b0..a5a59e9fad5ae 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
@@ -20,8 +20,9 @@ package org.apache.spark.mllib.optimization
 import scala.collection.JavaConversions._
 import scala.util.Random
 
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
@@ -61,7 +62,7 @@ object GradientDescentSuite {
   }
 }
 
-class GradientDescentSuite extends FunSuite with MLlibTestSparkContext with Matchers {
+class GradientDescentSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers {
 
   test("Assert the loss is decreasing.") {
     val nPoints = 10000
@@ -140,7 +141,7 @@ class GradientDescentSuite extends FunSuite with MLlibTestSparkContext with Matc
   }
 }
 
-class GradientDescentClusterSuite extends FunSuite with LocalClusterSparkContext {
+class GradientDescentClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small") {
     val m = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala
index c8f2adcf155a7..d07b9d5b89227 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala
@@ -19,14 +19,15 @@ package org.apache.spark.mllib.optimization
 
 import scala.util.Random
 
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
 
-class LBFGSSuite extends FunSuite with MLlibTestSparkContext with Matchers {
+class LBFGSSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers {
 
   val nPoints = 10000
   val A = 2.0
@@ -229,7 +230,7 @@ class LBFGSSuite extends FunSuite with MLlibTestSparkContext with Matchers {
   }
 }
 
-class LBFGSClusterSuite extends FunSuite with LocalClusterSparkContext {
+class LBFGSClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small") {
     val m = 10
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/NNLSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/NNLSSuite.scala
index bb723fc471181..d8f9b8c33963d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/NNLSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/NNLSSuite.scala
@@ -19,13 +19,12 @@ package org.apache.spark.mllib.optimization
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
 import org.jblas.{DoubleMatrix, SimpleBlas}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.TestingUtils._
 
-class NNLSSuite extends FunSuite {
+class NNLSSuite extends SparkFunSuite {
   /** Generate an NNLS problem whose optimal solution is the all-ones vector. */
   def genOnesData(n: Int, rand: Random): (DoubleMatrix, DoubleMatrix) = {
     val A = new DoubleMatrix(n, n, Array.fill(n*n)(rand.nextDouble()): _*)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
index 0b646cf1ce6c4..7a724fc78b1d9 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
@@ -19,13 +19,13 @@ package org.apache.spark.mllib.pmml.export
 
 import org.dmg.pmml.RegressionModel
 import org.dmg.pmml.RegressionNormalizationMethodType
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.classification.LogisticRegressionModel
 import org.apache.spark.mllib.classification.SVMModel
 import org.apache.spark.mllib.util.LinearDataGenerator
 
-class BinaryClassificationPMMLModelExportSuite extends FunSuite {
+class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
 
   test("logistic regression PMML export") {
     val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
index f9afbd888dfc5..1d32309481787 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
@@ -18,12 +18,12 @@
 package org.apache.spark.mllib.pmml.export
 
 import org.dmg.pmml.RegressionModel
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
 import org.apache.spark.mllib.util.LinearDataGenerator
 
-class GeneralizedLinearPMMLModelExportSuite extends FunSuite {
+class GeneralizedLinearPMMLModelExportSuite extends SparkFunSuite {
 
   test("linear regression PMML export") {
     val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
index b985d0446d7b0..a1a683559a54c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
@@ -18,12 +18,12 @@
 package org.apache.spark.mllib.pmml.export
 
 import org.dmg.pmml.ClusteringModel
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.clustering.KMeansModel
 import org.apache.spark.mllib.linalg.Vectors
 
-class KMeansPMMLModelExportSuite extends FunSuite {
+class KMeansPMMLModelExportSuite extends SparkFunSuite {
 
   test("KMeansPMMLModelExport generate PMML format") {
     val clusterCenters = Array(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
index f28a4ac8ad01f..0d194005a30b2 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.mllib.pmml.export
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
 import org.apache.spark.mllib.clustering.KMeansModel
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
 import org.apache.spark.mllib.util.LinearDataGenerator
 
-class PMMLModelExportFactorySuite extends FunSuite {
+class PMMLModelExportFactorySuite extends SparkFunSuite {
 
   test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
     val clusterCenters = Array(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
index b792d819fdabb..a5ca1518f82f5 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
@@ -19,12 +19,11 @@ package org.apache.spark.mllib.random
 
 import scala.math
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.StatCounter
 
 // TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
-class RandomDataGeneratorSuite extends FunSuite {
+class RandomDataGeneratorSuite extends SparkFunSuite {
 
   def apiChecks(gen: RandomDataGenerator[Double]) {
     // resetting seed should generate the same sequence of random numbers
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
index 63f2ea916d457..413db2000d6d7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
@@ -19,8 +19,7 @@ package org.apache.spark.mllib.random
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.rdd.{RandomRDDPartition, RandomRDD}
@@ -34,7 +33,7 @@ import org.apache.spark.util.StatCounter
  *
  * TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
  */
-class RandomRDDsSuite extends FunSuite with MLlibTestSparkContext with Serializable {
+class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Serializable {
 
   def testGeneratedRDD(rdd: RDD[Double],
       expectedSize: Long,
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala
index 57216e8eb4a55..10f5a2be48f7c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.rdd
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
 
-class MLPairRDDFunctionsSuite extends FunSuite with MLlibTestSparkContext {
+class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("topByKey") {
     val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5,
       1), (3, 5)), 2)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala
index 6d6c0aa5be812..bc64172614830 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.rdd
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.rdd.RDDFunctions._
 
-class RDDFunctionsSuite extends FunSuite with MLlibTestSparkContext {
+class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("sliding") {
     val data = 0 until 6
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
index b3798940ddc38..05b87728d6fdb 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
@@ -21,9 +21,9 @@ import scala.collection.JavaConversions._
 import scala.math.abs
 import scala.util.Random
 
-import org.scalatest.FunSuite
 import org.jblas.DoubleMatrix
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.storage.StorageLevel
 
@@ -84,7 +84,7 @@ object ALSSuite {
 }
 
 
-class ALSSuite extends FunSuite with MLlibTestSparkContext {
+class ALSSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("rank-1 matrices") {
     testALS(50, 100, 1, 15, 0.7, 0.3)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModelSuite.scala
index 2c92866f3893d..2c8ed057a516a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModelSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModelSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.mllib.recommendation
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.Utils
 
-class MatrixFactorizationModelSuite extends FunSuite with MLlibTestSparkContext {
+class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   val rank = 2
   var userFeatures: RDD[(Int, Array[Double])] = _
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala
index 3b38bdf5ef5eb..ea4f2865757c1 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.mllib.regression
 
-import org.scalatest.{Matchers, FunSuite}
+import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
-class IsotonicRegressionSuite extends FunSuite with MLlibTestSparkContext with Matchers {
+class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers {
 
   private def round(d: Double) = {
     math.round(d * 100).toDouble / 100
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index 110c44a7193fd..d8364a06de4da 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.mllib.regression
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 
-class LabeledPointSuite extends FunSuite {
+class LabeledPointSuite extends SparkFunSuite {
 
   test("parse labeled points") {
     val points = Seq(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
index 71dce50922991..08a152ffc7a23 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
@@ -19,8 +19,7 @@ package org.apache.spark.mllib.regression
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, LinearDataGenerator,
   MLlibTestSparkContext}
@@ -32,7 +31,7 @@ private object LassoSuite {
   val model = new LassoModel(weights = Vectors.dense(0.1, 0.2, 0.3), intercept = 0.5)
 }
 
-class LassoSuite extends FunSuite with MLlibTestSparkContext {
+class LassoSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) {
     val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
@@ -143,7 +142,7 @@ class LassoSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-class LassoClusterSuite extends FunSuite with LocalClusterSparkContext {
+class LassoClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small in both training and prediction") {
     val m = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
index 3781931c2f819..f88a1c33c9f7c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
@@ -19,8 +19,7 @@ package org.apache.spark.mllib.regression
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, LinearDataGenerator,
   MLlibTestSparkContext}
@@ -32,7 +31,7 @@ private object LinearRegressionSuite {
   val model = new LinearRegressionModel(weights = Vectors.dense(0.1, 0.2, 0.3), intercept = 0.5)
 }
 
-class LinearRegressionSuite extends FunSuite with MLlibTestSparkContext {
+class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) {
     val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
@@ -150,7 +149,7 @@ class LinearRegressionSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-class LinearRegressionClusterSuite extends FunSuite with LocalClusterSparkContext {
+class LinearRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small in both training and prediction") {
     val m = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala
index d6c93cc0e49cd..7a781fee634c8 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala
@@ -20,8 +20,8 @@ package org.apache.spark.mllib.regression
 import scala.util.Random
 
 import org.jblas.DoubleMatrix
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, LinearDataGenerator,
   MLlibTestSparkContext}
@@ -33,7 +33,7 @@ private object RidgeRegressionSuite {
   val model = new RidgeRegressionModel(weights = Vectors.dense(0.1, 0.2, 0.3), intercept = 0.5)
 }
 
-class RidgeRegressionSuite extends FunSuite with MLlibTestSparkContext {
+class RidgeRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   def predictionError(predictions: Seq[Double], input: Seq[LabeledPoint]): Double = {
     predictions.zip(input).map { case (prediction, expected) =>
@@ -101,7 +101,7 @@ class RidgeRegressionSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-class RidgeRegressionClusterSuite extends FunSuite with LocalClusterSparkContext {
+class RidgeRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
 
   test("task size should be small in both training and prediction") {
     val m = 4
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
index 26604dbe6c1ef..9a379406d5061 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
@@ -19,14 +19,13 @@ package org.apache.spark.mllib.regression
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.LinearDataGenerator
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.TestSuiteBase
 
-class StreamingLinearRegressionSuite extends FunSuite with TestSuiteBase {
+class StreamingLinearRegressionSuite extends SparkFunSuite with TestSuiteBase {
 
   // use longer wait time to ensure job completion
   override def maxWaitTimeMillis: Int = 20000
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
index a7e6fce31ff7e..c292ced75e870 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
@@ -17,16 +17,15 @@
 
 package org.apache.spark.mllib.stat
 
-import org.scalatest.FunSuite
-
 import breeze.linalg.{DenseMatrix => BDM, Matrix => BM}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.stat.correlation.{Correlations, PearsonCorrelation,
   SpearmanCorrelation}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class CorrelationSuite extends FunSuite with MLlibTestSparkContext {
+class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   // test input data
   val xData = Array(1.0, 0.0, -2.0)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
index 15418e6035965..b084a5fb4313f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
@@ -19,16 +19,14 @@ package org.apache.spark.mllib.stat
 
 import java.util.Random
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.stat.test.ChiSqTest
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class HypothesisTestSuite extends FunSuite with MLlibTestSparkContext {
+class HypothesisTestSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("chi squared pearson goodness of fit") {
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
index a309c942cf8ff..5feccdf33681a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
@@ -18,11 +18,11 @@
 package org.apache.spark.mllib.stat
 
 import org.apache.commons.math3.distribution.NormalDistribution
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class KernelDensitySuite extends FunSuite with MLlibTestSparkContext {
+class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
   test("kernel density single sample") {
     val rdd = sc.parallelize(Array(5.0))
     val evaluationPoints = Array(5.0, 6.0)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala
index 23b0eec865de6..07efde4f5e6dc 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.mllib.stat
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.TestingUtils._
 
-class MultivariateOnlineSummarizerSuite extends FunSuite {
+class MultivariateOnlineSummarizerSuite extends SparkFunSuite {
 
   test("basic error handing") {
     val summarizer = new MultivariateOnlineSummarizer
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
index fac2498e4dcb3..703b623536315 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.mllib.stat.distribution
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{ Vectors, Matrices }
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class MultivariateGaussianSuite extends FunSuite with MLlibTestSparkContext {
+class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("univariate") {
     val x1 = Vectors.dense(0.0)
     val x2 = Vectors.dense(1.5)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
index ce983eb27fa35..356d957f15909 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
@@ -20,8 +20,7 @@ package org.apache.spark.mllib.tree
 import scala.collection.JavaConverters._
 import scala.collection.mutable
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.Algo._
@@ -34,7 +33,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.util.Utils
 
 
-class DecisionTreeSuite extends FunSuite with MLlibTestSparkContext {
+class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   /////////////////////////////////////////////////////////////////////////////
   // Tests examining individual elements of training
@@ -859,7 +858,7 @@ class DecisionTreeSuite extends FunSuite with MLlibTestSparkContext {
   }
 }
 
-object DecisionTreeSuite extends FunSuite {
+object DecisionTreeSuite extends SparkFunSuite {
 
   def validateClassifier(
       model: DecisionTreeModel,
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
index 55b0bac7d49fe..84dd3b342d4c0 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.mllib.tree
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
@@ -32,7 +31,7 @@ import org.apache.spark.util.Utils
 /**
  * Test suite for [[GradientBoostedTrees]].
  */
-class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
+class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("Regression with continuous features: SquaredError") {
     GradientBoostedTreesSuite.testCombinations.foreach {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/ImpuritySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/ImpuritySuite.scala
index 92b498580af03..49aff21fe7914 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/ImpuritySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/ImpuritySuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.mllib.tree
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.tree.impurity.{EntropyAggregator, GiniAggregator}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
 /**
  * Test suites for [[GiniAggregator]] and [[EntropyAggregator]].
  */
-class ImpuritySuite extends FunSuite with MLlibTestSparkContext {
+class ImpuritySuite extends SparkFunSuite with MLlibTestSparkContext {
   test("Gini impurity does not support negative labels") {
     val gini = new GiniAggregator(2)
     intercept[IllegalArgumentException] {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
index 4ed66953cb628..e6df5d974bf36 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
@@ -19,8 +19,7 @@ package org.apache.spark.mllib.tree
 
 import scala.collection.mutable
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.Algo._
@@ -35,7 +34,7 @@ import org.apache.spark.util.Utils
 /**
  * Test suite for [[RandomForest]].
  */
-class RandomForestSuite extends FunSuite with MLlibTestSparkContext {
+class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext {
   def binaryClassificationTestWithContinuousFeatures(strategy: Strategy) {
     val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000)
     val rdd = sc.parallelize(arr)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
index b184e936672ca..9d756da410325 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.mllib.tree.impl
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.tree.EnsembleTestHelper
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
 /**
  * Test suite for [[BaggedPoint]].
  */
-class BaggedPointSuite extends FunSuite with MLlibTestSparkContext  {
+class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext  {
 
   test("BaggedPoint RDD: without subsampling") {
     val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
index cdece2c174be4..87b3661f77944 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -21,19 +21,18 @@ import java.io.File
 
 import scala.io.Source
 
-import org.scalatest.FunSuite
-
 import breeze.linalg.{squaredDistance => breezeSquaredDistance}
 import com.google.common.base.Charsets
 import com.google.common.io.Files
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLUtils._
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
-class MLUtilsSuite extends FunSuite with MLlibTestSparkContext {
+class MLUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("epsilon computation") {
     assert(1.0 + EPSILON > 1.0, s"EPSILON is too small: $EPSILON.")
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index f68fb95eac4e4..8dcb9ba9be108 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark.mllib.util
 
-import org.scalatest.FunSuite
+import org.apache.spark.{SparkException, SparkFunSuite}
 
-import org.apache.spark.SparkException
-
-class NumericParserSuite extends FunSuite {
+class NumericParserSuite extends SparkFunSuite {
 
   test("parser") {
     val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)"
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
index 59e6c778806f4..8f475f30249d6 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
@@ -17,12 +17,12 @@
 
 package org.apache.spark.mllib.util
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
-import org.scalatest.FunSuite
 import org.apache.spark.mllib.util.TestingUtils._
 import org.scalatest.exceptions.TestFailedException
 
-class TestingUtilsSuite extends FunSuite {
+class TestingUtilsSuite extends SparkFunSuite {
 
   test("Comparing doubles using relative error.") {
 
diff --git a/repl/pom.xml b/repl/pom.xml
index 03053b4c3b287..6e5cb7f77e1df 100644
--- a/repl/pom.xml
+++ b/repl/pom.xml
@@ -48,6 +48,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-bagel_${scala.binary.version}</artifactId>
diff --git a/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala
index 934daaeaafca1..50fd43a418bca 100644
--- a/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala
+++ b/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala
@@ -22,13 +22,12 @@ import java.net.URLClassLoader
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.FunSuite
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.commons.lang3.StringEscapeUtils
 import org.apache.spark.util.Utils
 
 
-class ReplSuite extends FunSuite {
+class ReplSuite extends SparkFunSuite {
 
   def runInterpreter(master: String, input: String): String = {
     val CONF_EXECUTOR_CLASSPATH = "spark.executor.extraClassPath"
diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
index 14f5e9ed4f25e..9ecc7c229e38a 100644
--- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
+++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
@@ -24,14 +24,13 @@ import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.duration._
 import scala.tools.nsc.interpreter.SparkILoop
 
-import org.scalatest.FunSuite
 import org.apache.commons.lang3.StringEscapeUtils
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.util.Utils
 
 
 
-class ReplSuite extends FunSuite {
+class ReplSuite extends SparkFunSuite {
 
   def runInterpreter(master: String, input: String): String = {
     val CONF_EXECUTOR_CLASSPATH = "spark.executor.extraClassPath"
diff --git a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala
index c709cde740748..a58eda12b1120 100644
--- a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala
+++ b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala
@@ -25,7 +25,6 @@ import scala.language.implicitConversions
 import scala.language.postfixOps
 
 import org.scalatest.BeforeAndAfterAll
-import org.scalatest.FunSuite
 import org.scalatest.concurrent.Interruptor
 import org.scalatest.concurrent.Timeouts._
 import org.scalatest.mock.MockitoSugar
@@ -35,7 +34,7 @@ import org.apache.spark._
 import org.apache.spark.util.Utils
 
 class ExecutorClassLoaderSuite
-  extends FunSuite
+  extends SparkFunSuite
   with BeforeAndAfterAll
   with MockitoSugar
   with Logging {
diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml
index 5c322d032d474..d9e1cdb84bb27 100644
--- a/sql/catalyst/pom.xml
+++ b/sql/catalyst/pom.xml
@@ -50,6 +50,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-unsafe_${scala.binary.version}</artifactId>
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/DistributionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/DistributionSuite.scala
index ea82cd2622de9..c046dbf4dc2c9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/DistributionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/DistributionSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.sql.catalyst
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.plans.physical._
 
 /* Implicit conversions */
 import org.apache.spark.sql.catalyst.dsl.expressions._
 
-class DistributionSuite extends FunSuite {
+class DistributionSuite extends SparkFunSuite {
 
   protected def checkSatisfied(
       inputPartitioning: Partitioning,
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
index 7ff51db76b6bb..9a24b23024e18 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
@@ -20,8 +20,7 @@ package org.apache.spark.sql.catalyst
 import java.math.BigInteger
 import java.sql.{Date, Timestamp}
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.types._
 
@@ -75,7 +74,7 @@ case class MultipleConstructorsData(a: Int, b: String, c: Double) {
   def this(b: String, a: Int) = this(a, b, c = 1.0)
 }
 
-class ScalaReflectionSuite extends FunSuite {
+class ScalaReflectionSuite extends SparkFunSuite {
   import ScalaReflection._
 
   test("primitive data") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala
index 9eed15952d82b..b93a3abc6ebd2 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.catalyst
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.plans.logical.Command
-import org.scalatest.FunSuite
 
 private[sql] case class TestCommand(cmd: String) extends LogicalPlan with Command {
   override def output: Seq[Attribute] = Seq.empty
@@ -49,7 +49,7 @@ private[sql] class CaseInsensitiveTestParser extends AbstractSparkSQLParser {
     }
 }
 
-class SqlParserSuite extends FunSuite {
+class SqlParserSuite extends SparkFunSuite {
 
   test("test long keyword") {
     val parser = new SuperLongKeywordTestParser
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index fcff24ca31486..e09cd790a7187 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -17,8 +17,9 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
@@ -27,7 +28,7 @@ import org.apache.spark.sql.catalyst.SimpleCatalystConf
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 
-class AnalysisSuite extends FunSuite with BeforeAndAfter {
+class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
   val caseSensitiveConf = new SimpleCatalystConf(true)
   val caseInsensitiveConf = new SimpleCatalystConf(false)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
index 565b1cfe019c7..1b8d18ded2257 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
@@ -17,14 +17,15 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{Union, Project, LocalRelation}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.catalyst.SimpleCatalystConf
 
-class DecimalPrecisionSuite extends FunSuite with BeforeAndAfter {
+class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter {
   val conf = new SimpleCatalystConf(true)
   val catalog = new SimpleCatalog(conf)
   val analyzer = new Analyzer(catalog, EmptyFunctionRegistry, conf)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala
index f2f3a84d19380..97cfb5f06dd73 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types.IntegerType
 
-class AttributeSetSuite extends FunSuite {
+class AttributeSetSuite extends SparkFunSuite {
 
   val aUpper = AttributeReference("A", IntegerType)(exprId = ExprId(1))
   val aLower = AttributeReference("a", IntegerType)(exprId = ExprId(1))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index a14f776b1eaee..b511aa3a24420 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -22,9 +22,9 @@ import java.sql.{Date, Timestamp}
 import scala.collection.immutable.HashSet
 
 import org.scalactic.TripleEqualsSupport.Spread
-import org.scalatest.FunSuite
 import org.scalatest.Matchers._
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
 import org.apache.spark.sql.catalyst.dsl.expressions._
@@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 
 
-class ExpressionEvaluationBaseSuite extends FunSuite {
+class ExpressionEvaluationBaseSuite extends SparkFunSuite {
 
   def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = {
     expression.eval(inputRow)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
index 7a19e511eb8b5..88a36aa121b55 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
@@ -20,12 +20,16 @@ package org.apache.spark.sql.catalyst.expressions
 import scala.collection.JavaConverters._
 import scala.util.Random
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, TaskMemoryManager, MemoryAllocator}
-import org.scalatest.{BeforeAndAfterEach, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfterEach, Matchers}
 
 import org.apache.spark.sql.types._
 
-class UnsafeFixedWidthAggregationMapSuite extends FunSuite with Matchers with BeforeAndAfterEach {
+class UnsafeFixedWidthAggregationMapSuite
+  extends SparkFunSuite
+  with Matchers
+  with BeforeAndAfterEach {
 
   import UnsafeFixedWidthAggregationMap._
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index 3a60c7fd32675..61722f1ffa462 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -19,13 +19,14 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.util.Arrays
 
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.array.ByteArrayMethods
 
-class UnsafeRowConverterSuite extends FunSuite with Matchers {
+class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
 
   test("basic conversion with only primitive types") {
     val fieldTypes: Array[DataType] = Array(LongType, LongType, IntegerType)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
index e7cafcc96de87..765c1e2dda99f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.sql.catalyst.plans
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Filter, LogicalPlan}
 import org.apache.spark.sql.catalyst.util._
@@ -26,7 +25,7 @@ import org.apache.spark.sql.catalyst.util._
 /**
  * Provides helper methods for comparing plans.
  */
-class PlanTest extends FunSuite {
+class PlanTest extends SparkFunSuite {
 
   /**
    * Since attribute references are given globally unique ids during analysis,
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SameResultSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SameResultSuite.scala
index 1273921f6394c..62d5f6ac74885 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SameResultSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SameResultSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.sql.catalyst.plans
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions.{ExprId, AttributeReference}
@@ -28,7 +27,7 @@ import org.apache.spark.sql.catalyst.util._
 /**
  * Tests for the sameResult function of [[LogicalPlan]].
  */
-class SameResultSuite extends FunSuite {
+class SameResultSuite extends SparkFunSuite {
   val testRelation = LocalRelation('a.int, 'b.int, 'c.int)
   val testRelation2 = LocalRelation('a.int, 'b.int, 'c.int)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/RuleExecutorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/RuleExecutorSuite.scala
index 2a641c63f87bb..a7de7b052bdc3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/RuleExecutorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/RuleExecutorSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.sql.catalyst.trees
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal}
 import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor}
 
-class RuleExecutorSuite extends FunSuite {
+class RuleExecutorSuite extends SparkFunSuite {
   object DecrementLiterals extends Rule[Expression] {
     def apply(e: Expression): Expression = e transform {
       case IntegerLiteral(i) if i > 0 => Literal(i - 1)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
index 9fcfc51c96139..67db3d5e6d751 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
@@ -19,8 +19,7 @@ package org.apache.spark.sql.catalyst.trees
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{IntegerType, StringType, NullType}
 
@@ -32,7 +31,7 @@ case class Dummy(optKey: Option[Expression]) extends Expression {
   override def eval(input: Row): Any = null.asInstanceOf[Any]
 }
 
-class TreeNodeSuite extends FunSuite {
+class TreeNodeSuite extends SparkFunSuite {
   test("top node changed") {
     val after = Literal(1) transform { case Literal(1, _) => Literal(2) }
     assert(after === Literal(2))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala
index d7d60efee50fa..4030a1b1df358 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala
@@ -18,11 +18,11 @@
 package org.apache.spark.sql.catalyst.util
 
 import org.json4s.jackson.JsonMethods.parse
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types.{MetadataBuilder, Metadata}
 
-class MetadataSuite extends FunSuite {
+class MetadataSuite extends SparkFunSuite {
 
   val baseMetadata = new MetadataBuilder()
     .putString("purpose", "ml")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala
index 3e7cf7cbb5e63..c6171b7b6916d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.sql.types
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class DataTypeParserSuite extends FunSuite {
+class DataTypeParserSuite extends SparkFunSuite {
 
   def checkDataType(dataTypeString: String, expectedDataType: DataType): Unit = {
     test(s"parse ${dataTypeString.replace("\n", "")}") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index df119827812f9..543cdefc5293b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.sql.types
 
-import org.apache.spark.SparkException
-import org.scalatest.FunSuite
+import org.apache.spark.{SparkException, SparkFunSuite}
 
-class DataTypeSuite extends FunSuite {
+class DataTypeSuite extends SparkFunSuite {
 
   test("construct an ArrayType") {
     val array = ArrayType(StringType)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
index a22aa6f244c48..81d7ab010f394 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.types
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
 // scalastyle:off
-class UTF8StringSuite extends FunSuite {
+class UTF8StringSuite extends SparkFunSuite {
   test("basic") {
     def check(str: String, len: Int) {
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
index de6a2cd448c47..28b373e258311 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
@@ -17,12 +17,13 @@
 
 package org.apache.spark.sql.types.decimal
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types.Decimal
-import org.scalatest.{PrivateMethodTester, FunSuite}
+import org.scalatest.PrivateMethodTester
 
 import scala.language.postfixOps
 
-class DecimalSuite extends FunSuite with PrivateMethodTester {
+class DecimalSuite extends SparkFunSuite with PrivateMethodTester {
   test("creating decimals") {
     /** Check that a Decimal has the given string representation, precision and scale */
     def checkDecimal(d: Decimal, string: String, precision: Int, scale: Int): Unit = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index 46b1845a9180c..add0fd58e28c8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -17,13 +17,13 @@
 
 package org.apache.spark.sql
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers._
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.test.TestSQLContext.implicits._
 
-class DataFrameStatSuite extends FunSuite  {
+class DataFrameStatSuite extends SparkFunSuite  {
   
   val sqlCtx = TestSQLContext
   def toLetter(i: Int): String = (i + 97).toChar.toString
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index c4281c4b55c02..dd68965444f5d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -206,7 +206,7 @@ class MathExpressionsSuite extends QueryTest {
   }
 
   test("log") {
-    testOneToOneNonNegativeMathFunction(log, math.log)
+    testOneToOneNonNegativeMathFunction(org.apache.spark.sql.functions.log, math.log)
   }
 
   test("log10") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index fb3ba4bc1b908..513ac915dcb2a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -17,15 +17,15 @@
 
 package org.apache.spark.sql
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.execution.SparkSqlSerializer
-import org.scalatest.FunSuite
 
 import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, SpecificMutableRow}
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types._
 
-class RowSuite extends FunSuite {
+class RowSuite extends SparkFunSuite {
 
   test("create row") {
     val expected = new GenericMutableRow(4)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
index bf73d0c7074a5..3a5f071e2f7cb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.sql
 
-import org.scalatest.FunSuiteLike
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.test._
 
 /* Implicits */
 import TestSQLContext._
 
-class SQLConfSuite extends QueryTest with FunSuiteLike {
+class SQLConfSuite extends QueryTest {
 
   val testKey = "test.key.0"
   val testVal = "test.val.0"
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala
index f186bc1c18123..797d123b48668 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala
@@ -17,11 +17,12 @@
 
 package org.apache.spark.sql
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.test.TestSQLContext
 
-class SQLContextSuite extends FunSuite with BeforeAndAfterAll {
+class SQLContextSuite extends SparkFunSuite with BeforeAndAfterAll {
 
   private val testSqlContext = TestSQLContext
   private val testSparkContext = TestSQLContext.sparkContext
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
index 52d265b445e14..d2ede39f0a5f6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
@@ -19,8 +19,7 @@ package org.apache.spark.sql
 
 import java.sql.{Date, Timestamp}
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.test.TestSQLContext._
 
@@ -74,7 +73,7 @@ case class ComplexReflectData(
     mapFieldContainsNull: Map[Int, Option[Long]],
     dataField: Data)
 
-class ScalaReflectionRelationSuite extends FunSuite {
+class ScalaReflectionRelationSuite extends SparkFunSuite {
 
   import org.apache.spark.sql.test.TestSQLContext.implicits._
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala
index 6f6d3c9c243d4..1e8cde606b67b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala
@@ -17,13 +17,11 @@
 
 package org.apache.spark.sql
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.sql.test.TestSQLContext
 
-class SerializationSuite extends FunSuite {
+class SerializationSuite extends SparkFunSuite {
 
   test("[SPARK-5235] SQLContext should be serializable") {
     val sqlContext = new SQLContext(TestSQLContext.sparkContext)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
index 7cefcf44061ce..339e719f39f16 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.sql.columnar
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.types._
 
-class ColumnStatsSuite extends FunSuite {
+class ColumnStatsSuite extends SparkFunSuite {
   testColumnStats(classOf[ByteColumnStats], BYTE, Row(Byte.MaxValue, Byte.MinValue, 0))
   testColumnStats(classOf[ShortColumnStats], SHORT, Row(Short.MaxValue, Short.MinValue, 0))
   testColumnStats(classOf[IntColumnStats], INT, Row(Int.MaxValue, Int.MinValue, 0))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
index 061efb37a0ac3..a1e76eaa982cc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
@@ -23,15 +23,14 @@ import java.sql.Timestamp
 import com.esotericsoftware.kryo.{Serializer, Kryo}
 import com.esotericsoftware.kryo.io.{Input, Output}
 import org.apache.spark.serializer.KryoRegistrator
-import org.scalatest.FunSuite
 
-import org.apache.spark.{SparkConf, Logging}
+import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.columnar.ColumnarTestUtils._
 import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.sql.types._
 
-class ColumnTypeSuite extends FunSuite with Logging {
+class ColumnTypeSuite extends SparkFunSuite with Logging {
   val DEFAULT_BUFFER_SIZE = 512
 
   test("defaultSize") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
index a0702144f942c..2a6e0c376551a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
@@ -19,8 +19,7 @@ package org.apache.spark.sql.columnar
 
 import java.nio.ByteBuffer
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.types.DataType
 
@@ -39,7 +38,7 @@ object TestNullableColumnAccessor {
   }
 }
 
-class NullableColumnAccessorSuite extends FunSuite {
+class NullableColumnAccessorSuite extends SparkFunSuite {
   import ColumnarTestUtils._
 
   Seq(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
index 3a5605d2335d7..cb4e9f1eb7f46 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.sql.columnar
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.sql.types._
 
@@ -35,7 +34,7 @@ object TestNullableColumnBuilder {
   }
 }
 
-class NullableColumnBuilderSuite extends FunSuite {
+class NullableColumnBuilderSuite extends SparkFunSuite {
   import ColumnarTestUtils._
 
   Seq(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
index 2a0b701cad7fa..cda1b0992e36f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.sql.columnar
 
-import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
+import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
 import org.apache.spark.sql.test.TestSQLContext._
 import org.apache.spark.sql.test.TestSQLContext.implicits._
 
-class PartitionBatchPruningSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAfter {
+class PartitionBatchPruningSuite extends SparkFunSuite with BeforeAndAfterAll with BeforeAndAfter {
   val originalColumnBatchSize = conf.columnBatchSize
   val originalInMemoryPartitionPruning = conf.inMemoryPartitionPruning
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
index 8b518f094174c..20d65a74e3b7a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.sql.columnar.compression
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.columnar.{NoopColumnStats, BOOLEAN}
 import org.apache.spark.sql.columnar.ColumnarTestUtils._
 
-class BooleanBitSetSuite extends FunSuite {
+class BooleanBitSetSuite extends SparkFunSuite {
   import BooleanBitSet._
 
   def skeleton(count: Int) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/DictionaryEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/DictionaryEncodingSuite.scala
index cef60ec204faa..acfab6586c0d1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/DictionaryEncodingSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/DictionaryEncodingSuite.scala
@@ -19,14 +19,13 @@ package org.apache.spark.sql.columnar.compression
 
 import java.nio.ByteBuffer
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.columnar._
 import org.apache.spark.sql.columnar.ColumnarTestUtils._
 import org.apache.spark.sql.types.AtomicType
 
-class DictionaryEncodingSuite extends FunSuite {
+class DictionaryEncodingSuite extends SparkFunSuite {
   testDictionaryEncoding(new IntColumnStats, INT)
   testDictionaryEncoding(new LongColumnStats, LONG)
   testDictionaryEncoding(new StringColumnStats, STRING)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/IntegralDeltaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/IntegralDeltaSuite.scala
index 5514590541dd6..2111e9fbe62cb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/IntegralDeltaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/IntegralDeltaSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.sql.columnar.compression
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.columnar._
 import org.apache.spark.sql.columnar.ColumnarTestUtils._
 import org.apache.spark.sql.types.IntegralType
 
-class IntegralDeltaSuite extends FunSuite {
+class IntegralDeltaSuite extends SparkFunSuite {
   testIntegralDelta(new IntColumnStats, INT, IntDelta)
   testIntegralDelta(new LongColumnStats, LONG, LongDelta)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/RunLengthEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/RunLengthEncodingSuite.scala
index 6ee48f6291914..67ec08f594a43 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/RunLengthEncodingSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/RunLengthEncodingSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.sql.columnar.compression
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.columnar._
 import org.apache.spark.sql.columnar.ColumnarTestUtils._
 import org.apache.spark.sql.types.AtomicType
 
-class RunLengthEncodingSuite extends FunSuite {
+class RunLengthEncodingSuite extends SparkFunSuite {
   testRunLengthEncoding(new NoopColumnStats, BOOLEAN)
   testRunLengthEncoding(new ByteColumnStats, BYTE)
   testRunLengthEncoding(new ShortColumnStats, SHORT)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 523be56df65ba..45a7e8fe68f72 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.sql.execution
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.{SQLConf, execution}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.TestData._
@@ -31,7 +30,7 @@ import org.apache.spark.sql.test.TestSQLContext.planner._
 import org.apache.spark.sql.types._
 
 
-class PlannerSuite extends FunSuite {
+class PlannerSuite extends SparkFunSuite {
   test("unions are collapsed") {
     val query = testData.unionAll(testData).unionAll(testData).logicalPlan
     val planned = BasicOperators(query).head
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
index 15337c4045436..6ca5390cde23e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
@@ -19,17 +19,17 @@ package org.apache.spark.sql.execution
 
 import java.sql.{Timestamp, Date}
 
-import org.scalatest.{FunSuite, BeforeAndAfterAll}
+import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.rdd.ShuffledRDD
 import org.apache.spark.serializer.Serializer
-import org.apache.spark.ShuffleDependency
+import org.apache.spark.{ShuffleDependency, SparkFunSuite}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.test.TestSQLContext._
 import org.apache.spark.sql.{MyDenseVectorUDT, QueryTest}
 
-class SparkSqlSerializer2DataTypeSuite extends FunSuite {
+class SparkSqlSerializer2DataTypeSuite extends SparkFunSuite {
   // Make sure that we will not use serializer2 for unsupported data types.
   def checkSupported(dataType: DataType, isSupported: Boolean): Unit = {
     val testName =
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala
index 358d8cf06e463..8ec3985e00360 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.sql.execution.debug
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.test.TestSQLContext._
 
-class DebuggingSuite extends FunSuite {
+class DebuggingSuite extends SparkFunSuite {
   test("DataFrame.debug()") {
     testData.debug()
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
index 2aad01ded1acf..5290c28cfca02 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.sql.execution.joins
 
-import org.scalatest.FunSuite
-
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.{Projection, Row}
 import org.apache.spark.util.collection.CompactBuffer
 
 
-class HashedRelationSuite extends FunSuite {
+class HashedRelationSuite extends SparkFunSuite {
 
   // Key is simply the record itself
   private val keyProjection = new Projection {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 30279f528944b..af279007c587e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -21,14 +21,15 @@ import java.math.BigDecimal
 import java.sql.DriverManager
 import java.util.{Calendar, GregorianCalendar, Properties}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.test._
 import org.apache.spark.sql.types._
 import org.h2.jdbc.JdbcSQLException
-import org.scalatest.{FunSuite, BeforeAndAfter}
+import org.scalatest.BeforeAndAfter
 import TestSQLContext._
 import TestSQLContext.implicits._
 
-class JDBCSuite extends FunSuite with BeforeAndAfter {
+class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   val url = "jdbc:h2:mem:testdb0"
   val urlWithUserAndPass = "jdbc:h2:mem:testdb0;user=testUser;password=testPass"
   var conn: java.sql.Connection = null
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
index 2e4c12f9da80c..3cd987b0b3383 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
@@ -20,13 +20,14 @@ package org.apache.spark.sql.jdbc
 import java.sql.DriverManager
 import java.util.Properties
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.{SaveMode, Row}
 import org.apache.spark.sql.test._
 import org.apache.spark.sql.types._
 
-class JDBCWriteSuite extends FunSuite with BeforeAndAfter {
+class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
   val url = "jdbc:h2:mem:testdb2"
   var conn: java.sql.Connection = null
   val url1 = "jdbc:h2:mem:testdb3"
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index c964b6d984557..caec2a6f25489 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -20,14 +20,14 @@ package org.apache.spark.sql.parquet
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
 
-import org.scalatest.FunSuite
 import parquet.schema.MessageTypeParser
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.types._
 
-class ParquetSchemaSuite extends FunSuite with ParquetTest {
+class ParquetSchemaSuite extends SparkFunSuite with ParquetTest {
   val sqlContext = TestSQLContext
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
index 8331a14c9295c..296b0d6f74a0c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.sql.sources
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class ResolvedDataSourceSuite extends FunSuite {
+class ResolvedDataSourceSuite extends SparkFunSuite {
 
   test("builtin sources") {
     assert(ResolvedDataSource.lookupDataSource("jdbc") ===
diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml
index 437f697d25bf3..20d3c7d4c5959 100644
--- a/sql/hive-thriftserver/pom.xml
+++ b/sql/hive-thriftserver/pom.xml
@@ -41,6 +41,13 @@
       <artifactId>spark-hive_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index cc07db827d359..3732af7870b93 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -25,16 +25,16 @@ import scala.concurrent.{Await, Promise}
 import scala.sys.process.{Process, ProcessLogger}
 
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.util.Utils
 
 /**
  * A test suite for the `spark-sql` CLI tool.  Note that all test cases share the same temporary
  * Hive metastore and warehouse.
  */
-class CliSuite extends FunSuite with BeforeAndAfter with Logging {
+class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging {
   val warehousePath = Utils.createTempDir()
   val metastorePath = Utils.createTempDir()
 
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
index 610939c6a9481..da511ebd05ad2 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
@@ -37,9 +37,9 @@ import org.apache.hive.service.cli.thrift.TCLIService.Client
 import org.apache.hive.service.cli.thrift.ThriftCLIServiceClient
 import org.apache.thrift.protocol.TBinaryProtocol
 import org.apache.thrift.transport.TSocket
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.sql.hive.HiveShim
 import org.apache.spark.util.Utils
 
@@ -405,7 +405,7 @@ abstract class HiveThriftJdbcTest extends HiveThriftServer2Test {
   }
 }
 
-abstract class HiveThriftServer2Test extends FunSuite with BeforeAndAfterAll with Logging {
+abstract class HiveThriftServer2Test extends SparkFunSuite with BeforeAndAfterAll with Logging {
   def mode: ServerMode.Value
 
   private val CLASS_NAME = HiveThriftServer2.getClass.getCanonicalName.stripSuffix("$")
diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml
index 615b07e74d535..923ffabb9b99e 100644
--- a/sql/hive/pom.xml
+++ b/sql/hive/pom.xml
@@ -41,6 +41,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-sql_${scala.binary.version}</artifactId>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
index 80c2d32bf70d7..df137e7b2b333 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
@@ -26,12 +26,12 @@ import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectIns
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
 import org.apache.hadoop.io.LongWritable
-import org.scalatest.FunSuite
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.{Literal, Row}
 import org.apache.spark.sql.types._
 
-class HiveInspectorSuite extends FunSuite with HiveInspectors {
+class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
   test("Test wrap SettableStructObjectInspector") {
     val udaf = new UDAFPercentile.PercentileLongEvaluator()
     udaf.init()
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index fa8e11ffec2b4..e9bb32667936c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -17,13 +17,13 @@
 
 package org.apache.spark.sql.hive
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.hive.test.TestHive
-import org.scalatest.FunSuite
 
 import org.apache.spark.sql.test.ExamplePointUDT
 import org.apache.spark.sql.types.StructType
 
-class HiveMetastoreCatalogSuite extends FunSuite {
+class HiveMetastoreCatalogSuite extends SparkFunSuite {
 
   test("struct field should accept underscore in sub-column name") {
     val metastr = "struct<a: int, b_1: string, c: string>"
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala
index 941a2941649b8..f765395e148af 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala
@@ -20,12 +20,13 @@ package org.apache.spark.sql.hive
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.serde.serdeConstants
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.hive.client.{ManagedTable, HiveColumn, ExternalTable, HiveTable}
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
 
-class HiveQlSuite extends FunSuite with BeforeAndAfterAll {
+class HiveQlSuite extends SparkFunSuite with BeforeAndAfterAll {
   override def beforeAll() {
     if (SessionState.get() == null) {
       SessionState.start(new HiveConf())
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala
index 8afe5459d4f1b..a492ecf203d17 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala
@@ -17,13 +17,11 @@
 
 package org.apache.spark.sql.hive
 
-import org.scalatest.FunSuite
-
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.sql.hive.test.TestHive
 
-class SerializationSuite extends FunSuite {
+class SerializationSuite extends SparkFunSuite {
 
   test("[SPARK-5840] HiveContext should be serializable") {
     val hiveContext = TestHive
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index 321dc8d7322b8..446a2f2d646e1 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.sql.hive.client
 
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.sql.catalyst.util.quietly
 import org.apache.spark.util.Utils
-import org.scalatest.FunSuite
 
 /**
  * A simple set of tests that call the methods of a hive ClientInterface, loading different version 
@@ -28,7 +27,7 @@ import org.scalatest.FunSuite
  * sure that reflective calls are not throwing NoSuchMethod error, but the actually functionallity 
  * is not fully tested.
  */
-class VersionsSuite extends FunSuite with Logging {
+class VersionsSuite extends SparkFunSuite with Logging {
   private def buildConf() = {
     lazy val warehousePath = Utils.createTempDir()
     lazy val metastorePath = Utils.createTempDir()
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ConcurrentHiveSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ConcurrentHiveSuite.scala
index 23ece7e7cf6e9..b0d3dd44daedc 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ConcurrentHiveSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ConcurrentHiveSuite.scala
@@ -17,11 +17,11 @@
 
 package org.apache.spark.sql.hive.execution
 
-import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 import org.apache.spark.sql.hive.test.TestHiveContext
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
-class ConcurrentHiveSuite extends FunSuite with BeforeAndAfterAll {
+class ConcurrentHiveSuite extends SparkFunSuite with BeforeAndAfterAll {
   ignore("multiple instances not supported") {
     test("Multiple Hive Instances") {
       (1 to 10).map { i =>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index 55e5551b63818..c9dd4c0935a72 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -19,9 +19,9 @@ package org.apache.spark.sql.hive.execution
 
 import java.io._
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite, GivenWhenThen}
+import org.scalatest.{BeforeAndAfterAll, GivenWhenThen}
 
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.sql.sources.DescribeCommand
 import org.apache.spark.sql.execution.{SetCommand, ExplainCommand}
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
@@ -40,7 +40,7 @@ import org.apache.spark.sql.hive.test.TestHive
  * configured using system properties.
  */
 abstract class HiveComparisonTest
-  extends FunSuite with BeforeAndAfterAll with GivenWhenThen with Logging {
+  extends SparkFunSuite with BeforeAndAfterAll with GivenWhenThen with Logging {
 
   /**
    * When set, any cache files that result in test failures will be deleted.  Used when the test
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
index 88c99e35260d9..0e63d84e9824a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
@@ -19,13 +19,14 @@ package org.apache.spark.sql.hive.orc
 
 import java.io.File
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.util.Utils
-import org.scalatest.{BeforeAndAfterAll, FunSuiteLike}
+import org.scalatest.BeforeAndAfterAll
 
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
@@ -38,7 +39,7 @@ case class OrcParData(intField: Int, stringField: String)
 case class OrcParDataWithKey(intField: Int, pi: Int, stringField: String, ps: String)
 
 // TODO This test suite duplicates ParquetPartitionDiscoverySuite a lot
-class OrcPartitionDiscoverySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
+class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll {
   val defaultPartitionName = ConfVars.DEFAULTPARTITIONNAME.defaultVal
 
   def withTempDir(f: File => Unit): Unit = {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index cdd6e705f4a2c..57c23fe77f8b5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -21,8 +21,9 @@ import java.io.File
 
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hadoop.hive.ql.io.orc.CompressionKind
-import org.scalatest.{BeforeAndAfterAll, FunSuiteLike}
+import org.scalatest.BeforeAndAfterAll
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.hive.test.TestHive
@@ -50,7 +51,7 @@ case class Contact(name: String, phone: String)
 
 case class Person(name: String, age: Int, contacts: Seq[Contact])
 
-class OrcQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll with OrcTest {
+class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
   override val sqlContext = TestHive
 
   import TestHive.read
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index cf5ae88dc4bee..af36fa6f1faae 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -18,9 +18,8 @@
 package org.apache.spark.sql.sources
 
 import org.apache.hadoop.fs.Path
-import org.scalatest.FunSuite
 
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
 import org.apache.spark.sql.hive.test.TestHive
@@ -485,7 +484,7 @@ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
   }
 }
 
-class CommitFailureTestRelationSuite extends FunSuite with SQLTestUtils {
+class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils {
   import TestHive.implicits._
 
   override val sqlContext = TestHive
diff --git a/streaming/pom.xml b/streaming/pom.xml
index 5ab7f4472c38b..49d035a1e9696 100644
--- a/streaming/pom.xml
+++ b/streaming/pom.xml
@@ -40,6 +40,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
 
     <!-- Explicit listing of transitive deps that are shaded. Otherwise, odd compiler crashes. -->
     <dependency>
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala
index 6a1dd6949b204..9b5e4dc819a2b 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala
@@ -19,9 +19,9 @@ package org.apache.spark.streaming
 
 import java.io.NotSerializableException
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
 
-import org.apache.spark.{HashPartitioner, SparkContext, SparkException}
+import org.apache.spark.{HashPartitioner, SparkContext, SparkException, SparkFunSuite}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.util.ReturnStatementInClosureException
@@ -29,7 +29,7 @@ import org.apache.spark.util.ReturnStatementInClosureException
 /**
  * Test that closures passed to DStream operations are actually cleaned.
  */
-class DStreamClosureSuite extends FunSuite with BeforeAndAfterAll {
+class DStreamClosureSuite extends SparkFunSuite with BeforeAndAfterAll {
   private var ssc: StreamingContext = null
 
   override def beforeAll(): Unit = {
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala
index e3fb2ef130859..8844c9d74b933 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.streaming
 
-import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
+import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
 
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.rdd.RDDOperationScope
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.ui.UIUtils
@@ -27,7 +27,7 @@ import org.apache.spark.streaming.ui.UIUtils
 /**
  * Tests whether scope information is passed from DStream operations to RDDs correctly.
  */
-class DStreamScopeSuite extends FunSuite with BeforeAndAfter with BeforeAndAfterAll {
+class DStreamScopeSuite extends SparkFunSuite with BeforeAndAfter with BeforeAndAfterAll {
   private var ssc: StreamingContext = null
   private val batchDuration: Duration = Seconds(1)
 
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala
index 23804237bda80..cca8cedb1d080 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala
@@ -25,7 +25,7 @@ import scala.concurrent.duration._
 import scala.language.postfixOps
 
 import org.apache.hadoop.conf.Configuration
-import org.scalatest.{BeforeAndAfter, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfter, Matchers}
 import org.scalatest.concurrent.Eventually._
 
 import org.apache.spark._
@@ -41,7 +41,11 @@ import org.apache.spark.util.{ManualClock, Utils}
 import WriteAheadLogBasedBlockHandler._
 import WriteAheadLogSuite._
 
-class ReceivedBlockHandlerSuite extends FunSuite with BeforeAndAfter with Matchers with Logging {
+class ReceivedBlockHandlerSuite
+  extends SparkFunSuite
+  with BeforeAndAfter
+  with Matchers
+  with Logging {
 
   val conf = new SparkConf().set("spark.streaming.receiver.writeAheadLog.rollingIntervalSecs", "1")
   val hadoopConf = new Configuration()
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala
index b1af8d5eaacfb..6f0ee774cb5cf 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala
@@ -25,10 +25,10 @@ import scala.language.{implicitConversions, postfixOps}
 import scala.util.Random
 
 import org.apache.hadoop.conf.Configuration
-import org.scalatest.{BeforeAndAfter, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfter, Matchers}
 import org.scalatest.concurrent.Eventually._
 
-import org.apache.spark.{Logging, SparkConf, SparkException}
+import org.apache.spark.{Logging, SparkConf, SparkException, SparkFunSuite}
 import org.apache.spark.storage.StreamBlockId
 import org.apache.spark.streaming.receiver.BlockManagerBasedStoreResult
 import org.apache.spark.streaming.scheduler._
@@ -37,7 +37,7 @@ import org.apache.spark.streaming.util.WriteAheadLogSuite._
 import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils}
 
 class ReceivedBlockTrackerSuite
-  extends FunSuite with BeforeAndAfter with Matchers with Logging {
+  extends SparkFunSuite with BeforeAndAfter with Matchers with Logging {
 
   val hadoopConf = new Configuration()
   val akkaTimeout = 10 seconds
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index e36c7914b130e..d304c9a7328f3 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -25,16 +25,16 @@ import org.scalatest.concurrent.Eventually._
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.exceptions.TestFailedDueToTimeoutException
 import org.scalatest.time.SpanSugar._
-import org.scalatest.{Assertions, BeforeAndAfter, FunSuite}
+import org.scalatest.{Assertions, BeforeAndAfter}
 
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.receiver.Receiver
 import org.apache.spark.util.Utils
-import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException}
+import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException, SparkFunSuite}
 
 
-class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts with Logging {
+class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeouts with Logging {
 
   val master = "local[2]"
   val appName = this.getClass.getSimpleName
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
index 554cd30223f44..31b1aebf6a8ec 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
@@ -24,12 +24,12 @@ import scala.collection.mutable.SynchronizedBuffer
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 import org.scalatest.time.{Span, Seconds => ScalaTestSeconds}
 import org.scalatest.concurrent.Eventually.timeout
 import org.scalatest.concurrent.PatienceConfiguration
 
-import org.apache.spark.{SparkConf, Logging}
+import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream}
 import org.apache.spark.streaming.scheduler._
@@ -204,7 +204,7 @@ class BatchCounter(ssc: StreamingContext) {
  * This is the base trait for Spark Streaming testsuites. This provides basic functionality
  * to run user-defined set of input on user-defined stream operations, and verify the output.
  */
-trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging {
+trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging {
 
   // Name of the framework for Spark context
   def framework: String = this.getClass.getSimpleName
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
index 441bbf95d0153..021d2c95a4aad 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
@@ -35,7 +35,7 @@ import org.apache.spark._
  * Selenium tests for the Spark Web UI.
  */
 class UISeleniumSuite
-  extends FunSuite with WebBrowser with Matchers with BeforeAndAfterAll with TestSuiteBase {
+  extends SparkFunSuite with WebBrowser with Matchers with BeforeAndAfterAll with TestSuiteBase {
 
   implicit var webDriver: WebDriver = _
 
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala
index 6859b65c7165f..cb017b798b2a4 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala
@@ -21,15 +21,15 @@ import java.io.File
 import scala.util.Random
 
 import org.apache.hadoop.conf.Configuration
-import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
+import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}
 
 import org.apache.spark.storage.{BlockId, BlockManager, StorageLevel, StreamBlockId}
 import org.apache.spark.streaming.util.{FileBasedWriteAheadLogSegment, FileBasedWriteAheadLogWriter}
 import org.apache.spark.util.Utils
-import org.apache.spark.{SparkConf, SparkContext, SparkException}
+import org.apache.spark.{SparkConf, SparkContext, SparkException, SparkFunSuite}
 
 class WriteAheadLogBackedBlockRDDSuite
-  extends FunSuite with BeforeAndAfterAll with BeforeAndAfterEach {
+  extends SparkFunSuite with BeforeAndAfterAll with BeforeAndAfterEach {
 
   val conf = new SparkConf()
     .setMaster("local[2]")
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala
index 5478b41845943..2e210397fe7c7 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala
@@ -17,12 +17,12 @@
 
 package org.apache.spark.streaming.scheduler
 
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.streaming.{Time, Duration, StreamingContext}
 
-class InputInfoTrackerSuite extends FunSuite with BeforeAndAfter {
+class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter {
 
   private var ssc: StreamingContext = _
 
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/UIUtilsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/UIUtilsSuite.scala
index e9ab917ab845c..d3ca2b58f36c2 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ui/UIUtilsSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/UIUtilsSuite.scala
@@ -20,10 +20,11 @@ package org.apache.spark.streaming.ui
 import java.util.TimeZone
 import java.util.concurrent.TimeUnit
 
-import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
-class UIUtilsSuite extends FunSuite with Matchers{
+import org.apache.spark.SparkFunSuite
+
+class UIUtilsSuite extends SparkFunSuite with Matchers{
 
   test("shortTimeUnitString") {
     assert("ns" === UIUtils.shortTimeUnitString(TimeUnit.NANOSECONDS))
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/RateLimitedOutputStreamSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/RateLimitedOutputStreamSuite.scala
index 9ebf7b484f421..78fc344b00177 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/util/RateLimitedOutputStreamSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/util/RateLimitedOutputStreamSuite.scala
@@ -20,9 +20,9 @@ package org.apache.spark.streaming.util
 import java.io.ByteArrayOutputStream
 import java.util.concurrent.TimeUnit._
 
-import org.scalatest.FunSuite
+import org.apache.spark.SparkFunSuite
 
-class RateLimitedOutputStreamSuite extends FunSuite {
+class RateLimitedOutputStreamSuite extends SparkFunSuite {
 
   private def benchmark[U](f: => U): Long = {
     val start = System.nanoTime
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala
index 79098bcf4861c..0acf7068ef4a4 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala
@@ -28,12 +28,12 @@ import scala.reflect.ClassTag
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.scalatest.concurrent.Eventually._
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.BeforeAndAfter
 
 import org.apache.spark.util.{ManualClock, Utils}
-import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
 
-class WriteAheadLogSuite extends FunSuite with BeforeAndAfter {
+class WriteAheadLogSuite extends SparkFunSuite with BeforeAndAfter {
 
   import WriteAheadLogSuite._
   
diff --git a/yarn/pom.xml b/yarn/pom.xml
index 00d219f836708..e207a46809684 100644
--- a/yarn/pom.xml
+++ b/yarn/pom.xml
@@ -39,6 +39,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-yarn-api</artifactId>
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
index 80b57d1355a3a..43a7334db874c 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
@@ -19,7 +19,6 @@ package org.apache.spark.deploy.yarn
 
 import java.net.URI
 
-import org.scalatest.FunSuite
 import org.scalatest.mock.MockitoSugar
 import org.mockito.Mockito.when
 
@@ -36,8 +35,10 @@ import org.apache.hadoop.yarn.util.{Records, ConverterUtils}
 import scala.collection.mutable.HashMap
 import scala.collection.mutable.Map
 
+import org.apache.spark.SparkFunSuite
 
-class ClientDistributedCacheManagerSuite extends FunSuite with MockitoSugar {
+
+class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar {
 
   class MockClientDistributedCacheManager extends ClientDistributedCacheManager {
     override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]): 
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
index 6da3e82acdb14..01d33c9ce9297 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
@@ -33,12 +33,12 @@ import org.apache.hadoop.yarn.api.records._
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.mockito.Matchers._
 import org.mockito.Mockito._
-import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfterAll, Matchers}
 
-import org.apache.spark.{SparkException, SparkConf}
+import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
 import org.apache.spark.util.Utils
 
-class ClientSuite extends FunSuite with Matchers with BeforeAndAfterAll {
+class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll {
 
   override def beforeAll(): Unit = {
     System.setProperty("SPARK_YARN_MODE", "true")
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
index b343cbb0c7569..7509000771d94 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
@@ -26,13 +26,13 @@ import org.apache.hadoop.yarn.api.records._
 import org.apache.hadoop.yarn.client.api.AMRMClient
 import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
 
-import org.apache.spark.SecurityManager
+import org.apache.spark.{SecurityManager, SparkFunSuite}
 import org.apache.spark.SparkConf
 import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
 import org.apache.spark.deploy.yarn.YarnAllocator._
 import org.apache.spark.scheduler.SplitInfo
 
-import org.scalatest.{BeforeAndAfterEach, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfterEach, Matchers}
 
 class MockResolver extends DNSToSwitchMapping {
 
@@ -46,7 +46,7 @@ class MockResolver extends DNSToSwitchMapping {
   def reloadCachedMappings(names: JList[String]) {}
 }
 
-class YarnAllocatorSuite extends FunSuite with Matchers with BeforeAndAfterEach {
+class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
   val conf = new Configuration()
   conf.setClass(
     CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
index dcaeb2e43ff41..d8bc2534c1a6a 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
@@ -30,9 +30,9 @@ import com.google.common.io.ByteStreams
 import com.google.common.io.Files
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.hadoop.yarn.server.MiniYARNCluster
-import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfterAll, Matchers}
 
-import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException, TestUtils}
+import org.apache.spark._
 import org.apache.spark.scheduler.cluster.ExecutorInfo
 import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart,
   SparkListenerExecutorAdded}
@@ -43,7 +43,7 @@ import org.apache.spark.util.Utils
  * applications, and require the Spark assembly to be built before they can be successfully
  * run.
  */
-class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers with Logging {
+class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matchers with Logging {
 
   // log4j configuration for the YARN containers, so that their output is collected
   // by YARN instead of trying to overwrite unit-tests.log.
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
index e10b985c3c236..49bee0866dd43 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
@@ -25,15 +25,15 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.yarn.api.ApplicationConstants
 import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
 import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.Matchers
 
 import org.apache.hadoop.yarn.api.records.ApplicationAccessType
 
-import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException}
+import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException, SparkFunSuite}
 import org.apache.spark.util.Utils
 
 
-class YarnSparkHadoopUtilSuite extends FunSuite with Matchers with Logging {
+class YarnSparkHadoopUtilSuite extends SparkFunSuite with Matchers with Logging {
 
   val hasBash =
     try {

From 5f48e5c33bafa376be5741e260a037c66103fdcd Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Fri, 29 May 2015 14:11:58 -0700
Subject: [PATCH 247/525] [SPARK-6806] [SPARKR] [DOCS] Add a new SparkR
 programming guide

This PR adds a new SparkR programming guide at the top-level. This will be useful for R users as our APIs don't directly match the Scala/Python APIs and as we need to explain SparkR without using RDDs as examples etc.

cc rxin davies pwendell

cc cafreeman -- Would be great if you could also take a look at this !

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6490 from shivaram/sparkr-guide and squashes the following commits:

d5ff360 [Shivaram Venkataraman] Add a section on HiveContext, HQL queries
408dce5 [Shivaram Venkataraman] Fix link
dbb86e3 [Shivaram Venkataraman] Fix minor typo
9aff5e0 [Shivaram Venkataraman] Address comments, use dplyr-like syntax in example
d09703c [Shivaram Venkataraman] Fix default argument in read.df
ea816a1 [Shivaram Venkataraman] Add a new SparkR programming guide Also update write.df, read.df to handle defaults better
---
 R/pkg/R/DataFrame.R           |  10 +-
 R/pkg/R/SQLContext.R          |   5 +
 R/pkg/R/generics.R            |   4 +-
 docs/_layouts/global.html     |   1 +
 docs/index.md                 |   2 +-
 docs/sparkr.md                | 223 ++++++++++++++++++++++++++++++++++
 docs/sql-programming-guide.md |   4 +-
 7 files changed, 238 insertions(+), 11 deletions(-)
 create mode 100644 docs/sparkr.md

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index ed8093c80d360..e79d324838fe3 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1314,9 +1314,8 @@ setMethod("except",
 #' write.df(df, "myfile", "parquet", "overwrite")
 #' }
 setMethod("write.df",
-          signature(df = "DataFrame", path = 'character', source = 'character',
-                    mode = 'character'),
-          function(df, path = NULL, source = NULL, mode = "append", ...){
+          signature(df = "DataFrame", path = 'character'),
+          function(df, path, source = NULL, mode = "append", ...){
             if (is.null(source)) {
               sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
               source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
@@ -1338,9 +1337,8 @@ setMethod("write.df",
 #' @aliases saveDF
 #' @export
 setMethod("saveDF",
-          signature(df = "DataFrame", path = 'character', source = 'character',
-                    mode = 'character'),
-          function(df, path = NULL, source = NULL, mode = "append", ...){
+          signature(df = "DataFrame", path = 'character'),
+          function(df, path, source = NULL, mode = "append", ...){
             write.df(df, path, source, mode, ...)
           })
 
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 36cc612875879..88e1a508f37c4 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -457,6 +457,11 @@ read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
   if (!is.null(path)) {
     options[['path']] <- path
   }
+  if (is.null(source)) {
+    sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
+    source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
+                          "org.apache.spark.sql.parquet")
+  }
   sdf <- callJMethod(sqlContext, "load", source, options)
   dataFrame(sdf)
 }
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index a23d3b217b2fd..1f4fc6adaca8d 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -482,11 +482,11 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
 
 #' @rdname write.df
 #' @export
-setGeneric("write.df", function(df, path, source, mode, ...) { standardGeneric("write.df") })
+setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })
 
 #' @rdname write.df
 #' @export
-setGeneric("saveDF", function(df, path, source, mode, ...) { standardGeneric("saveDF") })
+setGeneric("saveDF", function(df, path, ...) { standardGeneric("saveDF") })
 
 #' @rdname schema
 #' @export
diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html
index b92c75f90b11c..eebb3faf90fc0 100755
--- a/docs/_layouts/global.html
+++ b/docs/_layouts/global.html
@@ -75,6 +75,7 @@
                                 <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
                                 <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
                                 <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
+                                <li><a href="sparkr.html">SparkR (R on Spark)</a></li>
                             </ul>
                         </li>
 
diff --git a/docs/index.md b/docs/index.md
index 5ef6d983c45a5..fac071da81e60 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -54,7 +54,7 @@ Example applications are also provided in Python. For example,
 
     ./bin/spark-submit examples/src/main/python/pi.py 10
 
-Spark also provides an experimental R API since 1.4 (only DataFrames APIs included).
+Spark also provides an experimental [R API](sparkr.html) since 1.4 (only DataFrames APIs included).
 To run Spark interactively in a R interpreter, use `bin/sparkR`:
 
     ./bin/sparkR --master local[2]
diff --git a/docs/sparkr.md b/docs/sparkr.md
new file mode 100644
index 0000000000000..4d82129921a37
--- /dev/null
+++ b/docs/sparkr.md
@@ -0,0 +1,223 @@
+---
+layout: global
+displayTitle: SparkR (R on Spark)
+title: SparkR (R on Spark)
+---
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+# Overview
+SparkR is an R package that provides a light-weight frontend to use Apache Spark from R.
+In Spark {{site.SPARK_VERSION}}, SparkR provides a distributed data frame implementation that
+supports operations like selection, filtering, aggregation etc. (similar to R data frames,
+[dplyr](https://github.com/hadley/dplyr)) but on large datasets.
+
+# SparkR DataFrames
+
+A DataFrame is a distributed collection of data organized into named columns. It is conceptually
+equivalent to a table in a relational database or a data frame in R, but with richer
+optimizations under the hood. DataFrames can be constructed from a wide array of sources such as:
+structured data files, tables in Hive, external databases, or existing local R data frames.
+
+All of the examples on this page use sample data included in R or the Spark distribution and can be run using the `./bin/sparkR` shell.
+
+## Starting Up: SparkContext, SQLContext
+
+<div data-lang="r"  markdown="1">
+The entry point into SparkR is the `SparkContext` which connects your R program to a Spark cluster.
+You can create a `SparkContext` using `sparkR.init` and pass in options such as the application name
+etc. Further, to work with DataFrames we will need a `SQLContext`, which can be created from the 
+SparkContext. If you are working from the SparkR shell, the `SQLContext` and `SparkContext` should
+already be created for you.
+
+{% highlight r %}
+sc <- sparkR.init()
+sqlContext <- sparkRSQL.init(sc)
+{% endhighlight %}
+
+</div>
+
+## Creating DataFrames
+With a `SQLContext`, applications can create `DataFrame`s from a local R data frame, from a [Hive table](sql-programming-guide.html#hive-tables), or from other [data sources](sql-programming-guide.html#data-sources).
+
+### From local data frames
+The simplest way to create a data frame is to convert a local R data frame into a SparkR DataFrame. Specifically we can use `createDataFrame` and pass in the local R data frame to create a SparkR DataFrame. As an example, the following creates a `DataFrame` based using the `faithful` dataset from R. 
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+df <- createDataFrame(sqlContext, faithful) 
+
+# Displays the content of the DataFrame to stdout
+head(df)
+##  eruptions waiting
+##1     3.600      79
+##2     1.800      54
+##3     3.333      74
+
+{% endhighlight %}
+</div>
+
+### From Data Sources
+
+SparkR supports operating on a variety of data sources through the `DataFrame` interface. This section describes the general methods for loading and saving data using Data Sources. You can check the Spark SQL programming guide for more [specific options](sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources.
+
+The general method for creating DataFrames from data sources is `read.df`. This method takes in the `SQLContext`, the path for the file to load and the type of data source. SparkR supports reading JSON and Parquet files natively and through [Spark Packages](http://spark-packages.org/) you can find data source connectors for popular file formats like [CSV](http://spark-packages.org/package/databricks/spark-csv) and [Avro](http://spark-packages.org/package/databricks/spark-avro).
+
+We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail.
+
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json")
+head(people)
+##  age    name
+##1  NA Michael
+##2  30    Andy
+##3  19  Justin
+
+# SparkR automatically infers the schema from the JSON file
+printSchema(people)
+# root
+#  |-- age: integer (nullable = true)
+#  |-- name: string (nullable = true)
+
+{% endhighlight %}
+</div>
+
+The data sources API can also be used to save out DataFrames into multiple file formats. For example we can save the DataFrame from the previous example
+to a Parquet file using `write.df` 
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+write.df(people, path="people.parquet", source="parquet", mode="overwrite")
+{% endhighlight %}
+</div>
+
+### From Hive tables
+
+You can also create SparkR DataFrames from Hive tables. To do this we will need to create a HiveContext which can access tables in the Hive MetaStore. Note that Spark should have been built with [Hive support](building-spark.html#building-with-hive-and-jdbc-support) and more details on the difference between SQLContext and HiveContext can be found in the [SQL programming guide](sql-programming-guide.html#starting-point-sqlcontext).
+
+<div data-lang="r" markdown="1">
+{% highlight r %}
+# sc is an existing SparkContext.
+hiveContext <- sparkRHive.init(sc)
+
+sql(hiveContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
+sql(hiveContext, "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
+
+# Queries can be expressed in HiveQL.
+results <- hiveContext.sql("FROM src SELECT key, value")
+
+# results is now a DataFrame
+head(results)
+##  key   value
+## 1 238 val_238
+## 2  86  val_86
+## 3 311 val_311
+
+{% endhighlight %}
+</div>
+
+## DataFrame Operations
+
+SparkR DataFrames support a number of functions to do structured data processing.
+Here we include some basic examples and a complete list can be found in the [API](api/R/index.html) docs:
+
+### Selecting rows, columns
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+# Create the DataFrame
+df <- createDataFrame(sqlContext, faithful) 
+
+# Get basic information about the DataFrame
+df
+## DataFrame[eruptions:double, waiting:double]
+
+# Select only the "eruptions" column
+head(select(df, df$eruptions))
+##  eruptions
+##1     3.600
+##2     1.800
+##3     3.333
+
+# You can also pass in column name as strings 
+head(select(df, "eruptions"))
+
+# Filter the DataFrame to only retain rows with wait times shorter than 50 mins
+head(filter(df, df$waiting < 50))
+##  eruptions waiting
+##1     1.750      47
+##2     1.750      47
+##3     1.867      48
+
+{% endhighlight %}
+
+</div>
+
+### Grouping, Aggregation 
+
+SparkR data frames support a number of commonly used functions to aggregate data after grouping. For example we can compute a histogram of the `waiting` time in the `faithful` dataset as shown below
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+
+# We use the `n` operator to count the number of times each waiting time appears
+head(summarize(groupBy(df, df$waiting), count = n(df$waiting)))
+##  waiting count
+##1      81    13
+##2      60     6
+##3      68     1
+
+# We can also sort the output from the aggregation to get the most common waiting times
+waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting))
+head(arrange(waiting_counts, desc(waiting_counts$count)))
+
+##   waiting count
+##1      78    15
+##2      83    14
+##3      81    13
+
+{% endhighlight %}
+</div>
+
+### Operating on Columns
+
+SparkR also provides a number of functions that can directly applied to columns for data processing and during aggregation. The example below shows the use of basic arithmetic functions. 
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+
+# Convert waiting time from hours to seconds.
+# Note that we can assign this to a new column in the same DataFrame
+df$waiting_secs <- df$waiting * 60
+head(df)
+##  eruptions waiting waiting_secs
+##1     3.600      79         4740
+##2     1.800      54         3240
+##3     3.333      74         4440
+
+{% endhighlight %}
+</div>
+
+## Running SQL Queries from SparkR
+A SparkR DataFrame can also be registered as a temporary table in Spark SQL and registering a DataFrame as a table allows you to run SQL queries over its data.
+The `sql` function enables applications to run SQL queries programmatically and returns the result as a `DataFrame`.
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+# Load a JSON file
+people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json")
+
+# Register this DataFrame as a table.
+registerTempTable(people, "people")
+
+# SQL statements can be run by using the sql method
+teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")
+head(teenagers)
+##    name
+##1 Justin
+
+{% endhighlight %}
+</div>
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index ab646f65bb5eb..7cc0a87fd5c53 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1526,8 +1526,8 @@ adds support for finding tables in the MetaStore and writing queries using HiveQ
 # sc is an existing SparkContext.
 sqlContext <- sparkRHive.init(sc)
 
-hql(sqlContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
-hql(sqlContext, "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
+sql(sqlContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
+sql(sqlContext, "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
 
 # Queries can be expressed in HiveQL.
 results = sqlContext.sql("FROM src SELECT key, value").collect()

From 1c5b19827a091b5aba69a967600e7ca35ed3bcfd Mon Sep 17 00:00:00 2001
From: Michael Nazario <mnazario@palantir.com>
Date: Fri, 29 May 2015 14:13:44 -0700
Subject: [PATCH 248/525] [SPARK-7899] [PYSPARK] Fix Python 3 pyspark/sql/types
 module conflict

This PR makes the types module in `pyspark/sql/types` work with pylint static analysis by removing the dynamic naming of the `pyspark/sql/_types` module to `pyspark/sql/types`.

Tests are now loaded using `$PYSPARK_DRIVER_PYTHON -m module` rather than `$PYSPARK_DRIVER_PYTHON module.py`. The old method adds the location of `module.py` to `sys.path`, so this change prevents accidental use of relative paths in Python.

Author: Michael Nazario <mnazario@palantir.com>

Closes #6439 from mnazario/feature/SPARK-7899 and squashes the following commits:

366ef30 [Michael Nazario] Remove hack on random.py
bb8b04d [Michael Nazario] Make doctests consistent with other tests
6ee4f75 [Michael Nazario] Change test scripts to use "-m"
673528f [Michael Nazario] Move _types back to types
---
 bin/pyspark                                 |  6 +-
 python/pyspark/accumulators.py              |  4 ++
 python/pyspark/mllib/__init__.py            |  8 ---
 python/pyspark/mllib/{rand.py => random.py} |  0
 python/pyspark/sql/__init__.py              | 12 ----
 python/pyspark/sql/{_types.py => types.py}  |  0
 python/run-tests                            | 76 ++++++++++-----------
 7 files changed, 43 insertions(+), 63 deletions(-)
 rename python/pyspark/mllib/{rand.py => random.py} (100%)
 rename python/pyspark/sql/{_types.py => types.py} (100%)

diff --git a/bin/pyspark b/bin/pyspark
index 8acad6113797d..7cb19c51b43a2 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -90,11 +90,7 @@ if [[ -n "$SPARK_TESTING" ]]; then
   unset YARN_CONF_DIR
   unset HADOOP_CONF_DIR
   export PYTHONHASHSEED=0
-  if [[ -n "$PYSPARK_DOC_TEST" ]]; then
-    exec "$PYSPARK_DRIVER_PYTHON" -m doctest $1
-  else
-    exec "$PYSPARK_DRIVER_PYTHON" $1
-  fi
+  exec "$PYSPARK_DRIVER_PYTHON" -m $1
   exit
 fi
 
diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py
index 0d21a132048a5..adca90ddaf397 100644
--- a/python/pyspark/accumulators.py
+++ b/python/pyspark/accumulators.py
@@ -261,3 +261,7 @@ def _start_update_server():
     thread.daemon = True
     thread.start()
     return server
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py
index 07507b2ad0d05..b11aed2c3afda 100644
--- a/python/pyspark/mllib/__init__.py
+++ b/python/pyspark/mllib/__init__.py
@@ -28,11 +28,3 @@
 
 __all__ = ['classification', 'clustering', 'feature', 'fpm', 'linalg', 'random',
            'recommendation', 'regression', 'stat', 'tree', 'util']
-
-import sys
-from . import rand as random
-modname = __name__ + '.random'
-random.__name__ = modname
-random.RandomRDDs.__module__ = modname
-sys.modules[modname] = random
-del modname, sys
diff --git a/python/pyspark/mllib/rand.py b/python/pyspark/mllib/random.py
similarity index 100%
rename from python/pyspark/mllib/rand.py
rename to python/pyspark/mllib/random.py
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 8fee92ae3aed5..726d288d97b2e 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -50,18 +50,6 @@ def deco(f):
         return f
     return deco
 
-# fix the module name conflict for Python 3+
-import sys
-from . import _types as types
-modname = __name__ + '.types'
-types.__name__ = modname
-# update the __module__ for all objects, make them picklable
-for v in types.__dict__.values():
-    if hasattr(v, "__module__") and v.__module__.endswith('._types'):
-        v.__module__ = modname
-sys.modules[modname] = types
-del modname, sys
-
 from pyspark.sql.types import Row
 from pyspark.sql.context import SQLContext, HiveContext
 from pyspark.sql.column import Column
diff --git a/python/pyspark/sql/_types.py b/python/pyspark/sql/types.py
similarity index 100%
rename from python/pyspark/sql/_types.py
rename to python/pyspark/sql/types.py
diff --git a/python/run-tests b/python/run-tests
index ffde2fb24b369..fcfb49556b7cf 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -57,54 +57,54 @@ function run_test() {
 
 function run_core_tests() {
     echo "Run core tests ..."
-    run_test "pyspark/rdd.py"
-    run_test "pyspark/context.py"
-    run_test "pyspark/conf.py"
-    PYSPARK_DOC_TEST=1 run_test "pyspark/broadcast.py"
-    PYSPARK_DOC_TEST=1 run_test "pyspark/accumulators.py"
-    run_test "pyspark/serializers.py"
-    run_test "pyspark/profiler.py"
-    run_test "pyspark/shuffle.py"
-    run_test "pyspark/tests.py"
+    run_test "pyspark.rdd"
+    run_test "pyspark.context"
+    run_test "pyspark.conf"
+    run_test "pyspark.broadcast"
+    run_test "pyspark.accumulators"
+    run_test "pyspark.serializers"
+    run_test "pyspark.profiler"
+    run_test "pyspark.shuffle"
+    run_test "pyspark.tests"
 }
 
 function run_sql_tests() {
     echo "Run sql tests ..."
-    run_test "pyspark/sql/_types.py"
-    run_test "pyspark/sql/context.py"
-    run_test "pyspark/sql/column.py"
-    run_test "pyspark/sql/dataframe.py"
-    run_test "pyspark/sql/group.py"
-    run_test "pyspark/sql/functions.py"
-    run_test "pyspark/sql/tests.py"
+    run_test "pyspark.sql.types"
+    run_test "pyspark.sql.context"
+    run_test "pyspark.sql.column"
+    run_test "pyspark.sql.dataframe"
+    run_test "pyspark.sql.group"
+    run_test "pyspark.sql.functions"
+    run_test "pyspark.sql.tests"
 }
 
 function run_mllib_tests() {
     echo "Run mllib tests ..."
-    run_test "pyspark/mllib/classification.py"
-    run_test "pyspark/mllib/clustering.py"
-    run_test "pyspark/mllib/evaluation.py"
-    run_test "pyspark/mllib/feature.py"
-    run_test "pyspark/mllib/fpm.py"
-    run_test "pyspark/mllib/linalg.py"
-    run_test "pyspark/mllib/rand.py"
-    run_test "pyspark/mllib/recommendation.py"
-    run_test "pyspark/mllib/regression.py"
-    run_test "pyspark/mllib/stat/_statistics.py"
-    run_test "pyspark/mllib/tree.py"
-    run_test "pyspark/mllib/util.py"
-    run_test "pyspark/mllib/tests.py"
+    run_test "pyspark.mllib.classification"
+    run_test "pyspark.mllib.clustering"
+    run_test "pyspark.mllib.evaluation"
+    run_test "pyspark.mllib.feature"
+    run_test "pyspark.mllib.fpm"
+    run_test "pyspark.mllib.linalg"
+    run_test "pyspark.mllib.random"
+    run_test "pyspark.mllib.recommendation"
+    run_test "pyspark.mllib.regression"
+    run_test "pyspark.mllib.stat._statistics"
+    run_test "pyspark.mllib.tree"
+    run_test "pyspark.mllib.util"
+    run_test "pyspark.mllib.tests"
 }
 
 function run_ml_tests() {
     echo "Run ml tests ..."
-    run_test "pyspark/ml/feature.py"
-    run_test "pyspark/ml/classification.py"
-    run_test "pyspark/ml/recommendation.py"
-    run_test "pyspark/ml/regression.py"
-    run_test "pyspark/ml/tuning.py"
-    run_test "pyspark/ml/tests.py"
-    run_test "pyspark/ml/evaluation.py"
+    run_test "pyspark.ml.feature"
+    run_test "pyspark.ml.classification"
+    run_test "pyspark.ml.recommendation"
+    run_test "pyspark.ml.regression"
+    run_test "pyspark.ml.tuning"
+    run_test "pyspark.ml.tests"
+    run_test "pyspark.ml.evaluation"
 }
 
 function run_streaming_tests() {
@@ -124,8 +124,8 @@ function run_streaming_tests() {
     done
 
     export PYSPARK_SUBMIT_ARGS="--jars ${KAFKA_ASSEMBLY_JAR} pyspark-shell"
-    run_test "pyspark/streaming/util.py"
-    run_test "pyspark/streaming/tests.py"
+    run_test "pyspark.streaming.util"
+    run_test "pyspark.streaming.tests"
 }
 
 echo "Running PySpark tests. Output is in python/$LOG_FILE."

From 82a396c2f594bade276606dcd0c0545a650fb838 Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Fri, 29 May 2015 14:59:18 -0700
Subject: [PATCH 249/525] [SPARK-7910] [TINY] [JAVAAPI] expose partitioner
 information in javardd

Author: Holden Karau <holden@pigscanfly.ca>

Closes #6464 from holdenk/SPARK-7910-expose-partitioner-information-in-javardd and squashes the following commits:

de1e644 [Holden Karau] Fix the test to get the partitioner
bdb31cc [Holden Karau] Add Mima exclude for the new method
347ef4c [Holden Karau] Add a quick little test for the partitioner JavaAPI
f49dca9 [Holden Karau] Add partitoner information to JavaRDDLike and fix some whitespace
---
 .../scala/org/apache/spark/api/java/JavaRDDLike.scala    | 9 ++++++---
 core/src/test/java/org/apache/spark/JavaAPISuite.java    | 2 ++
 project/MimaExcludes.scala                               | 2 ++
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index b8e15f38a20d2..c95615a5a9307 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -60,10 +60,13 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
 
   @deprecated("Use partitions() instead.", "1.1.0")
   def splits: JList[Partition] = new java.util.ArrayList(rdd.partitions.toSeq)
-  
+
   /** Set of partitions in this RDD. */
   def partitions: JList[Partition] = new java.util.ArrayList(rdd.partitions.toSeq)
 
+  /** The partitioner of this RDD. */
+  def partitioner: Optional[Partitioner] = JavaUtils.optionToOptional(rdd.partitioner)
+
   /** The [[org.apache.spark.SparkContext]] that this RDD was created on. */
   def context: SparkContext = rdd.context
 
@@ -492,9 +495,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
     new java.util.ArrayList(arr)
   }
 
-  def takeSample(withReplacement: Boolean, num: Int): JList[T] = 
+  def takeSample(withReplacement: Boolean, num: Int): JList[T] =
     takeSample(withReplacement, num, Utils.random.nextLong)
-    
+
   def takeSample(withReplacement: Boolean, num: Int, seed: Long): JList[T] = {
     import scala.collection.JavaConversions._
     val arr: java.util.Collection[T] = rdd.takeSample(withReplacement, num, seed).toSeq
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java
index c2089b0e56a1f..dfd86d3e51e7d 100644
--- a/core/src/test/java/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java
@@ -212,6 +212,8 @@ public int getPartition(Object key) {
 
     JavaPairRDD<Integer, Integer> repartitioned =
         rdd.repartitionAndSortWithinPartitions(partitioner);
+    Assert.assertTrue(repartitioned.partitioner().isPresent());
+    Assert.assertEquals(repartitioned.partitioner().get(), partitioner);
     List<List<Tuple2<Integer, Integer>>> partitions = repartitioned.glom().collect();
     Assert.assertEquals(partitions.get(0), Arrays.asList(new Tuple2<Integer, Integer>(0, 5),
         new Tuple2<Integer, Integer>(0, 8), new Tuple2<Integer, Integer>(2, 6)));
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 11b439e7875fc..8da72b3fa7cdb 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -38,6 +38,8 @@ object MimaExcludes {
           Seq(
             MimaBuild.excludeSparkPackage("deploy"),
             MimaBuild.excludeSparkPackage("ml"),
+            // SPARK-7910 Adding a method to get the partioner to JavaRDD,
+            ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.api.java.JavaRDDLike.partitioner"),
             // SPARK-5922 Adding a generalized diff(other: RDD[(VertexId, VD)]) to VertexRDD
             ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.VertexRDD.diff"),
             // These are needed if checking against the sbt build, since they are part of

From 5fb97dca9bcfc29ac33823554c8783997e811b99 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Fri, 29 May 2015 15:08:30 -0700
Subject: [PATCH 250/525] [SPARK-7954] [SPARKR] Create SparkContext in
 sparkRSQL init

cc davies

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6507 from shivaram/sparkr-init and squashes the following commits:

6fdd169 [Shivaram Venkataraman] Create SparkContext in sparkRSQL init
---
 R/pkg/R/sparkR.R | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 68387f0f5365d..5ced7c688f98a 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -225,14 +225,21 @@ sparkR.init <- function(
 #' sqlContext <- sparkRSQL.init(sc)
 #'}
 
-sparkRSQL.init <- function(jsc) {
+sparkRSQL.init <- function(jsc = NULL) {
   if (exists(".sparkRSQLsc", envir = .sparkREnv)) {
     return(get(".sparkRSQLsc", envir = .sparkREnv))
   }
 
+  # If jsc is NULL, create a Spark Context
+  sc <- if (is.null(jsc)) {
+    sparkR.init()
+  } else {
+    jsc
+  }
+
   sqlContext <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
-                        "createSQLContext",
-                        jsc)
+                            "createSQLContext",
+                            sc)
   assign(".sparkRSQLsc", sqlContext, envir = .sparkREnv)
   sqlContext
 }
@@ -249,12 +256,19 @@ sparkRSQL.init <- function(jsc) {
 #' sqlContext <- sparkRHive.init(sc)
 #'}
 
-sparkRHive.init <- function(jsc) {
+sparkRHive.init <- function(jsc = NULL) {
   if (exists(".sparkRHivesc", envir = .sparkREnv)) {
     return(get(".sparkRHivesc", envir = .sparkREnv))
   }
 
-  ssc <- callJMethod(jsc, "sc")
+  # If jsc is NULL, create a Spark Context
+  sc <- if (is.null(jsc)) {
+    sparkR.init()
+  } else {
+    jsc
+  }
+
+  ssc <- callJMethod(sc, "sc")
   hiveCtx <- tryCatch({
     newJObject("org.apache.spark.sql.hive.HiveContext", ssc)
   }, error = function(err) {

From dbf8ff38de0f95f467b874a5b527dcf59439efe8 Mon Sep 17 00:00:00 2001
From: Ram Sriharsha <rsriharsha@hw11853.local>
Date: Fri, 29 May 2015 15:22:26 -0700
Subject: [PATCH 251/525] [SPARK-6013] [ML] Add more Python ML examples for
 spark.ml

Author: Ram Sriharsha <rsriharsha@hw11853.local>

Closes #6443 from harsha2010/SPARK-6013 and squashes the following commits:

732506e [Ram Sriharsha] Code Review Feedback
121c211 [Ram Sriharsha] python style fix
5f9b8c3 [Ram Sriharsha] python style fixes
925ca86 [Ram Sriharsha] Simple Params Example
8b372b1 [Ram Sriharsha] GBT Example
965ec14 [Ram Sriharsha] Random Forest Example
---
 .../examples/ml/JavaSimpleParamsExample.java  |  2 +-
 .../main/python/ml/gradient_boosted_trees.py  | 83 ++++++++++++++++
 .../main/python/ml/random_forest_example.py   | 87 ++++++++++++++++
 .../main/python/ml/simple_params_example.py   | 98 +++++++++++++++++++
 .../examples/ml/SimpleParamsExample.scala     |  2 +-
 5 files changed, 270 insertions(+), 2 deletions(-)
 create mode 100644 examples/src/main/python/ml/gradient_boosted_trees.py
 create mode 100644 examples/src/main/python/ml/random_forest_example.py
 create mode 100644 examples/src/main/python/ml/simple_params_example.py

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
index 29158d5c85651..dac649d1d5ae6 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -97,7 +97,7 @@ public static void main(String[] args) {
     DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class);
 
     // Make predictions on test documents using the Transformer.transform() method.
-    // LogisticRegression.transform will only use the 'features' column.
+    // LogisticRegressionModel.transform will only use the 'features' column.
     // Note that model2.transform() outputs a 'myProbability' column instead of the usual
     // 'probability' column since we renamed the lr.probabilityCol parameter previously.
     DataFrame results = model2.transform(test);
diff --git a/examples/src/main/python/ml/gradient_boosted_trees.py b/examples/src/main/python/ml/gradient_boosted_trees.py
new file mode 100644
index 0000000000000..6446f0fe5eeab
--- /dev/null
+++ b/examples/src/main/python/ml/gradient_boosted_trees.py
@@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.ml.classification import GBTClassifier
+from pyspark.ml.feature import StringIndexer
+from pyspark.ml.regression import GBTRegressor
+from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics
+from pyspark.mllib.util import MLUtils
+from pyspark.sql import Row, SQLContext
+
+"""
+A simple example demonstrating a Gradient Boosted Trees Classification/Regression Pipeline.
+Note: GBTClassifier only supports binary classification currently
+Run with:
+  bin/spark-submit examples/src/main/python/ml/gradient_boosted_trees.py
+"""
+
+
+def testClassification(train, test):
+    # Train a GradientBoostedTrees model.
+
+    rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel")
+
+    model = rf.fit(train)
+    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
+        .map(lambda x: (x.prediction, x.indexedLabel))
+
+    metrics = BinaryClassificationMetrics(predictionAndLabels)
+    print("AUC %.3f" % metrics.areaUnderROC)
+
+
+def testRegression(train, test):
+    # Train a GradientBoostedTrees model.
+
+    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
+
+    model = rf.fit(train)
+    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
+        .map(lambda x: (x.prediction, x.indexedLabel))
+
+    metrics = RegressionMetrics(predictionAndLabels)
+    print("rmse %.3f" % metrics.rootMeanSquaredError)
+    print("r2 %.3f" % metrics.r2)
+    print("mae %.3f" % metrics.meanAbsoluteError)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        print("Usage: gradient_boosted_trees", file=sys.stderr)
+        exit(1)
+    sc = SparkContext(appName="PythonGBTExample")
+    sqlContext = SQLContext(sc)
+
+    # Load and parse the data file into a dataframe.
+    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+
+    # Map labels into an indexed column of labels in [0, numLabels)
+    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
+    si_model = stringIndexer.fit(df)
+    td = si_model.transform(df)
+    [train, test] = td.randomSplit([0.7, 0.3])
+    testClassification(train, test)
+    testRegression(train, test)
+    sc.stop()
diff --git a/examples/src/main/python/ml/random_forest_example.py b/examples/src/main/python/ml/random_forest_example.py
new file mode 100644
index 0000000000000..c7730e1bfacd9
--- /dev/null
+++ b/examples/src/main/python/ml/random_forest_example.py
@@ -0,0 +1,87 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.ml.classification import RandomForestClassifier
+from pyspark.ml.feature import StringIndexer
+from pyspark.ml.regression import RandomForestRegressor
+from pyspark.mllib.evaluation import MulticlassMetrics, RegressionMetrics
+from pyspark.mllib.util import MLUtils
+from pyspark.sql import Row, SQLContext
+
+"""
+A simple example demonstrating a RandomForest Classification/Regression Pipeline.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/random_forest_example.py
+"""
+
+
+def testClassification(train, test):
+    # Train a RandomForest model.
+    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
+    # Note: Use larger numTrees in practice.
+
+    rf = RandomForestClassifier(labelCol="indexedLabel", numTrees=3, maxDepth=4)
+
+    model = rf.fit(train)
+    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
+        .map(lambda x: (x.prediction, x.indexedLabel))
+
+    metrics = MulticlassMetrics(predictionAndLabels)
+    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
+    print("precision %s" % metrics.precision())
+    print("recall %s" % metrics.recall())
+
+
+def testRegression(train, test):
+    # Train a RandomForest model.
+    # Note: Use larger numTrees in practice.
+
+    rf = RandomForestRegressor(labelCol="indexedLabel", numTrees=3, maxDepth=4)
+
+    model = rf.fit(train)
+    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
+        .map(lambda x: (x.prediction, x.indexedLabel))
+
+    metrics = RegressionMetrics(predictionAndLabels)
+    print("rmse %.3f" % metrics.rootMeanSquaredError)
+    print("r2 %.3f" % metrics.r2)
+    print("mae %.3f" % metrics.meanAbsoluteError)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        print("Usage: random_forest_example", file=sys.stderr)
+        exit(1)
+    sc = SparkContext(appName="PythonRandomForestExample")
+    sqlContext = SQLContext(sc)
+
+    # Load and parse the data file into a dataframe.
+    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+
+    # Map labels into an indexed column of labels in [0, numLabels)
+    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
+    si_model = stringIndexer.fit(df)
+    td = si_model.transform(df)
+    [train, test] = td.randomSplit([0.7, 0.3])
+    testClassification(train, test)
+    testRegression(train, test)
+    sc.stop()
diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py
new file mode 100644
index 0000000000000..3933d59b52cd1
--- /dev/null
+++ b/examples/src/main/python/ml/simple_params_example.py
@@ -0,0 +1,98 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+import pprint
+import sys
+
+from pyspark import SparkContext
+from pyspark.ml.classification import LogisticRegression
+from pyspark.mllib.linalg import DenseVector
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.sql import SQLContext
+
+"""
+A simple example demonstrating ways to specify parameters for Estimators and Transformers.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/simple_params_example.py
+"""
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        print("Usage: simple_params_example", file=sys.stderr)
+        exit(1)
+    sc = SparkContext(appName="PythonSimpleParamsExample")
+    sqlContext = SQLContext(sc)
+
+    # prepare training data.
+    # We create an RDD of LabeledPoints and convert them into a DataFrame.
+    # Spark DataFrames can automatically infer the schema from named tuples
+    # and LabeledPoint implements __reduce__ to behave like a named tuple.
+    training = sc.parallelize([
+        LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
+        LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
+        LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])),
+        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))]).toDF()
+
+    # Create a LogisticRegression instance with maxIter = 10.
+    # This instance is an Estimator.
+    lr = LogisticRegression(maxIter=10)
+    # Print out the parameters, documentation, and any default values.
+    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
+
+    # We may also set parameters using setter methods.
+    lr.setRegParam(0.01)
+
+    # Learn a LogisticRegression model.  This uses the parameters stored in lr.
+    model1 = lr.fit(training)
+
+    # Since model1 is a Model (i.e., a Transformer produced by an Estimator),
+    # we can view the parameters it used during fit().
+    # This prints the parameter (name: value) pairs, where names are unique IDs for this
+    # LogisticRegression instance.
+    print("Model 1 was fit using parameters:\n")
+    pprint.pprint(model1.extractParamMap())
+
+    # We may alternatively specify parameters using a parameter map.
+    # paramMap overrides all lr parameters set earlier.
+    paramMap = {lr.maxIter: 20, lr.threshold: 0.55, lr.probabilityCol: "myProbability"}
+
+    # Now learn a new model using the new parameters.
+    model2 = lr.fit(training, paramMap)
+    print("Model 2 was fit using parameters:\n")
+    pprint.pprint(model2.extractParamMap())
+
+    # prepare test data.
+    test = sc.parallelize([
+        LabeledPoint(1.0, DenseVector([-1.0, 1.5, 1.3])),
+        LabeledPoint(0.0, DenseVector([3.0, 2.0, -0.1])),
+        LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))]).toDF()
+
+    # Make predictions on test data using the Transformer.transform() method.
+    # LogisticRegressionModel.transform will only use the 'features' column.
+    # Note that model2.transform() outputs a 'myProbability' column instead of the usual
+    # 'probability' column since we renamed the lr.probabilityCol parameter previously.
+    result = model2.transform(test) \
+        .select("features", "label", "myProbability", "prediction") \
+        .collect()
+
+    for row in result:
+        print("features=%s,label=%s -> prob=%s, prediction=%s"
+              % (row.features, row.label, row.myProbability, row.prediction))
+
+    sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index e8a991f50e338..a0561e2573fc9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -87,7 +87,7 @@ object SimpleParamsExample {
       LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))))
 
     // Make predictions on test data using the Transformer.transform() method.
-    // LogisticRegression.transform will only use the 'features' column.
+    // LogisticRegressionModel.transform will only use the 'features' column.
     // Note that model2.transform() outputs a 'myProbability' column instead of the usual
     // 'probability' column since we renamed the lr.probabilityCol parameter previously.
     model2.transform(test.toDF())

From 8c9979337f193c72fd2f1a891909283de53777e3 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 29 May 2015 15:26:49 -0700
Subject: [PATCH 252/525] [HOTFIX] [SQL] Maven test compilation issue

Tests compile in SBT but not Maven.
---
 sql/core/pom.xml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index ffe95bb49188f..8210c552603ea 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -41,6 +41,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-catalyst_${scala.binary.version}</artifactId>

From a4f24123d8857656524c9138c7c067a4b1033a5e Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 29 May 2015 17:19:46 -0700
Subject: [PATCH 253/525] [HOT FIX] [BUILD] Fix maven build failures

This patch fixes a build break in maven caused by #6441.

Note that this patch reverts the changes in flume-sink because
this module does not currently depend on Spark core, but the
tests require it. There is not an easy way to make this work
because mvn test dependencies are not transitive (MNG-1378).

For now, we will leave the one test suite in flume-sink out
until we figure out a better solution. This patch is mainly
intended to unbreak the maven build.

Author: Andrew Or <andrew@databricks.com>

Closes #6511 from andrewor14/fix-build-mvn and squashes the following commits:

3d53643 [Andrew Or] [HOT FIX #6441] Fix maven build failures
---
 external/flume-sink/pom.xml                                | 7 -------
 .../apache/spark/streaming/flume/sink/SparkSinkSuite.scala | 5 ++---
 mllib/pom.xml                                              | 7 +++++++
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index bb2ec96715942..1f3e619d97a24 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -35,13 +35,6 @@
   <url>http://spark.apache.org/</url>
 
   <dependencies>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-core_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-lang3</artifactId>
diff --git a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
index e9fbcb9db6b78..650b2fbe1c142 100644
--- a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
+++ b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
@@ -31,10 +31,9 @@ import org.apache.flume.Context
 import org.apache.flume.channel.MemoryChannel
 import org.apache.flume.event.EventBuilder
 import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
+import org.scalatest.FunSuite
 
-import org.apache.spark.SparkFunSuite
-
-class SparkSinkSuite extends SparkFunSuite {
+class SparkSinkSuite extends FunSuite {
   val eventsPerBatch = 1000
   val channelCapacity = 5000
 
diff --git a/mllib/pom.xml b/mllib/pom.xml
index 0c07ca1a62fd3..65c647a91d192 100644
--- a/mllib/pom.xml
+++ b/mllib/pom.xml
@@ -40,6 +40,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-streaming_${scala.binary.version}</artifactId>

From 3792d25836e1e521da64c5a62ca1b6cca1bcb6b9 Mon Sep 17 00:00:00 2001
From: Taka Shinagawa <taka.epsilon@gmail.com>
Date: Fri, 29 May 2015 20:35:14 -0700
Subject: [PATCH 254/525] [DOCS][Tiny] Added a missing dash(-) in
 docs/configuration.md

The first line had only two dashes (--) instead of three(---). Because of this missing dash(-), 'jekyll build' command was not converting configuration.md to _site/configuration.html

Author: Taka Shinagawa <taka.epsilon@gmail.com>

Closes #6513 from mrt/docfix3 and squashes the following commits:

c470e2c [Taka Shinagawa] Added a missing dash(-) preventing jekyll from converting configuration.md to html format
---
 docs/configuration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 30508a617fdd8..3a48da4592dd9 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1,4 +1,4 @@
---
+---
 layout: global
 displayTitle: Spark Configuration
 title: Configuration

From 7ed06c39922ac90acab3a78ce0f2f21184ed68a5 Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Fri, 29 May 2015 22:19:15 -0700
Subject: [PATCH 255/525] [SPARK-7957] Preserve partitioning when using
 randomSplit

cc JoshRosen
Thanks for noticing this!

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #6509 from brkyvz/sample-perf-reg and squashes the following commits:

497465d [Burak Yavuz] addressed code review
293f95f [Burak Yavuz] [SPARK-7957] Preserve partitioning when using randomSplit
---
 core/src/main/scala/org/apache/spark/rdd/RDD.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 5fcef255e13af..10610f4b6f1ff 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -434,11 +434,11 @@ abstract class RDD[T: ClassTag](
    * @return A random sub-sample of the RDD without replacement.
    */
   private[spark] def randomSampleWithRange(lb: Double, ub: Double, seed: Long): RDD[T] = {
-    this.mapPartitionsWithIndex { case (index, partition) =>
+    this.mapPartitionsWithIndex( { (index, partition) =>
       val sampler = new BernoulliCellSampler[T](lb, ub)
       sampler.setSeed(seed + index)
       sampler.sample(partition)
-    }
+    }, preservesPartitioning = true)
   }
 
   /**

From 609c4923f98c188bce60ae35c1c8a08a8dfd95f1 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 29 May 2015 22:57:46 -0700
Subject: [PATCH 256/525] [SPARK-7558] Guard against direct uses of FunSuite /
 FunSuiteLike

This is a follow-up patch to #6441.

Author: Andrew Or <andrew@databricks.com>

Closes #6510 from andrewor14/extends-funsuite-check and squashes the following commits:

6618b46 [Andrew Or] Exempt SparkSinkSuite from the FunSuite check
99d02ac [Andrew Or] Merge branch 'master' of github.com:apache/spark into extends-funsuite-check
48874dd [Andrew Or] Guard against direct uses of FunSuite / FunSuiteLike
---
 core/src/test/scala/org/apache/spark/SparkFunSuite.scala | 2 ++
 .../spark/streaming/flume/sink/SparkSinkSuite.scala      | 9 +++++++++
 scalastyle-config.xml                                    | 7 +++++++
 3 files changed, 18 insertions(+)

diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
index 0327dfad6ea51..8cb344332668f 100644
--- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
@@ -17,12 +17,14 @@
 
 package org.apache.spark
 
+// scalastyle:off
 import org.scalatest.{FunSuite, Outcome}
 
 /**
  * Base abstract class for all unit tests in Spark for handling common functionality.
  */
 private[spark] abstract class SparkFunSuite extends FunSuite with Logging {
+// scalastyle:on
 
   /**
    * Log the suite name and the test name before and after each test.
diff --git a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
index 650b2fbe1c142..605b3fe71017f 100644
--- a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
+++ b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
@@ -31,9 +31,18 @@ import org.apache.flume.Context
 import org.apache.flume.channel.MemoryChannel
 import org.apache.flume.event.EventBuilder
 import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
+
+// Due to MNG-1378, there is not a way to include test dependencies transitively.
+// We cannot include Spark core tests as a dependency here because it depends on
+// Spark core main, which has too many dependencies to require here manually.
+// For this reason, we continue to use FunSuite and ignore the scalastyle checks
+// that fail if this is detected.
+//scalastyle:off
 import org.scalatest.FunSuite
 
 class SparkSinkSuite extends FunSuite {
+//scalastyle:on
+
   val eventsPerBatch = 1000
   val channelCapacity = 5000
 
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 68c8ce3b7e10b..890bf3794925b 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -153,4 +153,11 @@
     </parameters>
   </check>
   <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>
+  <!-- As of SPARK-7558, all tests in Spark should extend o.a.s.SparkFunSuite instead of FunSuited directly -->
+  <check level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
+   <parameters>
+    <parameter name="regex">^FunSuite[A-Za-z]*$</parameter>
+   </parameters>
+   <customMessage>Tests must extend org.apache.spark.SparkFunSuite instead.</customMessage>
+  </check>
 </scalastyle>

From 193dba01c77ef1bb63e3f617213eb257960f8d2f Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 29 May 2015 23:08:47 -0700
Subject: [PATCH 257/525] [TRIVIAL] Typo fix for last commit

---
 scalastyle-config.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 890bf3794925b..a0098169a0248 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -153,7 +153,7 @@
     </parameters>
   </check>
   <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>
-  <!-- As of SPARK-7558, all tests in Spark should extend o.a.s.SparkFunSuite instead of FunSuited directly -->
+  <!-- As of SPARK-7558, all tests in Spark should extend o.a.s.SparkFunSuite instead of FunSuite directly -->
   <check level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
    <parameters>
     <parameter name="regex">^FunSuite[A-Za-z]*$</parameter>

From da2112aef28e63c452f592e0abd007141787877d Mon Sep 17 00:00:00 2001
From: Octavian Geagla <ogeagla@gmail.com>
Date: Fri, 29 May 2015 23:55:19 -0700
Subject: [PATCH 258/525] [SPARK-7576] [MLLIB] Add spark.ml user guide
 doc/example for ElementwiseProduct

Author: Octavian Geagla <ogeagla@gmail.com>

Closes #6501 from ogeagla/ml-guide-elemwiseprod and squashes the following commits:

4ad93d5 [Octavian Geagla] [SPARK-7576] [MLLIB] Incorporate code review feedback.
f7be7ad [Octavian Geagla] [SPARK-7576] [MLLIB] Add spark.ml user guide doc/example for ElementwiseProduct.
---
 docs/ml-features.md | 88 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index d7851a55fabfe..81f1b8823a8ce 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -876,5 +876,93 @@ bucketedData = bucketizer.transform(dataFrame)
 </div>
 </div>
 
+## ElementwiseProduct
+
+ElementwiseProduct multiplies each input vector by a provided "weight" vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier.  This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v` and transforming vector, `w`, to yield a result vector.
+
+`\[ \begin{pmatrix}
+v_1 \\
+\vdots \\
+v_N
+\end{pmatrix} \circ \begin{pmatrix}
+                    w_1 \\
+                    \vdots \\
+                    w_N
+                    \end{pmatrix}
+= \begin{pmatrix}
+  v_1 w_1 \\
+  \vdots \\
+  v_N w_N
+  \end{pmatrix}
+\]`
+
+[`ElementwiseProduct`](api/scala/index.html#org.apache.spark.ml.feature.ElementwiseProduct) takes the following parameter:
+
+* `scalingVec`: the transforming vector.
+
+This example below demonstrates how to transform vectors using a transforming vector value.
+
+<div class="codetabs">
+<div data-lang="scala">
+{% highlight scala %}
+import org.apache.spark.ml.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+
+// Create some vector data; also works for sparse vectors
+val dataFrame = sqlContext.createDataFrame(Seq(
+  ("a", Vectors.dense(1.0, 2.0, 3.0)),
+  ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")
+
+val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
+val transformer = new ElementwiseProduct()
+  .setScalingVec(transformingVector)
+  .setInputCol("vector")
+  .setOutputCol("transformedVector")
+
+// Batch transform the vectors to create new column:
+val transformedData = transformer.transform(dataFrame)
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java">
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.ElementwiseProduct;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+// Create some vector data; also works for sparse vectors
+JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
+  RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
+));
+List<StructField> fields = new ArrayList<StructField>(2);
+fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
+fields.add(DataTypes.createStructField("vector", DataTypes.StringType, false));
+StructType schema = DataTypes.createStructType(fields);
+DataFrame dataFrame = sqlContext.createDataFrame(jrdd, schema);
+Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
+ElementwiseProduct transformer = new ElementwiseProduct()
+  .setScalingVec(transformingVector)
+  .setInputCol("vector")
+  .setOutputCol("transformedVector");
+// Batch transform the vectors to create new column:
+DataFrame transformedData = transformer.transform(dataFrame);
+
+{% endhighlight %}
+</div>
+</div>
+
 # Feature Selectors
 

From 78657d53d71b9d3e86b675cc519868f99e2ffa01 Mon Sep 17 00:00:00 2001
From: Timothy Chen <tnachen@gmail.com>
Date: Fri, 29 May 2015 23:56:18 -0700
Subject: [PATCH 259/525] [SPARK-7962] [MESOS] Fix master url parsing in rest
 submission client.

Only parse standalone master url when master url starts with spark://

Author: Timothy Chen <tnachen@gmail.com>

Closes #6517 from tnachen/fix_mesos_client and squashes the following commits:

61a1198 [Timothy Chen] Fix master url parsing in rest submission client.
---
 .../org/apache/spark/deploy/rest/RestSubmissionClient.scala | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala
index 6078f50518ba4..1fe956320a1b8 100644
--- a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala
@@ -57,7 +57,11 @@ private[spark] class RestSubmissionClient(master: String) extends Logging {
 
   private val supportedMasterPrefixes = Seq("spark://", "mesos://")
 
-  private val masters: Array[String] = Utils.parseStandaloneMasterUrls(master)
+  private val masters: Array[String] = if (master.startsWith("spark://")) {
+    Utils.parseStandaloneMasterUrls(master)
+  } else {
+    Array(master)
+  }
 
   // Set of masters that lost contact with us, used to keep track of
   // whether there are masters still alive for us to communicate with

From e3a43748338b02ef6864ca62de40e218e5677506 Mon Sep 17 00:00:00 2001
From: Octavian Geagla <ogeagla@gmail.com>
Date: Sat, 30 May 2015 00:00:36 -0700
Subject: [PATCH 260/525] [SPARK-7459] [MLLIB] ElementwiseProduct Java example

Author: Octavian Geagla <ogeagla@gmail.com>

Closes #6008 from ogeagla/elementwise-prod-doc and squashes the following commits:

72e6dc0 [Octavian Geagla] [SPARK-7459] [MLLIB] Java example import.
cf2afbd [Octavian Geagla] [SPARK-7459] [MLLIB] Update description of example.
b66431b [Octavian Geagla] [SPARK-7459] [MLLIB] Add override annotation to java example, make scala example use same data as java.
6b26b03 [Octavian Geagla] [SPARK-7459] [MLLIB] Fix line which is too long.
79af020 [Octavian Geagla] [SPARK-7459] [MLLIB] Actually don't use Java 8.
9d5b31a [Octavian Geagla] [SPARK-7459] [MLLIB] Don't use Java 8
4f0c92f [Octavian Geagla] [SPARK-7459] [MLLIB] ElementwiseProduct Java example.
---
 docs/mllib-feature-extraction.md | 40 +++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index f723cd6b9dfab..764985d436ead 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -505,7 +505,7 @@ v_N
 
 ### Example
 
-This example below demonstrates how to load a simple vectors file, extract a set of vectors, then transform those vectors using a transforming vector value.
+This example below demonstrates how to transform vectors using a transforming vector value.
 
 <div class="codetabs">
 <div data-lang="scala">
@@ -514,16 +514,44 @@ import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.feature.ElementwiseProduct
 import org.apache.spark.mllib.linalg.Vectors
 
-// Load and parse the data:
-val data = sc.textFile("data/mllib/kmeans_data.txt")
-val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
+// Create some vector data; also works for sparse vectors
+val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)))
 
 val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
 val transformer = new ElementwiseProduct(transformingVector)
 
 // Batch transform and per-row transform give the same results:
-val transformedData = transformer.transform(parsedData)
-val transformedData2 = parsedData.map(x => transformer.transform(x))
+val transformedData = transformer.transform(data)
+val transformedData2 = data.map(x => transformer.transform(x))
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java">
+{% highlight java %}
+import java.util.Arrays;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.feature.ElementwiseProduct;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+
+// Create some vector data; also works for sparse vectors
+JavaRDD<Vector> data = sc.parallelize(Arrays.asList(
+  Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)));
+Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
+ElementwiseProduct transformer = new ElementwiseProduct(transformingVector);
+
+// Batch transform and per-row transform give the same results:
+JavaRDD<Vector> transformedData = transformer.transform(data);
+JavaRDD<Vector> transformedData2 = data.map(
+  new Function<Vector, Vector>() {
+    @Override
+    public Vector call(Vector v) {
+      return transformer.transform(v);
+    }
+  }
+);
 
 {% endhighlight %}
 </div>

From 0978aec9cd47dc0618e47b74a99e1cc2266be424 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sat, 30 May 2015 00:26:46 -0700
Subject: [PATCH 261/525] [SPARK-7964][SQL] remove unnecessary type coercion
 rule

We have defined these logics in `Cast` already, I think we should remove this rule.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6516 from cloud-fan/tmp2 and squashes the following commits:

d5035a4 [Wenchen Fan] remove useless rule
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 27 -------------------
 .../analysis/HiveTypeCoercionSuite.scala      | 16 -----------
 .../ExpressionEvaluationSuite.scala           |  2 ++
 3 files changed, 2 insertions(+), 43 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 44664f898f762..195418d6dfb1f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -77,7 +77,6 @@ trait HiveTypeCoercion {
     PromoteStrings ::
     DecimalPrecision ::
     BooleanComparisons ::
-    BooleanCasts ::
     StringToIntegralCasts ::
     FunctionArgumentConversion ::
     CaseWhenCoercion ::
@@ -510,32 +509,6 @@ trait HiveTypeCoercion {
     }
   }
 
-  /**
-   * Casts to/from [[BooleanType]] are transformed into comparisons since
-   * the JVM does not consider Booleans to be numeric types.
-   */
-  object BooleanCasts extends Rule[LogicalPlan] {
-    def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
-      // Skip nodes who's children have not been resolved yet.
-      case e if !e.childrenResolved => e
-      // Skip if the type is boolean type already. Note that this extra cast should be removed
-      // by optimizer.SimplifyCasts.
-      case Cast(e, BooleanType) if e.dataType == BooleanType => e
-      // DateType should be null if be cast to boolean.
-      case Cast(e, BooleanType) if e.dataType == DateType => Cast(e, BooleanType)
-      // If the data type is not boolean and is being cast boolean, turn it into a comparison
-      // with the numeric value, i.e. x != 0. This will coerce the type into numeric type.
-      case Cast(e, BooleanType) if e.dataType != BooleanType => Not(EqualTo(e, Literal(0)))
-      // Stringify boolean if casting to StringType.
-      // TODO Ensure true/false string letter casing is consistent with Hive in all cases.
-      case Cast(e, StringType) if e.dataType == BooleanType =>
-        If(e, Literal("true"), Literal("false"))
-      // Turn true into 1, and false into 0 if casting boolean into other types.
-      case Cast(e, dataType) if e.dataType == BooleanType =>
-        Cast(If(e, Literal(1), Literal(0)), dataType)
-    }
-  }
-
   /**
    * When encountering a cast from a string representing a valid fractional number to an integral
    * type the jvm will throw a `java.lang.NumberFormatException`.  Hive, in contrast, returns the
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index fcd745f43cfbf..f0101f4a88f86 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -104,22 +104,6 @@ class HiveTypeCoercionSuite extends PlanTest {
     widenTest(ArrayType(IntegerType), StructType(Seq()), None)
   }
 
-  test("boolean casts") {
-    val booleanCasts = new HiveTypeCoercion { }.BooleanCasts
-    def ruleTest(initial: Expression, transformed: Expression) {
-      val testRelation = LocalRelation(AttributeReference("a", IntegerType)())
-      comparePlans(
-        booleanCasts(Project(Seq(Alias(initial, "a")()), testRelation)),
-        Project(Seq(Alias(transformed, "a")()), testRelation))
-    }
-    // Remove superflous boolean -> boolean casts.
-    ruleTest(Cast(Literal(true), BooleanType), Literal(true))
-    // Stringify boolean when casting to string.
-    ruleTest(
-      Cast(Literal(false), StringType),
-      If(Literal(false), Literal("true"), Literal("false")))
-  }
-
   test("coalesce casts") {
     val fac = new HiveTypeCoercion { }.FunctionArgumentConversion
     def ruleTest(initial: Expression, transformed: Expression) {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index b511aa3a24420..10181366c2fcd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -372,6 +372,8 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
       DecimalType.Unlimited, ByteType), TimestampType), LongType), StringType), ShortType), 0)
     checkEvaluation(Literal(true) cast IntegerType, 1)
     checkEvaluation(Literal(false) cast IntegerType, 0)
+    checkEvaluation(Literal(true) cast StringType, "true")
+    checkEvaluation(Literal(false) cast StringType, "false")
     checkEvaluation(Cast(Literal(1) cast BooleanType, IntegerType), 1)
     checkEvaluation(Cast(Literal(0) cast BooleanType, IntegerType), 0)
     checkEvaluation("23" cast DoubleType, 23d)

From 8c8de3ed863985554e84fd07d1cdcaeca7e3375c Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Sat, 30 May 2015 07:59:27 -0400
Subject: [PATCH 262/525] [SPARK-7890] [DOCS] Document that Spark 2.11 now
 supports Kafka

Remove caveat about Kafka / JDBC not being supported for Scala 2.11

Author: Sean Owen <sowen@cloudera.com>

Closes #6470 from srowen/SPARK-7890 and squashes the following commits:

4652634 [Sean Owen] One more rewording
7b7f3c8 [Sean Owen] Restore note about JDBC component
126744d [Sean Owen] Remove caveat about Kafka / JDBC not being supported for Scala 2.11
---
 docs/building-spark.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/building-spark.md b/docs/building-spark.md
index b2649d1ee2a53..b4cea158dac93 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -130,9 +130,7 @@ To produce a Spark package compiled with Scala 2.11, use the `-Dscala-2.11` prop
     dev/change-version-to-2.11.sh
     mvn -Pyarn -Phadoop-2.4 -Dscala-2.11 -DskipTests clean package
 
-Scala 2.11 support in Spark does not support a few features due to dependencies
-which are themselves not Scala 2.11 ready. Specifically, Spark's external 
-Kafka library and JDBC component are not yet supported in Scala 2.11 builds.
+Spark does not yet support its JDBC component for Scala 2.11.
 
 # Spark Tests in Maven
 

From 9d8aadb72bbc86595e253fe30201cda6a8db877e Mon Sep 17 00:00:00 2001
From: WangTaoTheTonic <wangtao111@huawei.com>
Date: Sat, 30 May 2015 08:04:27 -0400
Subject: [PATCH 263/525] [SPARK-7945] [CORE] Do trim to values in properties
 file

https://issues.apache.org/jira/browse/SPARK-7945

Now applications submited by org.apache.spark.launcher.Main read properties file without doing trim to values in it.
If user left a space after a value(say spark.driver.extraClassPath) then it probably affect global functions(like some jar could not be included in the classpath), so we should do it like Utils.getPropertiesFromFile.

Author: WangTaoTheTonic <wangtao111@huawei.com>
Author: Tao Wang <wangtao111@huawei.com>

Closes #6496 from WangTaoTheTonic/SPARK-7945 and squashes the following commits:

bb41b4b [Tao Wang] indent 4 to 2
6dd1cf2 [WangTaoTheTonic] use a simpler way
2c053a1 [WangTaoTheTonic] Do trim to values in properties file
---
 .../java/org/apache/spark/launcher/AbstractCommandBuilder.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java
index 33fd813f7a86c..33d65d13f0d25 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java
@@ -296,6 +296,9 @@ Properties loadPropertiesFile() throws IOException {
       try {
         fd = new FileInputStream(propsFile);
         props.load(new InputStreamReader(fd, "UTF-8"));
+        for (Map.Entry<Object, Object> e : props.entrySet()) {
+          e.setValue(e.getValue().toString().trim());
+        }
       } finally {
         if (fd != null) {
           try {

From 2b35c99c7e73d22e82aef90b675709ae7f8d3b4a Mon Sep 17 00:00:00 2001
From: "zhichao.li" <zhichao.li@intel.com>
Date: Sat, 30 May 2015 08:06:11 -0400
Subject: [PATCH 264/525] [SPARK-7717] [WEBUI] Only showing total memory and
 cores for alive workers

Author: zhichao.li <zhichao.li@intel.com>

Closes #6317 from zhichao-li/workers and squashes the following commits:

d68bf11 [zhichao.li] change prefix
99b6768 [zhichao.li] remove extra space and add 'Alive' prefix
1e8eb06 [zhichao.li] only showing alive workers
---
 .../apache/spark/deploy/master/ui/MasterPage.scala  | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala
index 756927682cd24..6a7c74020bace 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala
@@ -75,6 +75,7 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") {
 
     val workerHeaders = Seq("Worker Id", "Address", "State", "Cores", "Memory")
     val workers = state.workers.sortBy(_.id)
+    val aliveWorkers = state.workers.filter(_.state == WorkerState.ALIVE)
     val workerTable = UIUtils.listingTable(workerHeaders, workerRow, workers)
 
     val appHeaders = Seq("Application ID", "Name", "Cores", "Memory per Node", "Submitted Time",
@@ -108,12 +109,12 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") {
                   </li>
                 }.getOrElse { Seq.empty }
               }
-              <li><strong>Workers:</strong> {state.workers.size}</li>
-              <li><strong>Cores:</strong> {state.workers.map(_.cores).sum} Total,
-                {state.workers.map(_.coresUsed).sum} Used</li>
-              <li><strong>Memory:</strong>
-                {Utils.megabytesToString(state.workers.map(_.memory).sum)} Total,
-                {Utils.megabytesToString(state.workers.map(_.memoryUsed).sum)} Used</li>
+              <li><strong>Alive Workers:</strong> {aliveWorkers.size}</li>
+              <li><strong>Cores in use:</strong> {aliveWorkers.map(_.cores).sum} Total,
+                {aliveWorkers.map(_.coresUsed).sum} Used</li>
+              <li><strong>Memory in use:</strong>
+                {Utils.megabytesToString(aliveWorkers.map(_.memory).sum)} Total,
+                {Utils.megabytesToString(aliveWorkers.map(_.memoryUsed).sum)} Used</li>
               <li><strong>Applications:</strong>
                 {state.activeApps.size} Running,
                 {state.completedApps.size} Completed </li>

From 3ab71eb9d5e3fe21af7720421eafa51f6da9b63f Mon Sep 17 00:00:00 2001
From: Taka Shinagawa <taka.epsilon@gmail.com>
Date: Sat, 30 May 2015 08:25:21 -0400
Subject: [PATCH 265/525] [DOCS] [MINOR] Update for the Hadoop versions table
 with hadoop-2.6

Updated the doc for the hadoop-2.6 profile, which is new to Spark 1.4

Author: Taka Shinagawa <taka.epsilon@gmail.com>

Closes #6450 from mrt/docfix2 and squashes the following commits:

db1c43b [Taka Shinagawa] Updated the hadoop versions for hadoop-2.6 profile
323710e [Taka Shinagawa] The hadoop-2.6 profile is added to the Hadoop versions table
---
 docs/building-spark.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/building-spark.md b/docs/building-spark.md
index b4cea158dac93..78cb9086f95e8 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -80,6 +80,7 @@ Because HDFS is not protocol-compatible across versions, if you want to read fro
     <tr><td>2.2.x</td><td>hadoop-2.2</td></tr>
     <tr><td>2.3.x</td><td>hadoop-2.3</td></tr>
     <tr><td>2.4.x</td><td>hadoop-2.4</td></tr>
+    <tr><td>2.6.x and later 2.x</td><td>hadoop-2.6</td></tr>
   </tbody>
 </table>
 

From d34b43bd5964e1feb03a17937de87a3f718806a5 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 30 May 2015 12:06:38 -0700
Subject: [PATCH 266/525] Closes #4685


From 6e3f0c7810a6721698b0ed51cfbd41a0cd07a4a3 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sat, 30 May 2015 12:16:09 -0700
Subject: [PATCH 267/525] [SPARK-7849] [SQL] [Docs] Updates SQL programming
 guide for 1.4

Author: Cheng Lian <lian@databricks.com>

Closes #6520 from liancheng/spark-7849 and squashes the following commits:

705264b [Cheng Lian] Updates SQL programming guide for 1.4
---
 docs/sql-programming-guide.md | 91 ++++++++++++++++++++++++++++++++---
 1 file changed, 85 insertions(+), 6 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 7cc0a87fd5c53..4ec3d83016ac6 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -11,6 +11,7 @@ title: Spark SQL and DataFrames
 
 Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as distributed SQL query engine.
 
+For how to enable Hive support, please refer to the [Hive Tables](#hive-tables) section.
 
 # DataFrames
 
@@ -906,7 +907,7 @@ new data.
   <td>
     Ignore mode means that when saving a DataFrame to a data source, if data already exists,
     the save operation is expected to not save the contents of the DataFrame and to not
-    change the existing data.  This is similar to a `CREATE TABLE IF NOT EXISTS` in SQL.
+    change the existing data.  This is similar to a <code>CREATE TABLE IF NOT EXISTS</code> in SQL.
   </td>
 </tr>
 </table>
@@ -1030,7 +1031,7 @@ teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND a
 teenNames <- map(teenagers, function(p) { paste("Name:", p$name)})
 for (teenName in collect(teenNames)) {
   cat(teenName, "\n")
-} 
+}
 {% endhighlight %}
 
 </div>
@@ -1502,7 +1503,7 @@ Row[] results = sqlContext.sql("FROM src SELECT key, value").collect();
 <div data-lang="python"  markdown="1">
 
 When working with Hive one must construct a `HiveContext`, which inherits from `SQLContext`, and
-adds support for finding tables in the MetaStore and writing queries using HiveQL. 
+adds support for finding tables in the MetaStore and writing queries using HiveQL.
 {% highlight python %}
 # sc is an existing SparkContext.
 from pyspark.sql import HiveContext
@@ -1537,6 +1538,82 @@ results = sqlContext.sql("FROM src SELECT key, value").collect()
 </div>
 </div>
 
+### Interacting with Different Versions of Hive Metastore
+
+One of the most important pieces of Spark SQL's Hive support is interaction with Hive metastore,
+which enables Spark SQL to access metadata of Hive tables.  Starting from Spark 1.2.0, Spark SQL can
+talk to two versions of Hive metastore, either 0.12.0 or 0.13.1, default to the latter.  However, to
+switch to desired Hive metastore version, users have to rebuild the assembly jar with proper profile
+flags (either `-Phive-0.12.0` or `-Phive-0.13.1`), which is quite inconvenient.
+
+Starting from 1.4.0, users no longer need to rebuild the assembly jar to switch Hive metastore
+version.  Instead, configuration properties described in the table below can be used to specify
+desired Hive metastore version.  Currently, supported versions are still limited to 0.13.1 and
+0.12.0, but we are working on a more generalized mechanism to support a wider range of versions.
+
+Internally, Spark SQL 1.4.0 uses two Hive clients, one for executing native Hive commands like `SET`
+and `DESCRIBE`, the other dedicated for communicating with Hive metastore.  The former uses Hive
+jars of version 0.13.1, which are bundled with Spark 1.4.0.  The latter uses Hive jars of the
+version specified by users.  An isolated classloader is used here to avoid dependency conflicts.
+
+<table class="table">
+  <tr><th>Property Name</th><th>Meaning</th></tr>
+  <tr>
+    <td><code>spark.sql.hive.metastore.version</code></td>
+    <td>
+      The version of the hive client that will be used to communicate with the metastore.  Available
+      options are <code>0.12.0</code> and <code>0.13.1</code>.  Defaults to <code>0.13.1</code>.
+    </td>
+  </tr>
+
+  <tr>
+    <td><code>spark.sql.hive.metastore.jars</code></td>
+    <td>
+      The location of the jars that should be used to instantiate the HiveMetastoreClient.  This
+      property can be one of three options:
+      <ol>
+        <li><code>builtin</code></li>
+        Use Hive 0.13.1, which is bundled with the Spark assembly jar when <code>-Phive</code> is
+        enabled.  When this option is chosen, <code>spark.sql.hive.metastore.version</code> must be
+        either <code>0.13.1</code> or not defined.
+        <li><code>maven</code></li>
+        Use Hive jars of specified version downloaded from Maven repositories.
+        <li>A classpath in the standard format for both Hive and Hadoop.</li>
+      </ol>
+      Defaults to <code>builtin</code>.
+    </td>
+  </tr>
+
+  <tr>
+    <td><code>spark.sql.hive.metastore.sharedPrefixes</code></td>
+
+    <td>
+      <p>
+        A comma separated list of class prefixes that should be loaded using the classloader that is
+        shared between Spark SQL and a specific version of Hive. An example of classes that should
+        be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need
+        to be shared are those that interact with classes that are already shared.  For example,
+        custom appenders that are used by log4j.
+      </p>
+      <p>
+        Defaults to <code>com.mysql.jdbc,org.postgresql,com.microsoft.sqlserver,oracle.jdbc</code>.
+      </p>
+    </td>
+  </tr>
+
+  <tr>
+    <td><code>spark.sql.hive.metastore.barrierPrefixes</code></td>
+    <td>
+      <p>
+        A comma separated list of class prefixes that should explicitly be reloaded for each version
+        of Hive that Spark SQL is communicating with.  For example, Hive UDFs that are declared in a
+        prefix that typically would be shared (i.e. <code>org.apache.spark.*</code>).
+      </p>
+      <p>Defaults to empty.</p>
+    </td>
+  </tr>
+</table>
+
 ## JDBC To Other Databases
 
 Spark SQL also includes a data source that can read data from other databases using JDBC.  This
@@ -1570,7 +1647,7 @@ the Data Sources API.  The following options are supported:
   <tr>
     <td><code>dbtable</code></td>
     <td>
-      The JDBC table that should be read.  Note that anything that is valid in a `FROM` clause of
+      The JDBC table that should be read.  Note that anything that is valid in a <code>FROM</code> clause of
       a SQL query can be used.  For example, instead of a full table you could also use a
       subquery in parentheses.
     </td>
@@ -1714,7 +1791,7 @@ that these options will be deprecated in future release as more optimizations ar
       Configures the maximum size in bytes for a table that will be broadcast to all worker nodes when
       performing a join.  By setting this value to -1 broadcasting can be disabled.  Note that currently
       statistics are only supported for Hive Metastore tables where the command
-      `ANALYZE TABLE &lt;tableName&gt; COMPUTE STATISTICS noscan` has been run.
+      <code>ANALYZE TABLE &lt;tableName&gt; COMPUTE STATISTICS noscan</code> has been run.
     </td>
   </tr>
   <tr>
@@ -1737,7 +1814,9 @@ that these options will be deprecated in future release as more optimizations ar
 
 # Distributed SQL Engine
 
-Spark SQL can also act as a distributed query engine using its JDBC/ODBC or command-line interface. In this mode, end-users or applications can interact with Spark SQL directly to run SQL queries, without the need to write any code.
+Spark SQL can also act as a distributed query engine using its JDBC/ODBC or command-line interface.
+In this mode, end-users or applications can interact with Spark SQL directly to run SQL queries,
+without the need to write any code.
 
 ## Running the Thrift JDBC/ODBC server
 

From 7716a5a1ec8ff8dc24e0146f8ead2f51da6512ad Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 30 May 2015 14:57:23 -0700
Subject: [PATCH 268/525] Updated SQL programming guide's Hive connectivity
 section.

---
 docs/sql-programming-guide.md | 46 +++++++++++++----------------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 4ec3d83016ac6..2ea7572c6026a 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1541,79 +1541,67 @@ results = sqlContext.sql("FROM src SELECT key, value").collect()
 ### Interacting with Different Versions of Hive Metastore
 
 One of the most important pieces of Spark SQL's Hive support is interaction with Hive metastore,
-which enables Spark SQL to access metadata of Hive tables.  Starting from Spark 1.2.0, Spark SQL can
-talk to two versions of Hive metastore, either 0.12.0 or 0.13.1, default to the latter.  However, to
-switch to desired Hive metastore version, users have to rebuild the assembly jar with proper profile
-flags (either `-Phive-0.12.0` or `-Phive-0.13.1`), which is quite inconvenient.
+which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below.
 
-Starting from 1.4.0, users no longer need to rebuild the assembly jar to switch Hive metastore
-version.  Instead, configuration properties described in the table below can be used to specify
-desired Hive metastore version.  Currently, supported versions are still limited to 0.13.1 and
-0.12.0, but we are working on a more generalized mechanism to support a wider range of versions.
-
-Internally, Spark SQL 1.4.0 uses two Hive clients, one for executing native Hive commands like `SET`
-and `DESCRIBE`, the other dedicated for communicating with Hive metastore.  The former uses Hive
-jars of version 0.13.1, which are bundled with Spark 1.4.0.  The latter uses Hive jars of the
-version specified by users.  An isolated classloader is used here to avoid dependency conflicts.
+Internally, Spark SQL uses two Hive clients, one for executing native Hive commands like `SET`
+and `DESCRIBE`, the other dedicated for communicating with Hive metastore. The former uses Hive
+jars of version 0.13.1, which are bundled with Spark 1.4.0. The latter uses Hive jars of the
+version specified by users. An isolated classloader is used here to avoid dependency conflicts.
 
 <table class="table">
-  <tr><th>Property Name</th><th>Meaning</th></tr>
+  <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
   <tr>
     <td><code>spark.sql.hive.metastore.version</code></td>
+    <td><code>0.13.1</code></td>
     <td>
-      The version of the hive client that will be used to communicate with the metastore.  Available
-      options are <code>0.12.0</code> and <code>0.13.1</code>.  Defaults to <code>0.13.1</code>.
+      Version of the Hive metastore. Available
+      options are <code>0.12.0</code> and <code>0.13.1</code>. Support for more versions is coming in the future.
     </td>
   </tr>
-
   <tr>
     <td><code>spark.sql.hive.metastore.jars</code></td>
+    <td><code>builtin</code></td>
     <td>
-      The location of the jars that should be used to instantiate the HiveMetastoreClient.  This
+      Location of the jars that should be used to instantiate the HiveMetastoreClient. This
       property can be one of three options:
       <ol>
         <li><code>builtin</code></li>
         Use Hive 0.13.1, which is bundled with the Spark assembly jar when <code>-Phive</code> is
-        enabled.  When this option is chosen, <code>spark.sql.hive.metastore.version</code> must be
+        enabled. When this option is chosen, <code>spark.sql.hive.metastore.version</code> must be
         either <code>0.13.1</code> or not defined.
         <li><code>maven</code></li>
         Use Hive jars of specified version downloaded from Maven repositories.
         <li>A classpath in the standard format for both Hive and Hadoop.</li>
       </ol>
-      Defaults to <code>builtin</code>.
     </td>
   </tr>
-
   <tr>
     <td><code>spark.sql.hive.metastore.sharedPrefixes</code></td>
-
+    <td><code>com.mysql.jdbc,<br/>org.postgresql,<br/>com.microsoft.sqlserver,<br/>oracle.jdbc</code></td>
     <td>
       <p>
         A comma separated list of class prefixes that should be loaded using the classloader that is
         shared between Spark SQL and a specific version of Hive. An example of classes that should
         be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need
-        to be shared are those that interact with classes that are already shared.  For example,
+        to be shared are those that interact with classes that are already shared. For example,
         custom appenders that are used by log4j.
       </p>
-      <p>
-        Defaults to <code>com.mysql.jdbc,org.postgresql,com.microsoft.sqlserver,oracle.jdbc</code>.
-      </p>
     </td>
   </tr>
-
   <tr>
     <td><code>spark.sql.hive.metastore.barrierPrefixes</code></td>
+    <td><code>(empty)</code></td>
     <td>
       <p>
         A comma separated list of class prefixes that should explicitly be reloaded for each version
-        of Hive that Spark SQL is communicating with.  For example, Hive UDFs that are declared in a
+        of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a
         prefix that typically would be shared (i.e. <code>org.apache.spark.*</code>).
       </p>
-      <p>Defaults to empty.</p>
     </td>
   </tr>
 </table>
 
+
 ## JDBC To Other Databases
 
 Spark SQL also includes a data source that can read data from other databases using JDBC.  This

From a6430028ecd7a6130f1eb15af9ec00e242c46725 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sat, 30 May 2015 15:27:51 -0700
Subject: [PATCH 269/525] [SPARK-7855] Move bypassMergeSort-handling from
 ExternalSorter to own component

Spark's `ExternalSorter` writes shuffle output files during sort-based shuffle. Sort-shuffle contains a configuration, `spark.shuffle.sort.bypassMergeThreshold`, which causes ExternalSorter to skip sorting and merging and simply write separate files per partition, which are then concatenated together to form the final map output file.

The code paths used during this bypass are almost completely separate from ExternalSorter's other code paths, so refactoring them into a separate file can significantly simplify the code.

In addition to re-arranging code, this patch deletes a bunch of dead code.  The main entry point into ExternalSorter is `insertAll()` and in SPARK-4479 / #3422 this method was modified to completely bypass in-memory buffering of records when `bypassMergeSort` takes effect. As a result, some of the spilling and merging code paths will no longer be called when `bypassMergeSort` is used, so we should be able to safely remove that code.

There's an open JIRA ([SPARK-6026](https://issues.apache.org/jira/browse/SPARK-6026)) for removing the `bypassMergeThreshold` parameter and code paths; I have not done that here, but the changes in this patch will make removing that parameter significantly easier if we ever decide to do that.

This patch also makes several improvements to shuffle-related tests and adds more defensive checks to certain shuffle classes:

- DiskBlockObjectWriter now throws an exception if `fileSegment()` is called before `commitAndClose()` has been called.
- DiskBlockObjectWriter's close methods are now idempotent, so calling any of the close methods twice in a row will no longer result in incorrect shuffle write metrics changes.  Calling `revertPartialWritesAndClose()` on a closed DiskBlockObjectWriter now has no effect (before, it might mess up the metrics).
- The end-to-end shuffle record count metrics tests have been moved from InputOutputMetricsSuite to ShuffleSuite.  This means that these tests will now be run against all shuffle implementations rather than just the default shuffle configuration.
- The end-to-end metrics tests now include a test of a job which performs aggregation in the shuffle.
- Our tests now check that `shuffleBytesWritten == totalShuffleBytesRead`.
- FileSegment now throws IllegalArgumentException if it is constructed with a negative length or offset.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6397 from JoshRosen/external-sorter-bypass-cleanup and squashes the following commits:

bf3f3f6 [Josh Rosen] Merge remote-tracking branch 'origin/master' into external-sorter-bypass-cleanup
8b216c4 [Josh Rosen] Guard against negative offsets and lengths in FileSegment
03f35a4 [Josh Rosen] Minor fix to cleanup logic.
b5cc35b [Josh Rosen] Move shuffle metrics tests to ShuffleSuite.
8b8fb9e [Josh Rosen] Add more tests + defensive programming to DiskBlockObjectWriter.
16564eb [Josh Rosen] Guard against calling fileSegment() before commitAndClose() has been called.
96811b4 [Josh Rosen] Remove confusing taskMetrics.shuffleWriteMetrics() optional call
8522b6a [Josh Rosen] Do not perform a map-side sort unless we're also doing map-side aggregation
08e40f3 [Josh Rosen] Remove excessively clever (and wrong) implementation of newBuffer()
d7f9938 [Josh Rosen] Add missing overrides; fix compilation
71d76ff [Josh Rosen] Update Javadoc
bf0d98f [Josh Rosen] Add comment to clarify confusing factory code
5197f73 [Josh Rosen] Add missing private[this]
30ef2c8 [Josh Rosen] Convert BypassMergeSortShuffleWriter to Java
bc1a820 [Josh Rosen] Fix bug when aggregator is used but map-side combine is disabled
0d3dcc0 [Josh Rosen] Remove unnecessary overloaded methods
25b964f [Josh Rosen] Rename SortShuffleSorter to SortShuffleFileWriter
0d9848c [Josh Rosen] Make it more clear that curWriteMetrics is now only used for spill metrics
7af7aea [Josh Rosen] Combine spill() and spillToMergeableFile()
6320112 [Josh Rosen] Add missing negation in deletion success check.
d267e0d [Josh Rosen] Fix style issue
7f15f7b [Josh Rosen] Back out extra cleanup-handling code, since this is already covered in stop()
25aa3bd [Josh Rosen] Make sure to delete outputFile after errors.
931ca68 [Josh Rosen] Refactor tests.
6a35716 [Josh Rosen] Refactor logic for deciding when to bypass
4b03539 [Josh Rosen] Move conf prior to first use
1265b25 [Josh Rosen] Fix some style errors and comments.
02355ef [Josh Rosen] More simplification
d4cb536 [Josh Rosen] Delete more unused code
bb96678 [Josh Rosen] Add missing interface file
b6cc1eb [Josh Rosen] Realize that bypass never buffers; proceed to delete tons of code
6185ee2 [Josh Rosen] WIP towards moving bypass code into own file.
8d0678c [Josh Rosen] Move diskBytesSpilled getter next to variable
19bccd6 [Josh Rosen] Remove duplicated buffer creation code.
18959bb [Josh Rosen] Move comparator methods closer together.
---
 .../sort/BypassMergeSortShuffleWriter.java    | 184 +++++++++++++
 .../shuffle/sort/SortShuffleFileWriter.java   |  53 ++++
 .../shuffle/sort/SortShuffleWriter.scala      |  34 ++-
 .../spark/storage/BlockObjectWriter.scala     |  19 +-
 .../apache/spark/storage/FileSegment.scala    |   2 +
 .../util/collection/ExternalSorter.scala      | 260 +++++-------------
 .../spark/util/collection/PairIterator.scala  |  24 --
 .../collection/PartitionedAppendOnlyMap.scala |   4 -
 .../collection/PartitionedPairBuffer.scala    |   4 -
 .../PartitionedSerializedPairBuffer.scala     |   4 -
 .../WritablePartitionedPairCollection.scala   |  36 +--
 .../scala/org/apache/spark/ShuffleSuite.scala |  65 +++++
 .../metrics/InputOutputMetricsSuite.scala     |  28 --
 .../BypassMergeSortShuffleWriterSuite.scala   | 171 ++++++++++++
 .../shuffle/sort/SortShuffleWriterSuite.scala |  46 ++++
 .../storage/BlockObjectWriterSuite.scala      |  97 ++++++-
 .../util/collection/ExternalSorterSuite.scala | 130 +--------
 17 files changed, 738 insertions(+), 423 deletions(-)
 create mode 100644 core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
 create mode 100644 core/src/main/java/org/apache/spark/shuffle/sort/SortShuffleFileWriter.java
 delete mode 100644 core/src/main/scala/org/apache/spark/util/collection/PairIterator.scala
 create mode 100644 core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
 create mode 100644 core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala

diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
new file mode 100644
index 0000000000000..d3d6280284beb
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.sort;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import scala.Product2;
+import scala.Tuple2;
+import scala.collection.Iterator;
+
+import com.google.common.io.Closeables;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.spark.Partitioner;
+import org.apache.spark.SparkConf;
+import org.apache.spark.TaskContext;
+import org.apache.spark.executor.ShuffleWriteMetrics;
+import org.apache.spark.serializer.Serializer;
+import org.apache.spark.serializer.SerializerInstance;
+import org.apache.spark.storage.*;
+import org.apache.spark.util.Utils;
+
+/**
+ * This class implements sort-based shuffle's hash-style shuffle fallback path. This write path
+ * writes incoming records to separate files, one file per reduce partition, then concatenates these
+ * per-partition files to form a single output file, regions of which are served to reducers.
+ * Records are not buffered in memory. This is essentially identical to
+ * {@link org.apache.spark.shuffle.hash.HashShuffleWriter}, except that it writes output in a format
+ * that can be served / consumed via {@link org.apache.spark.shuffle.IndexShuffleBlockResolver}.
+ * <p>
+ * This write path is inefficient for shuffles with large numbers of reduce partitions because it
+ * simultaneously opens separate serializers and file streams for all partitions. As a result,
+ * {@link SortShuffleManager} only selects this write path when
+ * <ul>
+ *    <li>no Ordering is specified,</li>
+ *    <li>no Aggregator is specific, and</li>
+ *    <li>the number of partitions is less than
+ *      <code>spark.shuffle.sort.bypassMergeThreshold</code>.</li>
+ * </ul>
+ *
+ * This code used to be part of {@link org.apache.spark.util.collection.ExternalSorter} but was
+ * refactored into its own class in order to reduce code complexity; see SPARK-7855 for details.
+ * <p>
+ * There have been proposals to completely remove this code path; see SPARK-6026 for details.
+ */
+final class BypassMergeSortShuffleWriter<K, V> implements SortShuffleFileWriter<K, V> {
+
+  private final Logger logger = LoggerFactory.getLogger(BypassMergeSortShuffleWriter.class);
+
+  private final int fileBufferSize;
+  private final boolean transferToEnabled;
+  private final int numPartitions;
+  private final BlockManager blockManager;
+  private final Partitioner partitioner;
+  private final ShuffleWriteMetrics writeMetrics;
+  private final Serializer serializer;
+
+  /** Array of file writers, one for each partition */
+  private BlockObjectWriter[] partitionWriters;
+
+  public BypassMergeSortShuffleWriter(
+      SparkConf conf,
+      BlockManager blockManager,
+      Partitioner partitioner,
+      ShuffleWriteMetrics writeMetrics,
+      Serializer serializer) {
+    // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
+    this.fileBufferSize = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
+    this.transferToEnabled = conf.getBoolean("spark.file.transferTo", true);
+    this.numPartitions = partitioner.numPartitions();
+    this.blockManager = blockManager;
+    this.partitioner = partitioner;
+    this.writeMetrics = writeMetrics;
+    this.serializer = serializer;
+  }
+
+  @Override
+  public void insertAll(Iterator<Product2<K, V>> records) throws IOException {
+    assert (partitionWriters == null);
+    if (!records.hasNext()) {
+      return;
+    }
+    final SerializerInstance serInstance = serializer.newInstance();
+    final long openStartTime = System.nanoTime();
+    partitionWriters = new BlockObjectWriter[numPartitions];
+    for (int i = 0; i < numPartitions; i++) {
+      final Tuple2<TempShuffleBlockId, File> tempShuffleBlockIdPlusFile =
+        blockManager.diskBlockManager().createTempShuffleBlock();
+      final File file = tempShuffleBlockIdPlusFile._2();
+      final BlockId blockId = tempShuffleBlockIdPlusFile._1();
+      partitionWriters[i] =
+        blockManager.getDiskWriter(blockId, file, serInstance, fileBufferSize, writeMetrics).open();
+    }
+    // Creating the file to write to and creating a disk writer both involve interacting with
+    // the disk, and can take a long time in aggregate when we open many files, so should be
+    // included in the shuffle write time.
+    writeMetrics.incShuffleWriteTime(System.nanoTime() - openStartTime);
+
+    while (records.hasNext()) {
+      final Product2<K, V> record = records.next();
+      final K key = record._1();
+      partitionWriters[partitioner.getPartition(key)].write(key, record._2());
+    }
+
+    for (BlockObjectWriter writer : partitionWriters) {
+      writer.commitAndClose();
+    }
+  }
+
+  @Override
+  public long[] writePartitionedFile(
+      BlockId blockId,
+      TaskContext context,
+      File outputFile) throws IOException {
+    // Track location of the partition starts in the output file
+    final long[] lengths = new long[numPartitions];
+    if (partitionWriters == null) {
+      // We were passed an empty iterator
+      return lengths;
+    }
+
+    final FileOutputStream out = new FileOutputStream(outputFile, true);
+    final long writeStartTime = System.nanoTime();
+    boolean threwException = true;
+    try {
+      for (int i = 0; i < numPartitions; i++) {
+        final FileInputStream in = new FileInputStream(partitionWriters[i].fileSegment().file());
+        boolean copyThrewException = true;
+        try {
+          lengths[i] = Utils.copyStream(in, out, false, transferToEnabled);
+          copyThrewException = false;
+        } finally {
+          Closeables.close(in, copyThrewException);
+        }
+        if (!blockManager.diskBlockManager().getFile(partitionWriters[i].blockId()).delete()) {
+          logger.error("Unable to delete file for partition {}", i);
+        }
+      }
+      threwException = false;
+    } finally {
+      Closeables.close(out, threwException);
+      writeMetrics.incShuffleWriteTime(System.nanoTime() - writeStartTime);
+    }
+    partitionWriters = null;
+    return lengths;
+  }
+
+  @Override
+  public void stop() throws IOException {
+    if (partitionWriters != null) {
+      try {
+        final DiskBlockManager diskBlockManager = blockManager.diskBlockManager();
+        for (BlockObjectWriter writer : partitionWriters) {
+          // This method explicitly does _not_ throw exceptions:
+          writer.revertPartialWritesAndClose();
+          if (!diskBlockManager.getFile(writer.blockId()).delete()) {
+            logger.error("Error while deleting file for block {}", writer.blockId());
+          }
+        }
+      } finally {
+        partitionWriters = null;
+      }
+    }
+  }
+}
diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/SortShuffleFileWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/SortShuffleFileWriter.java
new file mode 100644
index 0000000000000..656ea0401a144
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/shuffle/sort/SortShuffleFileWriter.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.sort;
+
+import java.io.File;
+import java.io.IOException;
+
+import scala.Product2;
+import scala.collection.Iterator;
+
+import org.apache.spark.annotation.Private;
+import org.apache.spark.TaskContext;
+import org.apache.spark.storage.BlockId;
+
+/**
+ * Interface for objects that {@link SortShuffleWriter} uses to write its output files.
+ */
+@Private
+public interface SortShuffleFileWriter<K, V> {
+
+  void insertAll(Iterator<Product2<K, V>> records) throws IOException;
+
+  /**
+   * Write all the data added into this shuffle sorter into a file in the disk store. This is
+   * called by the SortShuffleWriter and can go through an efficient path of just concatenating
+   * binary files if we decided to avoid merge-sorting.
+   *
+   * @param blockId block ID to write to. The index file will be blockId.name + ".index".
+   * @param context a TaskContext for a running Spark task, for us to update shuffle metrics.
+   * @return array of lengths, in bytes, of each partition of the file (used by map output tracker)
+   */
+  long[] writePartitionedFile(
+      BlockId blockId,
+      TaskContext context,
+      File outputFile) throws IOException;
+
+  void stop() throws IOException;
+}
diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala
index c9dd6bfc4c219..5865e7640c1cf 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala
@@ -17,9 +17,10 @@
 
 package org.apache.spark.shuffle.sort
 
-import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext}
+import org.apache.spark._
 import org.apache.spark.executor.ShuffleWriteMetrics
 import org.apache.spark.scheduler.MapStatus
+import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriter, BaseShuffleHandle}
 import org.apache.spark.storage.ShuffleBlockId
 import org.apache.spark.util.collection.ExternalSorter
@@ -35,7 +36,7 @@ private[spark] class SortShuffleWriter[K, V, C](
 
   private val blockManager = SparkEnv.get.blockManager
 
-  private var sorter: ExternalSorter[K, V, _] = null
+  private var sorter: SortShuffleFileWriter[K, V] = null
 
   // Are we in the process of stopping? Because map tasks can call stop() with success = true
   // and then call stop() with success = false if they get an exception, we want to make sure
@@ -49,18 +50,27 @@ private[spark] class SortShuffleWriter[K, V, C](
 
   /** Write a bunch of records to this task's output */
   override def write(records: Iterator[Product2[K, V]]): Unit = {
-    if (dep.mapSideCombine) {
+    sorter = if (dep.mapSideCombine) {
       require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
-      sorter = new ExternalSorter[K, V, C](
+      new ExternalSorter[K, V, C](
         dep.aggregator, Some(dep.partitioner), dep.keyOrdering, dep.serializer)
-      sorter.insertAll(records)
+    } else if (SortShuffleWriter.shouldBypassMergeSort(
+        SparkEnv.get.conf, dep.partitioner.numPartitions, aggregator = None, keyOrdering = None)) {
+      // If there are fewer than spark.shuffle.sort.bypassMergeThreshold partitions and we don't
+      // need local aggregation and sorting, write numPartitions files directly and just concatenate
+      // them at the end. This avoids doing serialization and deserialization twice to merge
+      // together the spilled files, which would happen with the normal code path. The downside is
+      // having multiple files open at a time and thus more memory allocated to buffers.
+      new BypassMergeSortShuffleWriter[K, V](SparkEnv.get.conf, blockManager, dep.partitioner,
+        writeMetrics, Serializer.getSerializer(dep.serializer))
     } else {
       // In this case we pass neither an aggregator nor an ordering to the sorter, because we don't
       // care whether the keys get sorted in each partition; that will be done on the reduce side
       // if the operation being run is sortByKey.
-      sorter = new ExternalSorter[K, V, V](None, Some(dep.partitioner), None, dep.serializer)
-      sorter.insertAll(records)
+      new ExternalSorter[K, V, V](
+        aggregator = None, Some(dep.partitioner), ordering = None, dep.serializer)
     }
+    sorter.insertAll(records)
 
     // Don't bother including the time to open the merged output file in the shuffle write time,
     // because it just opens a single file, so is typically too fast to measure accurately
@@ -100,3 +110,13 @@ private[spark] class SortShuffleWriter[K, V, C](
   }
 }
 
+private[spark] object SortShuffleWriter {
+  def shouldBypassMergeSort(
+      conf: SparkConf,
+      numPartitions: Int,
+      aggregator: Option[Aggregator[_, _, _]],
+      keyOrdering: Option[Ordering[_]]): Boolean = {
+    val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
+    numPartitions <= bypassMergeThreshold && aggregator.isEmpty && keyOrdering.isEmpty
+  }
+}
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala
index a33f22ef52687..7eeabd1e0489c 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala
@@ -95,6 +95,7 @@ private[spark] class DiskBlockObjectWriter(
   private var objOut: SerializationStream = null
   private var initialized = false
   private var hasBeenClosed = false
+  private var commitAndCloseHasBeenCalled = false
 
   /**
    * Cursors used to represent positions in the file.
@@ -167,20 +168,22 @@ private[spark] class DiskBlockObjectWriter(
       objOut.flush()
       bs.flush()
       close()
+      finalPosition = file.length()
+      // In certain compression codecs, more bytes are written after close() is called
+      writeMetrics.incShuffleBytesWritten(finalPosition - reportedPosition)
+    } else {
+      finalPosition = file.length()
     }
-    finalPosition = file.length()
-    // In certain compression codecs, more bytes are written after close() is called
-    writeMetrics.incShuffleBytesWritten(finalPosition - reportedPosition)
+    commitAndCloseHasBeenCalled = true
   }
 
   // Discard current writes. We do this by flushing the outstanding writes and then
   // truncating the file to its initial position.
   override def revertPartialWritesAndClose() {
     try {
-      writeMetrics.decShuffleBytesWritten(reportedPosition - initialPosition)
-      writeMetrics.decShuffleRecordsWritten(numRecordsWritten)
-
       if (initialized) {
+        writeMetrics.decShuffleBytesWritten(reportedPosition - initialPosition)
+        writeMetrics.decShuffleRecordsWritten(numRecordsWritten)
         objOut.flush()
         bs.flush()
         close()
@@ -228,6 +231,10 @@ private[spark] class DiskBlockObjectWriter(
   }
 
   override def fileSegment(): FileSegment = {
+    if (!commitAndCloseHasBeenCalled) {
+      throw new IllegalStateException(
+        "fileSegment() is only valid after commitAndClose() has been called")
+    }
     new FileSegment(file, initialPosition, finalPosition - initialPosition)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/storage/FileSegment.scala b/core/src/main/scala/org/apache/spark/storage/FileSegment.scala
index 95e2d688d9b17..021a9facfb0b2 100644
--- a/core/src/main/scala/org/apache/spark/storage/FileSegment.scala
+++ b/core/src/main/scala/org/apache/spark/storage/FileSegment.scala
@@ -24,6 +24,8 @@ import java.io.File
  * based off an offset and a length.
  */
 private[spark] class FileSegment(val file: File, val offset: Long, val length: Long) {
+  require(offset >= 0, s"File segment offset cannot be negative (got $offset)")
+  require(length >= 0, s"File segment length cannot be negative (got $length)")
   override def toString: String = {
     "(name=%s, offset=%d, length=%d)".format(file.getName, offset, length)
   }
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
index 3b9d14f9372b6..ef2dbb7ff0ae0 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
@@ -23,12 +23,14 @@ import java.util.Comparator
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable
 
+import com.google.common.annotations.VisibleForTesting
 import com.google.common.io.ByteStreams
 
 import org.apache.spark._
 import org.apache.spark.serializer._
 import org.apache.spark.executor.ShuffleWriteMetrics
-import org.apache.spark.storage.{BlockObjectWriter, BlockId}
+import org.apache.spark.shuffle.sort.{SortShuffleFileWriter, SortShuffleWriter}
+import org.apache.spark.storage.{BlockId, BlockObjectWriter}
 
 /**
  * Sorts and potentially merges a number of key-value pairs of type (K, V) to produce key-combiner
@@ -84,35 +86,40 @@ import org.apache.spark.storage.{BlockObjectWriter, BlockId}
  *   each other for equality to merge values.
  *
  * - Users are expected to call stop() at the end to delete all the intermediate files.
- *
- * As a special case, if no Ordering and no Aggregator is given, and the number of partitions is
- * less than spark.shuffle.sort.bypassMergeThreshold, we bypass the merge-sort and just write to
- * separate files for each partition each time we spill, similar to the HashShuffleWriter. We can
- * then concatenate these files to produce a single sorted file, without having to serialize and
- * de-serialize each item twice (as is needed during the merge). This speeds up the map side of
- * groupBy, sort, etc operations since they do no partial aggregation.
  */
 private[spark] class ExternalSorter[K, V, C](
     aggregator: Option[Aggregator[K, V, C]] = None,
     partitioner: Option[Partitioner] = None,
     ordering: Option[Ordering[K]] = None,
     serializer: Option[Serializer] = None)
-  extends Logging with Spillable[WritablePartitionedPairCollection[K, C]] {
+  extends Logging
+  with Spillable[WritablePartitionedPairCollection[K, C]]
+  with SortShuffleFileWriter[K, V] {
+
+  private val conf = SparkEnv.get.conf
 
   private val numPartitions = partitioner.map(_.numPartitions).getOrElse(1)
   private val shouldPartition = numPartitions > 1
+  private def getPartition(key: K): Int = {
+    if (shouldPartition) partitioner.get.getPartition(key) else 0
+  }
+
+  // Since SPARK-7855, bypassMergeSort optimization is no longer performed as part of this class.
+  // As a sanity check, make sure that we're not handling a shuffle which should use that path.
+  if (SortShuffleWriter.shouldBypassMergeSort(conf, numPartitions, aggregator, ordering)) {
+    throw new IllegalArgumentException("ExternalSorter should not be used to handle "
+      + " a sort that the BypassMergeSortShuffleWriter should handle")
+  }
 
   private val blockManager = SparkEnv.get.blockManager
   private val diskBlockManager = blockManager.diskBlockManager
   private val ser = Serializer.getSerializer(serializer)
   private val serInstance = ser.newInstance()
 
-  private val conf = SparkEnv.get.conf
   private val spillingEnabled = conf.getBoolean("spark.shuffle.spill", true)
   
   // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
   private val fileBufferSize = conf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024
-  private val transferToEnabled = conf.getBoolean("spark.file.transferTo", true)
 
   // Size of object batches when reading/writing from serializers.
   //
@@ -123,43 +130,28 @@ private[spark] class ExternalSorter[K, V, C](
   // grow internal data structures by growing + copying every time the number of objects doubles.
   private val serializerBatchSize = conf.getLong("spark.shuffle.spill.batchSize", 10000)
 
-  private def getPartition(key: K): Int = {
-    if (shouldPartition) partitioner.get.getPartition(key) else 0
-  }
-
-  private val metaInitialRecords = 256
-  private val kvChunkSize = conf.getInt("spark.shuffle.sort.kvChunkSize", 1 << 22) // 4 MB
   private val useSerializedPairBuffer =
-    !ordering.isDefined && conf.getBoolean("spark.shuffle.sort.serializeMapOutputs", true) &&
-    ser.supportsRelocationOfSerializedObjects
-
+    ordering.isEmpty &&
+      conf.getBoolean("spark.shuffle.sort.serializeMapOutputs", true) &&
+      ser.supportsRelocationOfSerializedObjects
+  private val kvChunkSize = conf.getInt("spark.shuffle.sort.kvChunkSize", 1 << 22) // 4 MB
+  private def newBuffer(): WritablePartitionedPairCollection[K, C] with SizeTracker = {
+    if (useSerializedPairBuffer) {
+      new PartitionedSerializedPairBuffer(metaInitialRecords = 256, kvChunkSize, serInstance)
+    } else {
+      new PartitionedPairBuffer[K, C]
+    }
+  }
   // Data structures to store in-memory objects before we spill. Depending on whether we have an
   // Aggregator set, we either put objects into an AppendOnlyMap where we combine them, or we
   // store them in an array buffer.
   private var map = new PartitionedAppendOnlyMap[K, C]
-  private var buffer = if (useSerializedPairBuffer) {
-    new PartitionedSerializedPairBuffer[K, C](metaInitialRecords, kvChunkSize, serInstance)
-  } else {
-    new PartitionedPairBuffer[K, C]
-  }
+  private var buffer = newBuffer()
 
   // Total spilling statistics
   private var _diskBytesSpilled = 0L
+  def diskBytesSpilled: Long = _diskBytesSpilled
 
-  // Write metrics for current spill
-  private var curWriteMetrics: ShuffleWriteMetrics = _
-
-  // If there are fewer than spark.shuffle.sort.bypassMergeThreshold partitions and we don't need
-  // local aggregation and sorting, write numPartitions files directly and just concatenate them
-  // at the end. This avoids doing serialization and deserialization twice to merge together the
-  // spilled files, which would happen with the normal code path. The downside is having multiple
-  // files open at a time and thus more memory allocated to buffers.
-  private val bypassMergeThreshold = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
-  private val bypassMergeSort =
-    (numPartitions <= bypassMergeThreshold && aggregator.isEmpty && ordering.isEmpty)
-
-  // Array of file writers for each partition, used if bypassMergeSort is true and we've spilled
-  private var partitionWriters: Array[BlockObjectWriter] = null
 
   // A comparator for keys K that orders them within a partition to allow aggregation or sorting.
   // Can be a partial ordering by hash code if a total ordering is not provided through by the
@@ -174,6 +166,14 @@ private[spark] class ExternalSorter[K, V, C](
     }
   })
 
+  private def comparator: Option[Comparator[K]] = {
+    if (ordering.isDefined || aggregator.isDefined) {
+      Some(keyComparator)
+    } else {
+      None
+    }
+  }
+
   // Information about a spilled file. Includes sizes in bytes of "batches" written by the
   // serializer as we periodically reset its stream, as well as number of elements in each
   // partition, used to efficiently keep track of partitions when merging.
@@ -182,9 +182,10 @@ private[spark] class ExternalSorter[K, V, C](
     blockId: BlockId,
     serializerBatchSizes: Array[Long],
     elementsPerPartition: Array[Long])
+
   private val spills = new ArrayBuffer[SpilledFile]
 
-  def insertAll(records: Iterator[_ <: Product2[K, V]]): Unit = {
+  override def insertAll(records: Iterator[Product2[K, V]]): Unit = {
     // TODO: stop combining if we find that the reduction factor isn't high
     val shouldCombine = aggregator.isDefined
 
@@ -202,15 +203,6 @@ private[spark] class ExternalSorter[K, V, C](
         map.changeValue((getPartition(kv._1), kv._1), update)
         maybeSpillCollection(usingMap = true)
       }
-    } else if (bypassMergeSort) {
-      // SPARK-4479: Also bypass buffering if merge sort is bypassed to avoid defensive copies
-      if (records.hasNext) {
-        spillToPartitionFiles(
-          WritablePartitionedIterator.fromIterator(records.map { kv =>
-            ((getPartition(kv._1), kv._1), kv._2.asInstanceOf[C])
-          })
-        )
-      }
     } else {
       // Stick values into our buffer
       while (records.hasNext) {
@@ -238,46 +230,33 @@ private[spark] class ExternalSorter[K, V, C](
       }
     } else {
       if (maybeSpill(buffer, buffer.estimateSize())) {
-        buffer = if (useSerializedPairBuffer) {
-          new PartitionedSerializedPairBuffer[K, C](metaInitialRecords, kvChunkSize, serInstance)
-        } else {
-          new PartitionedPairBuffer[K, C]
-        }
+        buffer = newBuffer()
       }
     }
   }
 
   /**
-   * Spill the current in-memory collection to disk, adding a new file to spills, and clear it.
-   */
-  override protected[this] def spill(collection: WritablePartitionedPairCollection[K, C]): Unit = {
-    if (bypassMergeSort) {
-      spillToPartitionFiles(collection)
-    } else {
-      spillToMergeableFile(collection)
-    }
-  }
-
-  /**
-   * Spill our in-memory collection to a sorted file that we can merge later (normal code path).
-   * We add this file into spilledFiles to find it later.
-   *
-   * This should not be invoked if bypassMergeSort is true. In that case, spillToPartitionedFiles()
-   * is used to write files for each partition.
+   * Spill our in-memory collection to a sorted file that we can merge later.
+   * We add this file into `spilledFiles` to find it later.
    *
    * @param collection whichever collection we're using (map or buffer)
    */
-  private def spillToMergeableFile(collection: WritablePartitionedPairCollection[K, C]): Unit = {
-    assert(!bypassMergeSort)
-
+  override protected[this] def spill(collection: WritablePartitionedPairCollection[K, C]): Unit = {
     // Because these files may be read during shuffle, their compression must be controlled by
     // spark.shuffle.compress instead of spark.shuffle.spill.compress, so we need to use
     // createTempShuffleBlock here; see SPARK-3426 for more context.
     val (blockId, file) = diskBlockManager.createTempShuffleBlock()
-    curWriteMetrics = new ShuffleWriteMetrics()
-    var writer = blockManager.getDiskWriter(
-      blockId, file, serInstance, fileBufferSize, curWriteMetrics)
-    var objectsWritten = 0   // Objects written since the last flush
+
+    // These variables are reset after each flush
+    var objectsWritten: Long = 0
+    var spillMetrics: ShuffleWriteMetrics = null
+    var writer: BlockObjectWriter = null
+    def openWriter(): Unit = {
+      assert (writer == null && spillMetrics == null)
+      spillMetrics = new ShuffleWriteMetrics
+      writer = blockManager.getDiskWriter(blockId, file, serInstance, fileBufferSize, spillMetrics)
+    }
+    openWriter()
 
     // List of batch sizes (bytes) in the order they are written to disk
     val batchSizes = new ArrayBuffer[Long]
@@ -291,8 +270,9 @@ private[spark] class ExternalSorter[K, V, C](
       val w = writer
       writer = null
       w.commitAndClose()
-      _diskBytesSpilled += curWriteMetrics.shuffleBytesWritten
-      batchSizes.append(curWriteMetrics.shuffleBytesWritten)
+      _diskBytesSpilled += spillMetrics.shuffleBytesWritten
+      batchSizes.append(spillMetrics.shuffleBytesWritten)
+      spillMetrics = null
       objectsWritten = 0
     }
 
@@ -307,9 +287,7 @@ private[spark] class ExternalSorter[K, V, C](
 
         if (objectsWritten == serializerBatchSize) {
           flush()
-          curWriteMetrics = new ShuffleWriteMetrics()
-          writer = blockManager.getDiskWriter(
-            blockId, file, serInstance, fileBufferSize, curWriteMetrics)
+          openWriter()
         }
       }
       if (objectsWritten > 0) {
@@ -336,46 +314,6 @@ private[spark] class ExternalSorter[K, V, C](
     spills.append(SpilledFile(file, blockId, batchSizes.toArray, elementsPerPartition))
   }
 
-  /**
-   * Spill our in-memory collection to separate files, one for each partition. This is used when
-   * there's no aggregator and ordering and the number of partitions is small, because it allows
-   * writePartitionedFile to just concatenate files without deserializing data.
-   *
-   * @param collection whichever collection we're using (map or buffer)
-   */
-  private def spillToPartitionFiles(collection: WritablePartitionedPairCollection[K, C]): Unit = {
-    spillToPartitionFiles(collection.writablePartitionedIterator())
-  }
-
-  private def spillToPartitionFiles(iterator: WritablePartitionedIterator): Unit = {
-    assert(bypassMergeSort)
-
-    // Create our file writers if we haven't done so yet
-    if (partitionWriters == null) {
-      curWriteMetrics = new ShuffleWriteMetrics()
-      val openStartTime = System.nanoTime
-      partitionWriters = Array.fill(numPartitions) {
-        // Because these files may be read during shuffle, their compression must be controlled by
-        // spark.shuffle.compress instead of spark.shuffle.spill.compress, so we need to use
-        // createTempShuffleBlock here; see SPARK-3426 for more context.
-        val (blockId, file) = diskBlockManager.createTempShuffleBlock()
-        val writer = blockManager.getDiskWriter(blockId, file, serInstance, fileBufferSize,
-          curWriteMetrics)
-        writer.open()
-      }
-      // Creating the file to write to and creating a disk writer both involve interacting with
-      // the disk, and can take a long time in aggregate when we open many files, so should be
-      // included in the shuffle write time.
-      curWriteMetrics.incShuffleWriteTime(System.nanoTime - openStartTime)
-    }
-
-    // No need to sort stuff, just write each element out
-    while (iterator.hasNext) {
-      val partitionId = iterator.nextPartition()
-      iterator.writeNext(partitionWriters(partitionId))
-    }
-  }
-
   /**
    * Merge a sequence of sorted files, giving an iterator over partitions and then over elements
    * inside each partition. This can be used to either write out a new file or return data to
@@ -665,8 +603,6 @@ private[spark] class ExternalSorter[K, V, C](
   }
 
   /**
-   * Exposed for testing purposes.
-   *
    * Return an iterator over all the data written to this object, grouped by partition and
    * aggregated by the requested aggregator. For each partition we then have an iterator over its
    * contents, and these are expected to be accessed in order (you can't "skip ahead" to one
@@ -676,10 +612,11 @@ private[spark] class ExternalSorter[K, V, C](
    * For now, we just merge all the spilled files in once pass, but this can be modified to
    * support hierarchical merging.
    */
-   def partitionedIterator: Iterator[(Int, Iterator[Product2[K, C]])] = {
+  @VisibleForTesting
+  def partitionedIterator: Iterator[(Int, Iterator[Product2[K, C]])] = {
     val usingMap = aggregator.isDefined
     val collection: WritablePartitionedPairCollection[K, C] = if (usingMap) map else buffer
-    if (spills.isEmpty && partitionWriters == null) {
+    if (spills.isEmpty) {
       // Special case: if we have only in-memory data, we don't need to merge streams, and perhaps
       // we don't even need to sort by anything other than partition ID
       if (!ordering.isDefined) {
@@ -689,13 +626,6 @@ private[spark] class ExternalSorter[K, V, C](
         // We do need to sort by both partition ID and key
         groupByPartition(collection.partitionedDestructiveSortedIterator(Some(keyComparator)))
       }
-    } else if (bypassMergeSort) {
-      // Read data from each partition file and merge it together with the data in memory;
-      // note that there's no ordering or aggregator in this case -- we just partition objects
-      val collIter = groupByPartition(collection.partitionedDestructiveSortedIterator(None))
-      collIter.map { case (partitionId, values) =>
-        (partitionId, values ++ readPartitionFile(partitionWriters(partitionId)))
-      }
     } else {
       // Merge spilled and in-memory data
       merge(spills, collection.partitionedDestructiveSortedIterator(comparator))
@@ -709,14 +639,13 @@ private[spark] class ExternalSorter[K, V, C](
 
   /**
    * Write all the data added into this ExternalSorter into a file in the disk store. This is
-   * called by the SortShuffleWriter and can go through an efficient path of just concatenating
-   * binary files if we decided to avoid merge-sorting.
+   * called by the SortShuffleWriter.
    *
    * @param blockId block ID to write to. The index file will be blockId.name + ".index".
    * @param context a TaskContext for a running Spark task, for us to update shuffle metrics.
    * @return array of lengths, in bytes, of each partition of the file (used by map output tracker)
    */
-  def writePartitionedFile(
+  override def writePartitionedFile(
       blockId: BlockId,
       context: TaskContext,
       outputFile: File): Array[Long] = {
@@ -724,28 +653,7 @@ private[spark] class ExternalSorter[K, V, C](
     // Track location of each range in the output file
     val lengths = new Array[Long](numPartitions)
 
-    if (bypassMergeSort && partitionWriters != null) {
-      // We decided to write separate files for each partition, so just concatenate them. To keep
-      // this simple we spill out the current in-memory collection so that everything is in files.
-      spillToPartitionFiles(if (aggregator.isDefined) map else buffer)
-      partitionWriters.foreach(_.commitAndClose())
-      val out = new FileOutputStream(outputFile, true)
-      val writeStartTime = System.nanoTime
-      util.Utils.tryWithSafeFinally {
-        for (i <- 0 until numPartitions) {
-          val in = new FileInputStream(partitionWriters(i).fileSegment().file)
-          util.Utils.tryWithSafeFinally {
-            lengths(i) = org.apache.spark.util.Utils.copyStream(in, out, false, transferToEnabled)
-          } {
-            in.close()
-          }
-        }
-      } {
-        out.close()
-        context.taskMetrics.shuffleWriteMetrics.foreach(
-          _.incShuffleWriteTime(System.nanoTime - writeStartTime))
-      }
-    } else if (spills.isEmpty && partitionWriters == null) {
+    if (spills.isEmpty) {
       // Case where we only have in-memory data
       val collection = if (aggregator.isDefined) map else buffer
       val it = collection.destructiveSortedWritablePartitionedIterator(comparator)
@@ -761,7 +669,7 @@ private[spark] class ExternalSorter[K, V, C](
         lengths(partitionId) = segment.length
       }
     } else {
-      // Not bypassing merge-sort; get an iterator by partition and just write everything directly.
+      // We must perform merge-sort; get an iterator by partition and write everything directly.
       for ((id, elements) <- this.partitionedIterator) {
         if (elements.hasNext) {
           val writer = blockManager.getDiskWriter(blockId, outputFile, serInstance, fileBufferSize,
@@ -778,41 +686,15 @@ private[spark] class ExternalSorter[K, V, C](
 
     context.taskMetrics.incMemoryBytesSpilled(memoryBytesSpilled)
     context.taskMetrics.incDiskBytesSpilled(diskBytesSpilled)
-    context.taskMetrics.shuffleWriteMetrics.filter(_ => bypassMergeSort).foreach { m =>
-      if (curWriteMetrics != null) {
-        m.incShuffleBytesWritten(curWriteMetrics.shuffleBytesWritten)
-        m.incShuffleWriteTime(curWriteMetrics.shuffleWriteTime)
-        m.incShuffleRecordsWritten(curWriteMetrics.shuffleRecordsWritten)
-      }
-    }
 
     lengths
   }
 
-  /**
-   * Read a partition file back as an iterator (used in our iterator method)
-   */
-  private def readPartitionFile(writer: BlockObjectWriter): Iterator[Product2[K, C]] = {
-    if (writer.isOpen) {
-      writer.commitAndClose()
-    }
-    new PairIterator[K, C](blockManager.diskStore.getValues(writer.blockId, ser).get)
-  }
-
   def stop(): Unit = {
     spills.foreach(s => s.file.delete())
     spills.clear()
-    if (partitionWriters != null) {
-      partitionWriters.foreach { w =>
-        w.revertPartialWritesAndClose()
-        diskBlockManager.getFile(w.blockId).delete()
-      }
-      partitionWriters = null
-    }
   }
 
-  def diskBytesSpilled: Long = _diskBytesSpilled
-
   /**
    * Given a stream of ((partition, key), combiner) pairs *assumed to be sorted by partition ID*,
    * group together the pairs for each partition into a sub-iterator.
@@ -826,14 +708,6 @@ private[spark] class ExternalSorter[K, V, C](
     (0 until numPartitions).iterator.map(p => (p, new IteratorForPartition(p, buffered)))
   }
 
-  private def comparator: Option[Comparator[K]] = {
-    if (ordering.isDefined || aggregator.isDefined) {
-      Some(keyComparator)
-    } else {
-      None
-    }
-  }
-
   /**
    * An iterator that reads only the elements for a given partition ID from an underlying buffered
    * stream, assuming this partition is the next one to be read. Used to make it easier to return
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PairIterator.scala b/core/src/main/scala/org/apache/spark/util/collection/PairIterator.scala
deleted file mode 100644
index d75959f480756..0000000000000
--- a/core/src/main/scala/org/apache/spark/util/collection/PairIterator.scala
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.util.collection
-
-private[spark] class PairIterator[K, V](iter: Iterator[Any]) extends Iterator[(K, V)] {
-  def hasNext: Boolean = iter.hasNext
-
-  def next(): (K, V) = (iter.next().asInstanceOf[K], iter.next().asInstanceOf[V])
-}
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedAppendOnlyMap.scala
index e2e2f1faae9d1..d0d25b43d0477 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedAppendOnlyMap.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedAppendOnlyMap.scala
@@ -34,10 +34,6 @@ private[spark] class PartitionedAppendOnlyMap[K, V]
     destructiveSortedIterator(comparator)
   }
 
-  def writablePartitionedIterator(): WritablePartitionedIterator = {
-    WritablePartitionedIterator.fromIterator(super.iterator)
-  }
-
   def insert(partition: Int, key: K, value: V): Unit = {
     update((partition, key), value)
   }
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
index e8332e1a87eac..5a6e9a9580e9b 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
@@ -71,10 +71,6 @@ private[spark] class PartitionedPairBuffer[K, V](initialCapacity: Int = 64)
     iterator
   }
 
-  override def writablePartitionedIterator(): WritablePartitionedIterator = {
-    WritablePartitionedIterator.fromIterator(iterator)
-  }
-
   private def iterator(): Iterator[((Int, K), V)] = new Iterator[((Int, K), V)] {
     var pos = 0
 
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
index 554d88206e221..862408b7a4d21 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
@@ -122,10 +122,6 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
   override def destructiveSortedWritablePartitionedIterator(keyComparator: Option[Comparator[K]])
     : WritablePartitionedIterator = {
     sort(keyComparator)
-    writablePartitionedIterator
-  }
-
-  override def writablePartitionedIterator(): WritablePartitionedIterator = {
     new WritablePartitionedIterator {
       // current position in the meta buffer in ints
       var pos = 0
diff --git a/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala b/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
index f26d1618c9200..7bc59898658e4 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
@@ -47,13 +47,20 @@ private[spark] trait WritablePartitionedPairCollection[K, V] {
    */
   def destructiveSortedWritablePartitionedIterator(keyComparator: Option[Comparator[K]])
     : WritablePartitionedIterator = {
-    WritablePartitionedIterator.fromIterator(partitionedDestructiveSortedIterator(keyComparator))
-  }
+    val it = partitionedDestructiveSortedIterator(keyComparator)
+    new WritablePartitionedIterator {
+      private[this] var cur = if (it.hasNext) it.next() else null
 
-  /**
-   * Iterate through the data and write out the elements instead of returning them.
-   */
-  def writablePartitionedIterator(): WritablePartitionedIterator
+      def writeNext(writer: BlockObjectWriter): Unit = {
+        writer.write(cur._1._2, cur._2)
+        cur = if (it.hasNext) it.next() else null
+      }
+
+      def hasNext(): Boolean = cur != null
+
+      def nextPartition(): Int = cur._1._1
+    }
+  }
 }
 
 private[spark] object WritablePartitionedPairCollection {
@@ -94,20 +101,3 @@ private[spark] trait WritablePartitionedIterator {
 
   def nextPartition(): Int
 }
-
-private[spark] object WritablePartitionedIterator {
-  def fromIterator(it: Iterator[((Int, _), _)]): WritablePartitionedIterator = {
-    new WritablePartitionedIterator {
-      var cur = if (it.hasNext) it.next() else null
-
-      def writeNext(writer: BlockObjectWriter): Unit = {
-        writer.write(cur._1._2, cur._2)
-        cur = if (it.hasNext) it.next() else null
-      }
-
-      def hasNext(): Boolean = cur != null
-
-      def nextPartition(): Int = cur._1._1
-    }
-  }
-}
diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
index 91f4ab360857e..c3c2b1ffc1efa 100644
--- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
@@ -21,6 +21,7 @@ import org.scalatest.Matchers
 
 import org.apache.spark.ShuffleSuite.NonJavaSerializableClass
 import org.apache.spark.rdd.{CoGroupedRDD, OrderedRDDFunctions, RDD, ShuffledRDD, SubtractedRDD}
+import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
 import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.storage.{ShuffleDataBlockId, ShuffleBlockId}
 import org.apache.spark.util.MutablePair
@@ -281,6 +282,39 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
     // This count should retry the execution of the previous stage and rerun shuffle.
     rdd.count()
   }
+
+  test("metrics for shuffle without aggregation") {
+    sc = new SparkContext("local", "test", conf.clone())
+    val numRecords = 10000
+
+    val metrics = ShuffleSuite.runAndReturnMetrics(sc) {
+      sc.parallelize(1 to numRecords, 4)
+        .map(key => (key, 1))
+        .groupByKey()
+        .collect()
+    }
+
+    assert(metrics.recordsRead === numRecords)
+    assert(metrics.recordsWritten === numRecords)
+    assert(metrics.bytesWritten === metrics.byresRead)
+    assert(metrics.bytesWritten > 0)
+  }
+
+  test("metrics for shuffle with aggregation") {
+    sc = new SparkContext("local", "test", conf.clone())
+    val numRecords = 10000
+
+    val metrics = ShuffleSuite.runAndReturnMetrics(sc) {
+      sc.parallelize(1 to numRecords, 4)
+        .flatMap(key => Array.fill(100)((key, 1)))
+        .countByKey()
+    }
+
+    assert(metrics.recordsRead === numRecords)
+    assert(metrics.recordsWritten === numRecords)
+    assert(metrics.bytesWritten === metrics.byresRead)
+    assert(metrics.bytesWritten > 0)
+  }
 }
 
 object ShuffleSuite {
@@ -294,4 +328,35 @@ object ShuffleSuite {
       value - o.value
     }
   }
+
+  case class AggregatedShuffleMetrics(
+    recordsWritten: Long,
+    recordsRead: Long,
+    bytesWritten: Long,
+    byresRead: Long)
+
+  def runAndReturnMetrics(sc: SparkContext)(job: => Unit): AggregatedShuffleMetrics = {
+    @volatile var recordsWritten: Long = 0
+    @volatile var recordsRead: Long = 0
+    @volatile var bytesWritten: Long = 0
+    @volatile var bytesRead: Long = 0
+    val listener = new SparkListener {
+      override def onTaskEnd(taskEnd: SparkListenerTaskEnd) {
+        taskEnd.taskMetrics.shuffleWriteMetrics.foreach { m =>
+          recordsWritten += m.shuffleRecordsWritten
+          bytesWritten += m.shuffleBytesWritten
+        }
+        taskEnd.taskMetrics.shuffleReadMetrics.foreach { m =>
+          recordsRead += m.recordsRead
+          bytesRead += m.totalBytesRead
+        }
+      }
+    }
+    sc.addSparkListener(listener)
+
+    job
+
+    sc.listenerBus.waitUntilEmpty(500)
+    AggregatedShuffleMetrics(recordsWritten, recordsRead, bytesWritten, bytesRead)
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
index 19f1af0dcd461..9e4d34fb7d382 100644
--- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
@@ -193,26 +193,6 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext
     assert(records == numRecords)
   }
 
-  test("shuffle records read metrics") {
-    val recordsRead = runAndReturnShuffleRecordsRead {
-      sc.textFile(tmpFilePath, 4)
-        .map(key => (key, 1))
-        .groupByKey()
-        .collect()
-    }
-    assert(recordsRead == numRecords)
-  }
-
-  test("shuffle records written metrics") {
-    val recordsWritten = runAndReturnShuffleRecordsWritten {
-      sc.textFile(tmpFilePath, 4)
-        .map(key => (key, 1))
-        .groupByKey()
-        .collect()
-    }
-    assert(recordsWritten == numRecords)
-  }
-
   /**
    * Tests the metrics from end to end.
    * 1) reading a hadoop file
@@ -301,14 +281,6 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext
     runAndReturnMetrics(job, _.taskMetrics.outputMetrics.map(_.recordsWritten))
   }
 
-  private def runAndReturnShuffleRecordsRead(job: => Unit): Long = {
-    runAndReturnMetrics(job, _.taskMetrics.shuffleReadMetrics.map(_.recordsRead))
-  }
-
-  private def runAndReturnShuffleRecordsWritten(job: => Unit): Long = {
-    runAndReturnMetrics(job, _.taskMetrics.shuffleWriteMetrics.map(_.shuffleRecordsWritten))
-  }
-
   private def runAndReturnMetrics(job: => Unit,
       collector: (SparkListenerTaskEnd) => Option[Long]): Long = {
     val taskMetrics = new ArrayBuffer[Long]()
diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
new file mode 100644
index 0000000000000..c8420db6126c0
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.sort
+
+import java.io.File
+import java.util.UUID
+
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import org.mockito.Answers.RETURNS_SMART_NULLS
+import org.mockito.{Mock, MockitoAnnotations}
+import org.mockito.Matchers._
+import org.mockito.Mockito._
+import org.mockito.invocation.InvocationOnMock
+import org.mockito.stubbing.Answer
+import org.scalatest.{BeforeAndAfterEach, FunSuite}
+
+import org.apache.spark._
+import org.apache.spark.executor.{TaskMetrics, ShuffleWriteMetrics}
+import org.apache.spark.serializer.{SerializerInstance, Serializer, JavaSerializer}
+import org.apache.spark.storage._
+import org.apache.spark.util.Utils
+
+class BypassMergeSortShuffleWriterSuite extends FunSuite with BeforeAndAfterEach {
+
+  @Mock(answer = RETURNS_SMART_NULLS) private var blockManager: BlockManager = _
+  @Mock(answer = RETURNS_SMART_NULLS) private var diskBlockManager: DiskBlockManager = _
+  @Mock(answer = RETURNS_SMART_NULLS) private var taskContext: TaskContext = _
+
+  private var taskMetrics: TaskMetrics = _
+  private var shuffleWriteMetrics: ShuffleWriteMetrics = _
+  private var tempDir: File = _
+  private var outputFile: File = _
+  private val conf: SparkConf = new SparkConf(loadDefaults = false)
+  private val temporaryFilesCreated: mutable.Buffer[File] = new ArrayBuffer[File]()
+  private val blockIdToFileMap: mutable.Map[BlockId, File] = new mutable.HashMap[BlockId, File]
+  private val shuffleBlockId: ShuffleBlockId = new ShuffleBlockId(0, 0, 0)
+  private val serializer: Serializer = new JavaSerializer(conf)
+
+  override def beforeEach(): Unit = {
+    tempDir = Utils.createTempDir()
+    outputFile = File.createTempFile("shuffle", null, tempDir)
+    shuffleWriteMetrics = new ShuffleWriteMetrics
+    taskMetrics = new TaskMetrics
+    taskMetrics.shuffleWriteMetrics = Some(shuffleWriteMetrics)
+    MockitoAnnotations.initMocks(this)
+    when(taskContext.taskMetrics()).thenReturn(taskMetrics)
+    when(blockManager.diskBlockManager).thenReturn(diskBlockManager)
+    when(blockManager.getDiskWriter(
+      any[BlockId],
+      any[File],
+      any[SerializerInstance],
+      anyInt(),
+      any[ShuffleWriteMetrics]
+    )).thenAnswer(new Answer[BlockObjectWriter] {
+      override def answer(invocation: InvocationOnMock): BlockObjectWriter = {
+        val args = invocation.getArguments
+        new DiskBlockObjectWriter(
+          args(0).asInstanceOf[BlockId],
+          args(1).asInstanceOf[File],
+          args(2).asInstanceOf[SerializerInstance],
+          args(3).asInstanceOf[Int],
+          compressStream = identity,
+          syncWrites = false,
+          args(4).asInstanceOf[ShuffleWriteMetrics]
+        )
+      }
+    })
+    when(diskBlockManager.createTempShuffleBlock()).thenAnswer(
+      new Answer[(TempShuffleBlockId, File)] {
+        override def answer(invocation: InvocationOnMock): (TempShuffleBlockId, File) = {
+          val blockId = new TempShuffleBlockId(UUID.randomUUID)
+          val file = File.createTempFile(blockId.toString, null, tempDir)
+          blockIdToFileMap.put(blockId, file)
+          temporaryFilesCreated.append(file)
+          (blockId, file)
+        }
+      })
+    when(diskBlockManager.getFile(any[BlockId])).thenAnswer(
+      new Answer[File] {
+        override def answer(invocation: InvocationOnMock): File = {
+          blockIdToFileMap.get(invocation.getArguments.head.asInstanceOf[BlockId]).get
+        }
+    })
+  }
+
+  override def afterEach(): Unit = {
+    Utils.deleteRecursively(tempDir)
+    blockIdToFileMap.clear()
+    temporaryFilesCreated.clear()
+  }
+
+  test("write empty iterator") {
+    val writer = new BypassMergeSortShuffleWriter[Int, Int](
+      new SparkConf(loadDefaults = false),
+      blockManager,
+      new HashPartitioner(7),
+      shuffleWriteMetrics,
+      serializer
+    )
+    writer.insertAll(Iterator.empty)
+    val partitionLengths = writer.writePartitionedFile(shuffleBlockId, taskContext, outputFile)
+    assert(partitionLengths.sum === 0)
+    assert(outputFile.exists())
+    assert(outputFile.length() === 0)
+    assert(temporaryFilesCreated.isEmpty)
+    assert(shuffleWriteMetrics.shuffleBytesWritten === 0)
+    assert(shuffleWriteMetrics.shuffleRecordsWritten === 0)
+    assert(taskMetrics.diskBytesSpilled === 0)
+    assert(taskMetrics.memoryBytesSpilled === 0)
+  }
+
+  test("write with some empty partitions") {
+    def records: Iterator[(Int, Int)] =
+      Iterator((1, 1), (5, 5)) ++ (0 until 100000).iterator.map(x => (2, 2))
+    val writer = new BypassMergeSortShuffleWriter[Int, Int](
+      new SparkConf(loadDefaults = false),
+      blockManager,
+      new HashPartitioner(7),
+      shuffleWriteMetrics,
+      serializer
+    )
+    writer.insertAll(records)
+    assert(temporaryFilesCreated.nonEmpty)
+    val partitionLengths = writer.writePartitionedFile(shuffleBlockId, taskContext, outputFile)
+    assert(partitionLengths.sum === outputFile.length())
+    assert(temporaryFilesCreated.count(_.exists()) === 0) // check that temporary files were deleted
+    assert(shuffleWriteMetrics.shuffleBytesWritten === outputFile.length())
+    assert(shuffleWriteMetrics.shuffleRecordsWritten === records.length)
+    assert(taskMetrics.diskBytesSpilled === 0)
+    assert(taskMetrics.memoryBytesSpilled === 0)
+  }
+
+  test("cleanup of intermediate files after errors") {
+    val writer = new BypassMergeSortShuffleWriter[Int, Int](
+      new SparkConf(loadDefaults = false),
+      blockManager,
+      new HashPartitioner(7),
+      shuffleWriteMetrics,
+      serializer
+    )
+    intercept[SparkException] {
+      writer.insertAll((0 until 100000).iterator.map(i => {
+        if (i == 99990) {
+          throw new SparkException("Intentional failure")
+        }
+        (i, i)
+      }))
+    }
+    assert(temporaryFilesCreated.nonEmpty)
+    writer.stop()
+    assert(temporaryFilesCreated.count(_.exists()) === 0)
+  }
+
+}
diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala
new file mode 100644
index 0000000000000..c6ada7139c198
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.sort
+
+import org.mockito.Mockito._
+import org.scalatest.FunSuite
+
+import org.apache.spark.{Aggregator, SparkConf}
+
+class SortShuffleWriterSuite extends FunSuite {
+
+  import SortShuffleWriter._
+
+  test("conditions for bypassing merge-sort") {
+    val conf = new SparkConf(loadDefaults = false)
+    val agg = mock(classOf[Aggregator[_, _, _]], RETURNS_SMART_NULLS)
+    val ord = implicitly[Ordering[Int]]
+
+    // Numbers of partitions that are above and below the default bypassMergeThreshold
+    val FEW_PARTITIONS = 50
+    val MANY_PARTITIONS = 10000
+
+    // Shuffles with no ordering or aggregator: should bypass unless # of partitions is high
+    assert(shouldBypassMergeSort(conf, FEW_PARTITIONS, None, None))
+    assert(!shouldBypassMergeSort(conf, MANY_PARTITIONS, None, None))
+
+    // Shuffles with an ordering or aggregator: should not bypass even if they have few partitions
+    assert(!shouldBypassMergeSort(conf, FEW_PARTITIONS, None, Some(ord)))
+    assert(!shouldBypassMergeSort(conf, FEW_PARTITIONS, Some(agg), None))
+  }
+}
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala
index ad43a3e5fdc88..7bdea724fea58 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala
@@ -18,14 +18,28 @@ package org.apache.spark.storage
 
 import java.io.File
 
+import org.scalatest.BeforeAndAfterEach
+
+import org.apache.spark.SparkConf
 import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.executor.ShuffleWriteMetrics
 import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.util.Utils
 
-class BlockObjectWriterSuite extends SparkFunSuite {
+class BlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach {
+
+  var tempDir: File = _
+
+  override def beforeEach(): Unit = {
+    tempDir = Utils.createTempDir()
+  }
+
+  override def afterEach(): Unit = {
+    Utils.deleteRecursively(tempDir)
+  }
+
   test("verify write metrics") {
-    val file = new File(Utils.createTempDir(), "somefile")
+    val file = new File(tempDir, "somefile")
     val writeMetrics = new ShuffleWriteMetrics()
     val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
       new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
@@ -47,7 +61,7 @@ class BlockObjectWriterSuite extends SparkFunSuite {
   }
 
   test("verify write metrics on revert") {
-    val file = new File(Utils.createTempDir(), "somefile")
+    val file = new File(tempDir, "somefile")
     val writeMetrics = new ShuffleWriteMetrics()
     val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
       new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
@@ -70,7 +84,7 @@ class BlockObjectWriterSuite extends SparkFunSuite {
   }
 
   test("Reopening a closed block writer") {
-    val file = new File(Utils.createTempDir(), "somefile")
+    val file = new File(tempDir, "somefile")
     val writeMetrics = new ShuffleWriteMetrics()
     val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
       new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
@@ -81,4 +95,79 @@ class BlockObjectWriterSuite extends SparkFunSuite {
       writer.open()
     }
   }
+
+  test("calling revertPartialWritesAndClose() on a closed block writer should have no effect") {
+    val file = new File(tempDir, "somefile")
+    val writeMetrics = new ShuffleWriteMetrics()
+    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
+      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
+    for (i <- 1 to 1000) {
+      writer.write(i, i)
+    }
+    writer.commitAndClose()
+    val bytesWritten = writeMetrics.shuffleBytesWritten
+    assert(writeMetrics.shuffleRecordsWritten === 1000)
+    writer.revertPartialWritesAndClose()
+    assert(writeMetrics.shuffleRecordsWritten === 1000)
+    assert(writeMetrics.shuffleBytesWritten === bytesWritten)
+  }
+
+  test("commitAndClose() should be idempotent") {
+    val file = new File(tempDir, "somefile")
+    val writeMetrics = new ShuffleWriteMetrics()
+    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
+      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
+    for (i <- 1 to 1000) {
+      writer.write(i, i)
+    }
+    writer.commitAndClose()
+    val bytesWritten = writeMetrics.shuffleBytesWritten
+    val writeTime = writeMetrics.shuffleWriteTime
+    assert(writeMetrics.shuffleRecordsWritten === 1000)
+    writer.commitAndClose()
+    assert(writeMetrics.shuffleRecordsWritten === 1000)
+    assert(writeMetrics.shuffleBytesWritten === bytesWritten)
+    assert(writeMetrics.shuffleWriteTime === writeTime)
+  }
+
+  test("revertPartialWritesAndClose() should be idempotent") {
+    val file = new File(tempDir, "somefile")
+    val writeMetrics = new ShuffleWriteMetrics()
+    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
+      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
+    for (i <- 1 to 1000) {
+      writer.write(i, i)
+    }
+    writer.revertPartialWritesAndClose()
+    val bytesWritten = writeMetrics.shuffleBytesWritten
+    val writeTime = writeMetrics.shuffleWriteTime
+    assert(writeMetrics.shuffleRecordsWritten === 0)
+    writer.revertPartialWritesAndClose()
+    assert(writeMetrics.shuffleRecordsWritten === 0)
+    assert(writeMetrics.shuffleBytesWritten === bytesWritten)
+    assert(writeMetrics.shuffleWriteTime === writeTime)
+  }
+
+  test("fileSegment() can only be called after commitAndClose() has been called") {
+    val file = new File(tempDir, "somefile")
+    val writeMetrics = new ShuffleWriteMetrics()
+    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
+      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
+    for (i <- 1 to 1000) {
+      writer.write(i, i)
+    }
+    intercept[IllegalStateException] {
+      writer.fileSegment()
+    }
+    writer.close()
+  }
+
+  test("commitAndClose() without ever opening or writing") {
+    val file = new File(tempDir, "somefile")
+    val writeMetrics = new ShuffleWriteMetrics()
+    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
+      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
+    writer.commitAndClose()
+    assert(writer.fileSegment().length === 0)
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
index 9039dbef1fb71..7d7b41bc23284 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
@@ -23,10 +23,12 @@ import org.scalatest.PrivateMethodTester
 
 import scala.util.Random
 
+import org.scalatest.FunSuite
+
 import org.apache.spark._
 import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
 
-class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with PrivateMethodTester {
+class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext {
   private def createSparkConf(loadDefaults: Boolean, kryo: Boolean): SparkConf = {
     val conf = new SparkConf(loadDefaults)
     if (kryo) {
@@ -37,21 +39,12 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with Priv
       conf.set("spark.serializer.objectStreamReset", "1")
       conf.set("spark.serializer", classOf[JavaSerializer].getName)
     }
+    conf.set("spark.shuffle.sort.bypassMergeThreshold", "0")
     // Ensure that we actually have multiple batches per spill file
     conf.set("spark.shuffle.spill.batchSize", "10")
     conf
   }
 
-  private def assertBypassedMergeSort(sorter: ExternalSorter[_, _, _]): Unit = {
-    val bypassMergeSort = PrivateMethod[Boolean]('bypassMergeSort)
-    assert(sorter.invokePrivate(bypassMergeSort()), "sorter did not bypass merge-sort")
-  }
-
-  private def assertDidNotBypassMergeSort(sorter: ExternalSorter[_, _, _]): Unit = {
-    val bypassMergeSort = PrivateMethod[Boolean]('bypassMergeSort)
-    assert(!sorter.invokePrivate(bypassMergeSort()), "sorter bypassed merge-sort")
-  }
-
   test("empty data stream with kryo ser") {
     emptyDataStream(createSparkConf(false, true))
   }
@@ -161,39 +154,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with Priv
 
     val sorter = new ExternalSorter[Int, Int, Int](
       None, Some(new HashPartitioner(7)), Some(ord), None)
-    assertDidNotBypassMergeSort(sorter)
-    sorter.insertAll(elements)
-    assert(sc.env.blockManager.diskBlockManager.getAllFiles().length > 0) // Make sure it spilled
-    val iter = sorter.partitionedIterator.map(p => (p._1, p._2.toList))
-    assert(iter.next() === (0, Nil))
-    assert(iter.next() === (1, List((1, 1))))
-    assert(iter.next() === (2, (0 until 100000).map(x => (2, 2)).toList))
-    assert(iter.next() === (3, Nil))
-    assert(iter.next() === (4, Nil))
-    assert(iter.next() === (5, List((5, 5))))
-    assert(iter.next() === (6, Nil))
-    sorter.stop()
-  }
-
-  test("empty partitions with spilling, bypass merge-sort with kryo ser") {
-    emptyPartitionerWithSpillingBypassMergeSort(createSparkConf(false, true))
-  }
-
-  test("empty partitions with spilling, bypass merge-sort with java ser") {
-    emptyPartitionerWithSpillingBypassMergeSort(createSparkConf(false, false))
-  }
-
-  def emptyPartitionerWithSpillingBypassMergeSort(conf: SparkConf) {
-    conf.set("spark.shuffle.memoryFraction", "0.001")
-    conf.set("spark.shuffle.spill.initialMemoryThreshold", "512")
-    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager")
-    sc = new SparkContext("local", "test", conf)
-
-    val elements = Iterator((1, 1), (5, 5)) ++ (0 until 100000).iterator.map(x => (2, 2))
-
-    val sorter = new ExternalSorter[Int, Int, Int](
-      None, Some(new HashPartitioner(7)), None, None)
-    assertBypassedMergeSort(sorter)
     sorter.insertAll(elements)
     assert(sc.env.blockManager.diskBlockManager.getAllFiles().length > 0) // Make sure it spilled
     val iter = sorter.partitionedIterator.map(p => (p._1, p._2.toList))
@@ -376,7 +336,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with Priv
 
     val sorter = new ExternalSorter[Int, Int, Int](
       None, Some(new HashPartitioner(3)), Some(ord), None)
-    assertDidNotBypassMergeSort(sorter)
     sorter.insertAll((0 until 120000).iterator.map(i => (i, i)))
     assert(diskBlockManager.getAllFiles().length > 0)
     sorter.stop()
@@ -384,7 +343,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with Priv
 
     val sorter2 = new ExternalSorter[Int, Int, Int](
       None, Some(new HashPartitioner(3)), Some(ord), None)
-    assertDidNotBypassMergeSort(sorter2)
     sorter2.insertAll((0 until 120000).iterator.map(i => (i, i)))
     assert(diskBlockManager.getAllFiles().length > 0)
     assert(sorter2.iterator.toSet === (0 until 120000).map(i => (i, i)).toSet)
@@ -392,29 +350,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with Priv
     assert(diskBlockManager.getAllBlocks().length === 0)
   }
 
-  test("cleanup of intermediate files in sorter, bypass merge-sort") {
-    val conf = createSparkConf(true, false)  // Load defaults, otherwise SPARK_HOME is not found
-    conf.set("spark.shuffle.memoryFraction", "0.001")
-    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager")
-    sc = new SparkContext("local", "test", conf)
-    val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager
-
-    val sorter = new ExternalSorter[Int, Int, Int](None, Some(new HashPartitioner(3)), None, None)
-    assertBypassedMergeSort(sorter)
-    sorter.insertAll((0 until 100000).iterator.map(i => (i, i)))
-    assert(diskBlockManager.getAllFiles().length > 0)
-    sorter.stop()
-    assert(diskBlockManager.getAllBlocks().length === 0)
-
-    val sorter2 = new ExternalSorter[Int, Int, Int](None, Some(new HashPartitioner(3)), None, None)
-    assertBypassedMergeSort(sorter2)
-    sorter2.insertAll((0 until 100000).iterator.map(i => (i, i)))
-    assert(diskBlockManager.getAllFiles().length > 0)
-    assert(sorter2.iterator.toSet === (0 until 100000).map(i => (i, i)).toSet)
-    sorter2.stop()
-    assert(diskBlockManager.getAllBlocks().length === 0)
-  }
-
   test("cleanup of intermediate files in sorter if there are errors") {
     val conf = createSparkConf(true, false)  // Load defaults, otherwise SPARK_HOME is not found
     conf.set("spark.shuffle.memoryFraction", "0.001")
@@ -426,7 +361,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with Priv
 
     val sorter = new ExternalSorter[Int, Int, Int](
       None, Some(new HashPartitioner(3)), Some(ord), None)
-    assertDidNotBypassMergeSort(sorter)
     intercept[SparkException] {
       sorter.insertAll((0 until 120000).iterator.map(i => {
         if (i == 119990) {
@@ -440,28 +374,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with Priv
     assert(diskBlockManager.getAllBlocks().length === 0)
   }
 
-  test("cleanup of intermediate files in sorter if there are errors, bypass merge-sort") {
-    val conf = createSparkConf(true, false)  // Load defaults, otherwise SPARK_HOME is not found
-    conf.set("spark.shuffle.memoryFraction", "0.001")
-    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager")
-    sc = new SparkContext("local", "test", conf)
-    val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager
-
-    val sorter = new ExternalSorter[Int, Int, Int](None, Some(new HashPartitioner(3)), None, None)
-    assertBypassedMergeSort(sorter)
-    intercept[SparkException] {
-      sorter.insertAll((0 until 100000).iterator.map(i => {
-        if (i == 99990) {
-          throw new SparkException("Intentional failure")
-        }
-        (i, i)
-      }))
-    }
-    assert(diskBlockManager.getAllFiles().length > 0)
-    sorter.stop()
-    assert(diskBlockManager.getAllBlocks().length === 0)
-  }
-
   test("cleanup of intermediate files in shuffle") {
     val conf = createSparkConf(false, false)
     conf.set("spark.shuffle.memoryFraction", "0.001")
@@ -776,40 +688,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext with Priv
     }
   }
 
-  test("conditions for bypassing merge-sort") {
-    val conf = createSparkConf(false, false)
-    conf.set("spark.shuffle.memoryFraction", "0.001")
-    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager")
-    sc = new SparkContext("local", "test", conf)
-
-    val agg = new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j)
-    val ord = implicitly[Ordering[Int]]
-
-    // Numbers of partitions that are above and below the default bypassMergeThreshold
-    val FEW_PARTITIONS = 50
-    val MANY_PARTITIONS = 10000
-
-    // Sorters with no ordering or aggregator: should bypass unless # of partitions is high
-
-    val sorter1 = new ExternalSorter[Int, Int, Int](
-      None, Some(new HashPartitioner(FEW_PARTITIONS)), None, None)
-    assertBypassedMergeSort(sorter1)
-
-    val sorter2 = new ExternalSorter[Int, Int, Int](
-      None, Some(new HashPartitioner(MANY_PARTITIONS)), None, None)
-    assertDidNotBypassMergeSort(sorter2)
-
-    // Sorters with an ordering or aggregator: should not bypass even if they have few partitions
-
-    val sorter3 = new ExternalSorter[Int, Int, Int](
-      None, Some(new HashPartitioner(FEW_PARTITIONS)), Some(ord), None)
-    assertDidNotBypassMergeSort(sorter3)
-
-    val sorter4 = new ExternalSorter[Int, Int, Int](
-      Some(agg), Some(new HashPartitioner(FEW_PARTITIONS)), None, None)
-    assertDidNotBypassMergeSort(sorter4)
-  }
-
   test("sort without breaking sorting contracts with kryo ser") {
     sortWithoutBreakingSortingContracts(createSparkConf(true, true))
   }

From 1617363fbb9b22a2eb09e7bab98c8d05f9508761 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Sat, 30 May 2015 16:24:07 -0700
Subject: [PATCH 270/525] [SPARK-7918] [MLLIB] MLlib Python doc parity check
 for evaluation and feature

Check then make the MLlib Python evaluation and feature doc to be as complete as the Scala doc.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6461 from yanboliang/spark-7918 and squashes the following commits:

940e3f1 [Yanbo Liang] truncate too long line and remove extra sparse
a80ae58 [Yanbo Liang] MLlib Python doc parity check for evaluation and feature
---
 python/pyspark/mllib/evaluation.py | 26 ++++++++--------
 python/pyspark/mllib/feature.py    | 49 ++++++++++++++----------------
 2 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index aab5e5f4b77b5..c5cf3a4e7ff22 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -27,6 +27,8 @@ class BinaryClassificationMetrics(JavaModelWrapper):
     """
     Evaluator for binary classification.
 
+    :param scoreAndLabels: an RDD of (score, label) pairs
+
     >>> scoreAndLabels = sc.parallelize([
     ...     (0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)], 2)
     >>> metrics = BinaryClassificationMetrics(scoreAndLabels)
@@ -38,9 +40,6 @@ class BinaryClassificationMetrics(JavaModelWrapper):
     """
 
     def __init__(self, scoreAndLabels):
-        """
-        :param scoreAndLabels: an RDD of (score, label) pairs
-        """
         sc = scoreAndLabels.ctx
         sql_ctx = SQLContext(sc)
         df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([
@@ -76,6 +75,9 @@ class RegressionMetrics(JavaModelWrapper):
     """
     Evaluator for regression.
 
+    :param predictionAndObservations: an RDD of (prediction,
+                                      observation) pairs.
+
     >>> predictionAndObservations = sc.parallelize([
     ...     (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)])
     >>> metrics = RegressionMetrics(predictionAndObservations)
@@ -92,9 +94,6 @@ class RegressionMetrics(JavaModelWrapper):
     """
 
     def __init__(self, predictionAndObservations):
-        """
-        :param predictionAndObservations: an RDD of (prediction, observation) pairs.
-        """
         sc = predictionAndObservations.ctx
         sql_ctx = SQLContext(sc)
         df = sql_ctx.createDataFrame(predictionAndObservations, schema=StructType([
@@ -148,6 +147,8 @@ class MulticlassMetrics(JavaModelWrapper):
     """
     Evaluator for multiclass classification.
 
+    :param predictionAndLabels an RDD of (prediction, label) pairs.
+
     >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
     ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)])
     >>> metrics = MulticlassMetrics(predictionAndLabels)
@@ -176,9 +177,6 @@ class MulticlassMetrics(JavaModelWrapper):
     """
 
     def __init__(self, predictionAndLabels):
-        """
-        :param predictionAndLabels an RDD of (prediction, label) pairs.
-        """
         sc = predictionAndLabels.ctx
         sql_ctx = SQLContext(sc)
         df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
@@ -277,6 +275,9 @@ class RankingMetrics(JavaModelWrapper):
     """
     Evaluator for ranking algorithms.
 
+    :param predictionAndLabels: an RDD of (predicted ranking,
+                                ground truth set) pairs.
+
     >>> predictionAndLabels = sc.parallelize([
     ...     ([1, 6, 2, 7, 8, 3, 9, 10, 4, 5], [1, 2, 3, 4, 5]),
     ...     ([4, 1, 5, 6, 2, 7, 3, 8, 9, 10], [1, 2, 3]),
@@ -298,9 +299,6 @@ class RankingMetrics(JavaModelWrapper):
     """
 
     def __init__(self, predictionAndLabels):
-        """
-        :param predictionAndLabels: an RDD of (predicted ranking, ground truth set) pairs.
-        """
         sc = predictionAndLabels.ctx
         sql_ctx = SQLContext(sc)
         df = sql_ctx.createDataFrame(predictionAndLabels,
@@ -347,6 +345,10 @@ class MultilabelMetrics(JavaModelWrapper):
     """
     Evaluator for multilabel classification.
 
+    :param predictionAndLabels: an RDD of (predictions, labels) pairs,
+                                both are non-null Arrays, each with
+                                unique elements.
+
     >>> predictionAndLabels = sc.parallelize([([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]),
     ...     ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]),
     ...     ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])])
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index aac305db6c19a..da90554f41437 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -68,6 +68,8 @@ class Normalizer(VectorTransformer):
     For `p` = float('inf'), max(abs(vector)) will be used as norm for
     normalization.
 
+    :param p: Normalization in L^p^ space, p = 2 by default.
+
     >>> v = Vectors.dense(range(3))
     >>> nor = Normalizer(1)
     >>> nor.transform(v)
@@ -82,9 +84,6 @@ class Normalizer(VectorTransformer):
     DenseVector([0.0, 0.5, 1.0])
     """
     def __init__(self, p=2.0):
-        """
-        :param p: Normalization in L^p^ space, p = 2 by default.
-        """
         assert p >= 1.0, "p should be greater than 1.0"
         self.p = float(p)
 
@@ -94,7 +93,7 @@ def transform(self, vector):
 
         :param vector: vector or RDD of vector to be normalized.
         :return: normalized vector. If the norm of the input is zero, it
-                will return the input vector.
+                 will return the input vector.
         """
         sc = SparkContext._active_spark_context
         assert sc is not None, "SparkContext should be initialized first"
@@ -164,6 +163,13 @@ class StandardScaler(object):
     variance using column summary statistics on the samples in the
     training set.
 
+    :param withMean: False by default. Centers the data with mean
+                     before scaling. It will build a dense output, so this
+                     does not work on sparse input and will raise an
+                     exception.
+    :param withStd: True by default. Scales the data to unit
+                    standard deviation.
+
     >>> vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
     >>> dataset = sc.parallelize(vs)
     >>> standardizer = StandardScaler(True, True)
@@ -174,14 +180,6 @@ class StandardScaler(object):
     DenseVector([0.7071, -0.7071, 0.7071])
     """
     def __init__(self, withMean=False, withStd=True):
-        """
-        :param withMean: False by default. Centers the data with mean
-                 before scaling. It will build a dense output, so this
-                 does not work on sparse input and will raise an
-                 exception.
-        :param withStd: True by default. Scales the data to unit
-                 standard deviation.
-        """
         if not (withMean or withStd):
             warnings.warn("Both withMean and withStd are false. The model does nothing.")
         self.withMean = withMean
@@ -193,7 +191,7 @@ def fit(self, dataset):
         for later scaling.
 
         :param data: The data used to compute the mean and variance
-                 to build the transformation model.
+                     to build the transformation model.
         :return: a StandardScalarModel
         """
         dataset = dataset.map(_convert_to_vector)
@@ -223,6 +221,8 @@ class ChiSqSelector(object):
 
     Creates a ChiSquared feature selector.
 
+    :param numTopFeatures: number of features that selector will select.
+
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
     ...     LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})),
@@ -236,9 +236,6 @@ class ChiSqSelector(object):
     DenseVector([5.0])
     """
     def __init__(self, numTopFeatures):
-        """
-        :param numTopFeatures: number of features that selector will select.
-        """
         self.numTopFeatures = int(numTopFeatures)
 
     def fit(self, data):
@@ -246,9 +243,9 @@ def fit(self, data):
         Returns a ChiSquared feature selector.
 
         :param data: an `RDD[LabeledPoint]` containing the labeled dataset
-                 with categorical features. Real-valued features will be
-                 treated as categorical for each distinct value.
-                 Apply feature discretizer before using this function.
+                     with categorical features. Real-valued features will be
+                     treated as categorical for each distinct value.
+                     Apply feature discretizer before using this function.
         """
         jmodel = callMLlibFunc("fitChiSqSelector", self.numTopFeatures, data)
         return ChiSqSelectorModel(jmodel)
@@ -263,15 +260,14 @@ class HashingTF(object):
 
     Note: the terms must be hashable (can not be dict/set/list...).
 
+    :param numFeatures: number of features (default: 2^20)
+
     >>> htf = HashingTF(100)
     >>> doc = "a a b b c d".split(" ")
     >>> htf.transform(doc)
     SparseVector(100, {...})
     """
     def __init__(self, numFeatures=1 << 20):
-        """
-        :param numFeatures: number of features (default: 2^20)
-        """
         self.numFeatures = numFeatures
 
     def indexOf(self, term):
@@ -311,7 +307,7 @@ def transform(self, x):
               Call transform directly on the RDD instead.
 
         :param x: an RDD of term frequency vectors or a term frequency
-                 vector
+                  vector
         :return: an RDD of TF-IDF vectors or a TF-IDF vector
         """
         if isinstance(x, RDD):
@@ -342,6 +338,9 @@ class IDF(object):
     `minDocFreq`). For terms that are not in at least `minDocFreq`
     documents, the IDF is found as 0, resulting in TF-IDFs of 0.
 
+    :param minDocFreq: minimum of documents in which a term
+                       should appear for filtering
+
     >>> n = 4
     >>> freqs = [Vectors.sparse(n, (1, 3), (1.0, 2.0)),
     ...          Vectors.dense([0.0, 1.0, 2.0, 3.0]),
@@ -362,10 +361,6 @@ class IDF(object):
     SparseVector(4, {1: 0.0, 3: 0.5754})
     """
     def __init__(self, minDocFreq=0):
-        """
-        :param minDocFreq: minimum of documents in which a term
-                           should appear for filtering
-        """
         self.minDocFreq = minDocFreq
 
     def fit(self, dataset):

From 1281a3518802bfa624618236e6b9b59bc0e78585 Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Sat, 30 May 2015 16:50:59 -0700
Subject: [PATCH 271/525] [SPARK-7920] [MLLIB] Make MLlib ChiSqSelector
 Serializable (& Fix Related Documentation Example).

The MLlib ChiSqSelector class is not serializable, and so the example in the ChiSqSelector documentation fails. Also, that example is missing the import of ChiSqSelector.

This PR makes ChiSqSelector extend Serializable in MLlib, and adds the ChiSqSelector import statement to the associated example in the documentation.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6462 from dusenberrymw/Make_ChiSqSelector_Serializable_and_Fix_Related_Docs_Example and squashes the following commits:

9cb2f94 [Mike Dusenberry] Make MLlib ChiSqSelector Serializable.
d9003bf [Mike Dusenberry] Add missing import in MLlib ChiSqSelector Docs Scala example.
---
 docs/mllib-feature-extraction.md                                | 1 +
 .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 764985d436ead..1f6ad8b13d730 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -410,6 +410,7 @@ import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.mllib.feature.ChiSqSelector
 
 // Load some data in libsvm format
 val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 9cc2d0ffcab7d..5f8c1dea237b4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -108,7 +108,7 @@ class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransf
  *                       (ordered by statistic value descending)
  */
 @Experimental
-class ChiSqSelector (val numTopFeatures: Int) {
+class ChiSqSelector (val numTopFeatures: Int) extends Serializable {
 
   /**
    * Returns a ChiSquared feature selector.

From 66a53a69643e0004742667e140bad2aa8dae44e4 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sat, 30 May 2015 16:52:34 -0700
Subject: [PATCH 272/525] [HOTFIX] Replace FunSuite with SparkFunSuite.

This fixes a build break introduced by merging a6430028ecd7a6130f1eb15af9ec00e242c46725,
which fails the new style checks that ensure that we use SparkFunSuite instead
of FunSuite.
---
 .../shuffle/sort/BypassMergeSortShuffleWriterSuite.scala     | 4 ++--
 .../apache/spark/shuffle/sort/SortShuffleWriterSuite.scala   | 5 ++---
 .../apache/spark/util/collection/ExternalSorterSuite.scala   | 4 ----
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
index c8420db6126c0..542f8f45125a4 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
@@ -29,7 +29,7 @@ import org.mockito.Matchers._
 import org.mockito.Mockito._
 import org.mockito.invocation.InvocationOnMock
 import org.mockito.stubbing.Answer
-import org.scalatest.{BeforeAndAfterEach, FunSuite}
+import org.scalatest.BeforeAndAfterEach
 
 import org.apache.spark._
 import org.apache.spark.executor.{TaskMetrics, ShuffleWriteMetrics}
@@ -37,7 +37,7 @@ import org.apache.spark.serializer.{SerializerInstance, Serializer, JavaSerializ
 import org.apache.spark.storage._
 import org.apache.spark.util.Utils
 
-class BypassMergeSortShuffleWriterSuite extends FunSuite with BeforeAndAfterEach {
+class BypassMergeSortShuffleWriterSuite extends SparkFunSuite with BeforeAndAfterEach {
 
   @Mock(answer = RETURNS_SMART_NULLS) private var blockManager: BlockManager = _
   @Mock(answer = RETURNS_SMART_NULLS) private var diskBlockManager: DiskBlockManager = _
diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala
index c6ada7139c198..34b4984f12c09 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala
@@ -18,11 +18,10 @@
 package org.apache.spark.shuffle.sort
 
 import org.mockito.Mockito._
-import org.scalatest.FunSuite
 
-import org.apache.spark.{Aggregator, SparkConf}
+import org.apache.spark.{Aggregator, SparkConf, SparkFunSuite}
 
-class SortShuffleWriterSuite extends FunSuite {
+class SortShuffleWriterSuite extends SparkFunSuite {
 
   import SortShuffleWriter._
 
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
index 7d7b41bc23284..9cefa612f5491 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
@@ -19,12 +19,8 @@ package org.apache.spark.util.collection
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.PrivateMethodTester
-
 import scala.util.Random
 
-import org.scalatest.FunSuite
-
 import org.apache.spark._
 import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
 

From 2b258e1c0784c8ca958bf94cd9e75fa17f104448 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Sat, 30 May 2015 17:21:41 -0700
Subject: [PATCH 273/525] [SPARK-5610] [DOC] update genjavadocSettings to use
 the patched version of genjavadoc

This PR updates `genjavadocSettings` to use a patched version of `genjavadoc-plugin` that hides package private classes/methods/interfaces in the generated Java API doc. The patch can be found at: https://github.com/typesafehub/genjavadoc/compare/master...mengxr:spark-1.4.

It wasn't merged into the main repo because there exist corner cases where a package private Scala class has to be a Java public class in order to compile. This doesn't seem to apply to the Spark codebase. So we release a patched version under `org.spark-project` and use it in the Spark build. brkyvz is publishing the artifacts to Maven Central.

Need more people audit the generated APIs and make sure we don't have false negatives.

Current listed classes under `org.apache.spark.rdd`:
![screen shot 2015-05-29 at 12 48 52 pm](https://cloud.githubusercontent.com/assets/829644/7891396/28fb9daa-0601-11e5-8ed8-4e9522d25a71.png)

After this PR:
![screen shot 2015-05-29 at 12 48 23 pm](https://cloud.githubusercontent.com/assets/829644/7891408/408e210e-0601-11e5-975c-ff0a02eb5c91.png)

cc: pwendell rxin srowen

Author: Xiangrui Meng <meng@databricks.com>

Closes #6506 from mengxr/SPARK-5610 and squashes the following commits:

489c785 [Xiangrui Meng] update genjavadocSettings to use the patched version of genjavadoc
---
 project/SparkBuild.scala | 10 +++++++---
 project/plugins.sbt      |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index b9515a12bc573..9a849639233bc 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -23,7 +23,6 @@ import scala.collection.JavaConversions._
 import sbt._
 import sbt.Classpaths.publishTask
 import sbt.Keys._
-import sbtunidoc.Plugin.genjavadocSettings
 import sbtunidoc.Plugin.UnidocKeys.unidocGenjavadocVersion
 import com.typesafe.sbt.pom.{loadEffectivePom, PomBuild, SbtPomKeys}
 import net.virtualvoid.sbt.graph.Plugin.graphSettings
@@ -118,7 +117,12 @@ object SparkBuild extends PomBuild {
   lazy val MavenCompile = config("m2r") extend(Compile)
   lazy val publishLocalBoth = TaskKey[Unit]("publish-local", "publish local for m2 and ivy")
 
-  lazy val sharedSettings = graphSettings ++ genjavadocSettings ++ Seq (
+  lazy val sparkGenjavadocSettings: Seq[sbt.Def.Setting[_]] = Seq(
+    libraryDependencies += compilerPlugin(
+      "org.spark-project" %% "genjavadoc-plugin" % unidocGenjavadocVersion.value cross CrossVersion.full),
+    scalacOptions <+= target.map(t => "-P:genjavadoc:out=" + (t / "java")))
+
+  lazy val sharedSettings = graphSettings ++ sparkGenjavadocSettings ++ Seq (
     javaHome := sys.env.get("JAVA_HOME")
       .orElse(sys.props.get("java.home").map { p => new File(p).getParentFile().getAbsolutePath() })
       .map(file),
@@ -126,7 +130,7 @@ object SparkBuild extends PomBuild {
     retrieveManaged := true,
     retrievePattern := "[type]s/[artifact](-[revision])(-[classifier]).[ext]",
     publishMavenStyle := true,
-    unidocGenjavadocVersion := "0.8",
+    unidocGenjavadocVersion := "0.9-spark0",
 
     resolvers += Resolver.mavenLocal,
     otherResolvers <<= SbtPomKeys.mvnLocalRepository(dotM2 => Seq(Resolver.file("dotM2", dotM2))),
diff --git a/project/plugins.sbt b/project/plugins.sbt
index 7096b0d3ee7de..75bd604a1b857 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -25,7 +25,7 @@ addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.6")
 
 addSbtPlugin("com.alpinenow" % "junit_xml_listener" % "0.5.1")
 
-addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.3.1")
+addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.3.3")
 
 addSbtPlugin("com.cavorite" % "sbt-avro" % "0.3.2")
 

From 14b314dc2cad7bbf23976347217c676d338e0a2d Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 30 May 2015 19:50:52 -0700
Subject: [PATCH 274/525] [SQL] Tighten up visibility for JavaDoc.

I went through all the JavaDocs and tightened up visibility.

Author: Reynold Xin <rxin@databricks.com>

Closes #6526 from rxin/sql-1.4-visibility-for-docs and squashes the following commits:

bc37d1e [Reynold Xin] Tighten up visibility for JavaDoc.
---
 .../org/apache/spark/sql/types/Decimal.scala    |  6 +++---
 .../apache/spark/sql/types/DecimalType.scala    |  4 ++--
 .../spark/sql/types/SQLUserDefinedType.java     |  4 ++--
 .../org/apache/spark/sql/GroupedData.scala      |  8 ++++----
 .../apache/spark/sql/expressions/Window.scala   | 17 +++++++++++++++++
 .../apache/spark/sql/sources/interfaces.scala   |  3 ++-
 .../org/apache/spark/sql/hive/HiveQl.scala      |  2 +-
 .../spark/sql/hive/client/ReflectionMagic.scala |  2 +-
 8 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index 994c5202c15dc..eb3c58c37f308 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -313,7 +313,7 @@ object Decimal {
   // See scala.math's Numeric.scala for examples for Scala's built-in types.
 
   /** Common methods for Decimal evidence parameters */
-  trait DecimalIsConflicted extends Numeric[Decimal] {
+  private[sql] trait DecimalIsConflicted extends Numeric[Decimal] {
     override def plus(x: Decimal, y: Decimal): Decimal = x + y
     override def times(x: Decimal, y: Decimal): Decimal = x * y
     override def minus(x: Decimal, y: Decimal): Decimal = x - y
@@ -327,12 +327,12 @@ object Decimal {
   }
 
   /** A [[scala.math.Fractional]] evidence parameter for Decimals. */
-  object DecimalIsFractional extends DecimalIsConflicted with Fractional[Decimal] {
+  private[sql] object DecimalIsFractional extends DecimalIsConflicted with Fractional[Decimal] {
     override def div(x: Decimal, y: Decimal): Decimal = x / y
   }
 
   /** A [[scala.math.Integral]] evidence parameter for Decimals. */
-  object DecimalAsIfIntegral extends DecimalIsConflicted with Integral[Decimal] {
+  private[sql] object DecimalAsIfIntegral extends DecimalIsConflicted with Integral[Decimal] {
     override def quot(x: Decimal, y: Decimal): Decimal = x / y
     override def rem(x: Decimal, y: Decimal): Decimal = x % y
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
index 0f8cecd28f7df..407dc27326c2e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
@@ -82,12 +82,12 @@ case class DecimalType(precisionInfo: Option[PrecisionInfo]) extends FractionalT
 object DecimalType {
   val Unlimited: DecimalType = DecimalType(None)
 
-  object Fixed {
+  private[sql] object Fixed {
     def unapply(t: DecimalType): Option[(Int, Int)] =
       t.precisionInfo.map(p => (p.precision, p.scale))
   }
 
-  object Expression {
+  private[sql] object Expression {
     def unapply(e: Expression): Option[(Int, Int)] = e.dataType match {
       case t: DecimalType => t.precisionInfo.map(p => (p.precision, p.scale))
       case _ => None
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/SQLUserDefinedType.java b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/SQLUserDefinedType.java
index a64d2bb7cde37..df64a878b6b36 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/SQLUserDefinedType.java
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/SQLUserDefinedType.java
@@ -24,11 +24,11 @@
 /**
  * ::DeveloperApi::
  * A user-defined type which can be automatically recognized by a SQLContext and registered.
- *
+ * <p>
  * WARNING: This annotation will only work if both Java and Scala reflection return the same class
  *          names (after erasure) for the UDT.  This will NOT be the case when, e.g., the UDT class
  *          is enclosed in an object (a singleton).
- *
+ * <p>
  * WARNING: UDTs are currently only supported from Scala.
  */
 // TODO: Should I used @Documented ?
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index 516ba2ac23371..c4ceb0c173887 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -40,22 +40,22 @@ private[sql] object GroupedData {
   /**
    * The Grouping Type
    */
-  trait GroupType
+  private[sql] trait GroupType
 
   /**
    * To indicate it's the GroupBy
    */
-  object GroupByType extends GroupType
+  private[sql] object GroupByType extends GroupType
 
   /**
    * To indicate it's the CUBE
    */
-  object CubeType extends GroupType
+  private[sql] object CubeType extends GroupType
 
   /**
    * To indicate it's the ROLLUP
    */
-  object RollupType extends GroupType
+  private[sql] object RollupType extends GroupType
 }
 
 /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala
index d4003b2d9cbf6..e9b60841fc28c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala
@@ -79,3 +79,20 @@ object Window {
   }
 
 }
+
+/**
+ * :: Experimental ::
+ * Utility functions for defining window in DataFrames.
+ *
+ * {{{
+ *   // PARTITION BY country ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+ *   Window.partitionBy("country").orderBy("date").rowsBetween(Long.MinValue, 0)
+ *
+ *   // PARTITION BY country ORDER BY date ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING
+ *   Window.partitionBy("country").orderBy("date").rowsBetween(-3, 3)
+ * }}}
+ *
+ * @since 1.4.0
+ */
+@Experimental
+class Window private()  // So we can see Window in JavaDoc.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index c06026e042d9f..b1b997c030a60 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -93,7 +93,7 @@ trait SchemaRelationProvider {
 }
 
 /**
- * ::DeveloperApi::
+ * ::Experimental::
  * Implemented by objects that produce relations for a specific kind of data source
  * with a given schema and partitioned columns.  When Spark SQL is given a DDL operation with a
  * USING clause specified (to specify the implemented [[HadoopFsRelationProvider]]), a user defined
@@ -115,6 +115,7 @@ trait SchemaRelationProvider {
  *
  * @since 1.4.0
  */
+@Experimental
 trait HadoopFsRelationProvider {
   /**
    * Returns a new base relation with the given parameters, a user defined schema, and a list of
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 3915ee835685f..253bf1125262e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -57,7 +57,7 @@ private[hive] case object NativePlaceholder extends LogicalPlan {
   override def output: Seq[Attribute] = Seq.empty
 }
 
-case class CreateTableAsSelect(
+private[hive] case class CreateTableAsSelect(
     tableDesc: HiveTable,
     child: LogicalPlan,
     allowExisting: Boolean) extends UnaryNode with Command {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala
index c600b158c5460..4d053ae42c2ea 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala
@@ -30,7 +30,7 @@ private[client] object ReflectionException {
 /**
  * Provides implicit functions on any object for calling methods reflectively.
  */
-protected trait ReflectionMagic {
+private[client] trait ReflectionMagic {
     /** code for InstanceMagic
         println(
     (1 to 22).map { n =>

From c63e1a742b3e87e79a4466e9bd0b927a24645756 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 30 May 2015 19:51:53 -0700
Subject: [PATCH 275/525] [SPARK-7971] Add JavaDoc style deprecation for
 deprecated DataFrame methods

Scala deprecated annotation actually doesn't show up in JavaDoc.

Author: Reynold Xin <rxin@databricks.com>

Closes #6523 from rxin/df-deprecated-javadoc and squashes the following commits:

26da2b2 [Reynold Xin] [SPARK-7971] Add JavaDoc style deprecation for deprecated DataFrame methods.
---
 .../org/apache/spark/sql/types/DataType.scala |  3 ++
 .../org/apache/spark/sql/DataFrame.scala      | 46 ++++++++++++++-----
 .../org/apache/spark/sql/SQLContext.scala     | 33 +++++++++++++
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 54604808e133e..1ba3a2686639f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -165,6 +165,9 @@ object DataType {
 
   def fromJson(json: String): DataType = parseDataType(parse(json))
 
+  /**
+   * @deprecated As of 1.2.0, replaced by `DataType.fromJson()`
+   */
   @deprecated("Use DataType.fromJson instead", "1.2.0")
   def fromCaseClassString(string: String): DataType = CaseClassStringParser(string)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index e90109446b642..034d887901975 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -57,14 +57,11 @@ private[sql] object DataFrame {
  * :: Experimental ::
  * A distributed collection of data organized into named columns.
  *
- * A [[DataFrame]] is equivalent to a relational table in Spark SQL. There are multiple ways
- * to create a [[DataFrame]]:
+ * A [[DataFrame]] is equivalent to a relational table in Spark SQL. The following example creates
+ * a [[DataFrame]] by pointing Spark SQL to a Parquet data set.
  * {{{
- *   // Create a DataFrame from Parquet files
- *   val people = sqlContext.parquetFile("...")
- *
- *   // Create a DataFrame from data sources
- *   val df = sqlContext.load("...", "json")
+ *   val people = sqlContext.read.parquet("...")  // in Scala
+ *   DataFrame people = sqlContext.read().parquet("...")  // in Java
  * }}}
  *
  * Once created, it can be manipulated using the various domain-specific-language (DSL) functions
@@ -86,8 +83,8 @@ private[sql] object DataFrame {
  * A more concrete example in Scala:
  * {{{
  *   // To create DataFrame using SQLContext
- *   val people = sqlContext.parquetFile("...")
- *   val department = sqlContext.parquetFile("...")
+ *   val people = sqlContext.read.parquet("...")
+ *   val department = sqlContext.read.parquet("...")
  *
  *   people.filter("age > 30")
  *     .join(department, people("deptId") === department("id"))
@@ -98,8 +95,8 @@ private[sql] object DataFrame {
  * and in Java:
  * {{{
  *   // To create DataFrame using SQLContext
- *   DataFrame people = sqlContext.parquetFile("...");
- *   DataFrame department = sqlContext.parquetFile("...");
+ *   DataFrame people = sqlContext.read().parquet("...");
+ *   DataFrame department = sqlContext.read().parquet("...");
  *
  *   people.filter("age".gt(30))
  *     .join(department, people.col("deptId").equalTo(department("id")))
@@ -1444,7 +1441,9 @@ class DataFrame private[sql](
   ////////////////////////////////////////////////////////////////////////////
   ////////////////////////////////////////////////////////////////////////////
 
-  /** Left here for backward compatibility. */
+  /**
+   * @deprecated As of 1.3.0, replaced by `toDF()`.
+   */
   @deprecated("use toDF", "1.3.0")
   def toSchemaRDD: DataFrame = this
 
@@ -1455,6 +1454,7 @@ class DataFrame private[sql](
    * given name; if you pass `false`, it will throw if the table already
    * exists.
    * @group output
+   * @deprecated As of 1.340, replaced by `write().jdbc()`.
    */
   @deprecated("Use write.jdbc()", "1.4.0")
   def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit = {
@@ -1473,6 +1473,7 @@ class DataFrame private[sql](
    * the RDD in order via the simple statement
    * `INSERT INTO table VALUES (?, ?, ..., ?)` should not fail.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().jdbc()`.
    */
   @deprecated("Use write.jdbc()", "1.4.0")
   def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = {
@@ -1485,6 +1486,7 @@ class DataFrame private[sql](
    * Files that are written out using this method can be read back in as a [[DataFrame]]
    * using the `parquetFile` function in [[SQLContext]].
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().parquet()`.
    */
   @deprecated("Use write.parquet(path)", "1.4.0")
   def saveAsParquetFile(path: String): Unit = {
@@ -1508,6 +1510,7 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().saveAsTable(tableName)`.
    */
   @deprecated("Use write.saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String): Unit = {
@@ -1526,6 +1529,7 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().mode(mode).saveAsTable(tableName)`.
    */
   @deprecated("Use write.mode(mode).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, mode: SaveMode): Unit = {
@@ -1545,6 +1549,7 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().format(source).saveAsTable(tableName)`.
    */
   @deprecated("Use write.format(source).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, source: String): Unit = {
@@ -1564,6 +1569,7 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().mode(mode).saveAsTable(tableName)`.
    */
   @deprecated("Use write.format(source).mode(mode).saveAsTable(tableName)", "1.4.0")
   def saveAsTable(tableName: String, source: String, mode: SaveMode): Unit = {
@@ -1582,6 +1588,8 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
+   * @deprecated As of 1.4.0, replaced by
+   *            `write().format(source).mode(mode).options(options).saveAsTable(tableName)`.
    */
   @deprecated("Use write.format(source).mode(mode).options(options).saveAsTable(tableName)",
     "1.4.0")
@@ -1606,6 +1614,8 @@ class DataFrame private[sql](
    * Also note that while this function can persist the table metadata into Hive's metastore,
    * the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
    * @group output
+   * @deprecated As of 1.4.0, replaced by
+   *            `write().format(source).mode(mode).options(options).saveAsTable(tableName)`.
    */
   @deprecated("Use write.format(source).mode(mode).options(options).saveAsTable(tableName)",
     "1.4.0")
@@ -1622,6 +1632,7 @@ class DataFrame private[sql](
    * using the default data source configured by spark.sql.sources.default and
    * [[SaveMode.ErrorIfExists]] as the save mode.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().save(path)`.
    */
   @deprecated("Use write.save(path)", "1.4.0")
   def save(path: String): Unit = {
@@ -1632,6 +1643,7 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame to the given path and [[SaveMode]] specified by mode,
    * using the default data source configured by spark.sql.sources.default.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().mode(mode).save(path)`.
    */
   @deprecated("Use write.mode(mode).save(path)", "1.4.0")
   def save(path: String, mode: SaveMode): Unit = {
@@ -1642,6 +1654,7 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame to the given path based on the given data source,
    * using [[SaveMode.ErrorIfExists]] as the save mode.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().format(source).save(path)`.
    */
   @deprecated("Use write.format(source).save(path)", "1.4.0")
   def save(path: String, source: String): Unit = {
@@ -1652,6 +1665,7 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame to the given path based on the given data source and
    * [[SaveMode]] specified by mode.
    * @group output
+   * @deprecated As of 1.4.0, replaced by `write().format(source).mode(mode).save(path)`.
    */
   @deprecated("Use write.format(source).mode(mode).save(path)", "1.4.0")
   def save(path: String, source: String, mode: SaveMode): Unit = {
@@ -1662,6 +1676,8 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame based on the given data source,
    * [[SaveMode]] specified by mode, and a set of options.
    * @group output
+   * @deprecated As of 1.4.0, replaced by
+   *            `write().format(source).mode(mode).options(options).save(path)`.
    */
   @deprecated("Use write.format(source).mode(mode).options(options).save()", "1.4.0")
   def save(
@@ -1676,6 +1692,8 @@ class DataFrame private[sql](
    * Saves the contents of this DataFrame based on the given data source,
    * [[SaveMode]] specified by mode, and a set of options
    * @group output
+   * @deprecated As of 1.4.0, replaced by
+   *            `write().format(source).mode(mode).options(options).save(path)`.
    */
   @deprecated("Use write.format(source).mode(mode).options(options).save()", "1.4.0")
   def save(
@@ -1689,6 +1707,8 @@ class DataFrame private[sql](
   /**
    * Adds the rows from this RDD to the specified table, optionally overwriting the existing data.
    * @group output
+   * @deprecated As of 1.4.0, replaced by
+   *            `write().mode(SaveMode.Append|SaveMode.Overwrite).saveAsTable(tableName)`.
    */
   @deprecated("Use write.mode(SaveMode.Append|SaveMode.Overwrite).saveAsTable(tableName)", "1.4.0")
   def insertInto(tableName: String, overwrite: Boolean): Unit = {
@@ -1699,6 +1719,8 @@ class DataFrame private[sql](
    * Adds the rows from this RDD to the specified table.
    * Throws an exception if the table already exists.
    * @group output
+   * @deprecated As of 1.4.0, replaced by
+   *            `write().mode(SaveMode.Append).saveAsTable(tableName)`.
    */
   @deprecated("Use write.mode(SaveMode.Append).saveAsTable(tableName)", "1.4.0")
   def insertInto(tableName: String): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index a32897c20b474..7384b24c50b16 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -1021,21 +1021,33 @@ class SQLContext(@transient val sparkContext: SparkContext)
   ////////////////////////////////////////////////////////////////////////////
   ////////////////////////////////////////////////////////////////////////////
 
+  /**
+   * @deprecated As of 1.3.0, replaced by `createDataFrame()`.
+   */
   @deprecated("use createDataFrame", "1.3.0")
   def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = {
     createDataFrame(rowRDD, schema)
   }
 
+  /**
+   * @deprecated As of 1.3.0, replaced by `createDataFrame()`.
+   */
   @deprecated("use createDataFrame", "1.3.0")
   def applySchema(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = {
     createDataFrame(rowRDD, schema)
   }
 
+  /**
+   * @deprecated As of 1.3.0, replaced by `createDataFrame()`.
+   */
   @deprecated("use createDataFrame", "1.3.0")
   def applySchema(rdd: RDD[_], beanClass: Class[_]): DataFrame = {
     createDataFrame(rdd, beanClass)
   }
 
+  /**
+   * @deprecated As of 1.3.0, replaced by `createDataFrame()`.
+   */
   @deprecated("use createDataFrame", "1.3.0")
   def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = {
     createDataFrame(rdd, beanClass)
@@ -1046,6 +1058,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * [[DataFrame]] if no paths are passed in.
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().parquet()`.
    */
   @deprecated("Use read.parquet()", "1.4.0")
   @scala.annotation.varargs
@@ -1065,6 +1078,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * It goes through the entire dataset once to determine the schema.
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonFile(path: String): DataFrame = {
@@ -1076,6 +1090,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * returning the result as a [[DataFrame]].
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonFile(path: String, schema: StructType): DataFrame = {
@@ -1084,6 +1099,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   /**
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonFile(path: String, samplingRatio: Double): DataFrame = {
@@ -1096,6 +1112,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * It goes through the entire dataset once to determine the schema.
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: RDD[String]): DataFrame = read.json(json)
@@ -1106,6 +1123,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * It goes through the entire dataset once to determine the schema.
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: JavaRDD[String]): DataFrame = read.json(json)
@@ -1115,6 +1133,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * returning the result as a [[DataFrame]].
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: RDD[String], schema: StructType): DataFrame = {
@@ -1126,6 +1145,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * schema, returning the result as a [[DataFrame]].
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: JavaRDD[String], schema: StructType): DataFrame = {
@@ -1137,6 +1157,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * schema, returning the result as a [[DataFrame]].
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = {
@@ -1148,6 +1169,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * schema, returning the result as a [[DataFrame]].
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().json()`.
    */
   @deprecated("Use read.json()", "1.4.0")
   def jsonRDD(json: JavaRDD[String], samplingRatio: Double): DataFrame = {
@@ -1159,6 +1181,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * using the default data source configured by spark.sql.sources.default.
    *
    * @group genericdata
+   * @deprecated As of 1.4.0, replaced by `read().load(path)`.
    */
   @deprecated("Use read.load(path)", "1.4.0")
   def load(path: String): DataFrame = {
@@ -1169,6 +1192,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * Returns the dataset stored at path as a DataFrame, using the given data source.
    *
    * @group genericdata
+   * @deprecated As of 1.4.0, replaced by `read().format(source).load(path)`.
    */
   @deprecated("Use read.format(source).load(path)", "1.4.0")
   def load(path: String, source: String): DataFrame = {
@@ -1180,6 +1204,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * a set of options as a DataFrame.
    *
    * @group genericdata
+   * @deprecated As of 1.4.0, replaced by `read().format(source).options(options).load()`.
    */
   @deprecated("Use read.format(source).options(options).load()", "1.4.0")
   def load(source: String, options: java.util.Map[String, String]): DataFrame = {
@@ -1191,6 +1216,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * a set of options as a DataFrame.
    *
    * @group genericdata
+   * @deprecated As of 1.4.0, replaced by `read().format(source).options(options).load()`.
    */
   @deprecated("Use read.format(source).options(options).load()", "1.4.0")
   def load(source: String, options: Map[String, String]): DataFrame = {
@@ -1202,6 +1228,8 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
    *
    * @group genericdata
+   * @deprecated As of 1.4.0, replaced by
+   *            `read().format(source).schema(schema).options(options).load()`.
    */
   @deprecated("Use read.format(source).schema(schema).options(options).load()", "1.4.0")
   def load(source: String, schema: StructType, options: java.util.Map[String, String]): DataFrame =
@@ -1214,6 +1242,8 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * a set of options as a DataFrame, using the given schema as the schema of the DataFrame.
    *
    * @group genericdata
+   * @deprecated As of 1.4.0, replaced by
+   *            `read().format(source).schema(schema).options(options).load()`.
    */
   @deprecated("Use read.format(source).schema(schema).options(options).load()", "1.4.0")
   def load(source: String, schema: StructType, options: Map[String, String]): DataFrame = {
@@ -1225,6 +1255,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * url named table.
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().jdbc()`.
    */
   @deprecated("use read.jdbc()", "1.4.0")
   def jdbc(url: String, table: String): DataFrame = {
@@ -1242,6 +1273,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @param numPartitions the number of partitions.  the range `minValue`-`maxValue` will be split
    *                      evenly into this many partitions
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().jdbc()`.
    */
   @deprecated("use read.jdbc()", "1.4.0")
   def jdbc(
@@ -1261,6 +1293,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * of the [[DataFrame]].
    *
    * @group specificdata
+   * @deprecated As of 1.4.0, replaced by `read().jdbc()`.
    */
   @deprecated("use read.jdbc()", "1.4.0")
   def jdbc(url: String, table: String, theParts: Array[String]): DataFrame = {

From 00a7137900d45188673da85cbcef4f02b7a266c1 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 30 May 2015 20:10:02 -0700
Subject: [PATCH 276/525] Update documentation for the new DataFrame
 reader/writer interface.

Author: Reynold Xin <rxin@databricks.com>

Closes #6522 from rxin/sql-doc-1.4 and squashes the following commits:

c227be7 [Reynold Xin] Updated link.
040b6d7 [Reynold Xin] Update documentation for the new DataFrame reader/writer interface.
---
 docs/sql-programming-guide.md | 126 ++++++++++++++++++----------------
 1 file changed, 66 insertions(+), 60 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 2ea7572c6026a..282ea75e1e785 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -109,7 +109,7 @@ As an example, the following creates a `DataFrame` based on the content of a JSO
 val sc: SparkContext // An existing SparkContext.
 val sqlContext = new org.apache.spark.sql.SQLContext(sc)
 
-val df = sqlContext.jsonFile("examples/src/main/resources/people.json")
+val df = sqlContext.read.json("examples/src/main/resources/people.json")
 
 // Displays the content of the DataFrame to stdout
 df.show()
@@ -122,7 +122,7 @@ df.show()
 JavaSparkContext sc = ...; // An existing JavaSparkContext.
 SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
 
-DataFrame df = sqlContext.jsonFile("examples/src/main/resources/people.json");
+DataFrame df = sqlContext.read().json("examples/src/main/resources/people.json");
 
 // Displays the content of the DataFrame to stdout
 df.show();
@@ -135,7 +135,7 @@ df.show();
 from pyspark.sql import SQLContext
 sqlContext = SQLContext(sc)
 
-df = sqlContext.jsonFile("examples/src/main/resources/people.json")
+df = sqlContext.read.json("examples/src/main/resources/people.json")
 
 # Displays the content of the DataFrame to stdout
 df.show()
@@ -171,7 +171,7 @@ val sc: SparkContext // An existing SparkContext.
 val sqlContext = new org.apache.spark.sql.SQLContext(sc)
 
 // Create the DataFrame
-val df = sqlContext.jsonFile("examples/src/main/resources/people.json")
+val df = sqlContext.read.json("examples/src/main/resources/people.json")
 
 // Show the content of the DataFrame
 df.show()
@@ -221,7 +221,7 @@ JavaSparkContext sc // An existing SparkContext.
 SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc)
 
 // Create the DataFrame
-DataFrame df = sqlContext.jsonFile("examples/src/main/resources/people.json");
+DataFrame df = sqlContext.read().json("examples/src/main/resources/people.json");
 
 // Show the content of the DataFrame
 df.show();
@@ -277,7 +277,7 @@ from pyspark.sql import SQLContext
 sqlContext = SQLContext(sc)
 
 # Create the DataFrame
-df = sqlContext.jsonFile("examples/src/main/resources/people.json")
+df = sqlContext.read.json("examples/src/main/resources/people.json")
 
 # Show the content of the DataFrame
 df.show()
@@ -777,8 +777,8 @@ In the simplest form, the default data source (`parquet` unless otherwise config
 <div data-lang="scala"  markdown="1">
 
 {% highlight scala %}
-val df = sqlContext.load("examples/src/main/resources/users.parquet")
-df.select("name", "favorite_color").save("namesAndFavColors.parquet")
+val df = sqlContext.read.load("examples/src/main/resources/users.parquet")
+df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
 {% endhighlight %}
 
 </div>
@@ -787,8 +787,8 @@ df.select("name", "favorite_color").save("namesAndFavColors.parquet")
 
 {% highlight java %}
 
-DataFrame df = sqlContext.load("examples/src/main/resources/users.parquet");
-df.select("name", "favorite_color").save("namesAndFavColors.parquet");
+DataFrame df = sqlContext.read().load("examples/src/main/resources/users.parquet");
+df.select("name", "favorite_color").write().save("namesAndFavColors.parquet");
 
 {% endhighlight %}
 
@@ -798,8 +798,8 @@ df.select("name", "favorite_color").save("namesAndFavColors.parquet");
 
 {% highlight python %}
 
-df = sqlContext.load("examples/src/main/resources/users.parquet")
-df.select("name", "favorite_color").save("namesAndFavColors.parquet")
+df = sqlContext.read.load("examples/src/main/resources/users.parquet")
+df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
 
 {% endhighlight %}
 
@@ -827,8 +827,8 @@ using this syntax.
 <div data-lang="scala"  markdown="1">
 
 {% highlight scala %}
-val df = sqlContext.load("examples/src/main/resources/people.json", "json")
-df.select("name", "age").save("namesAndAges.parquet", "parquet")
+val df = sqlContext.read.format("json").load("examples/src/main/resources/people.json")
+df.select("name", "age").write.format("json").save("namesAndAges.parquet")
 {% endhighlight %}
 
 </div>
@@ -837,8 +837,8 @@ df.select("name", "age").save("namesAndAges.parquet", "parquet")
 
 {% highlight java %}
 
-DataFrame df = sqlContext.load("examples/src/main/resources/people.json", "json");
-df.select("name", "age").save("namesAndAges.parquet", "parquet");
+DataFrame df = sqlContext.read().format("json").load("examples/src/main/resources/people.json");
+df.select("name", "age").write().format("parquet").save("namesAndAges.parquet");
 
 {% endhighlight %}
 
@@ -848,8 +848,8 @@ df.select("name", "age").save("namesAndAges.parquet", "parquet");
 
 {% highlight python %}
 
-df = sqlContext.load("examples/src/main/resources/people.json", "json")
-df.select("name", "age").save("namesAndAges.parquet", "parquet")
+df = sqlContext.read.load("examples/src/main/resources/people.json", format="json")
+df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")
 
 {% endhighlight %}
 
@@ -947,11 +947,11 @@ import sqlContext.implicits._
 val people: RDD[Person] = ... // An RDD of case class objects, from the previous example.
 
 // The RDD is implicitly converted to a DataFrame by implicits, allowing it to be stored using Parquet.
-people.saveAsParquetFile("people.parquet")
+people.write.parquet("people.parquet")
 
 // Read in the parquet file created above.  Parquet files are self-describing so the schema is preserved.
 // The result of loading a Parquet file is also a DataFrame.
-val parquetFile = sqlContext.parquetFile("people.parquet")
+val parquetFile = sqlContext.read.parquet("people.parquet")
 
 //Parquet files can also be registered as tables and then used in SQL statements.
 parquetFile.registerTempTable("parquetFile")
@@ -969,11 +969,11 @@ teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
 DataFrame schemaPeople = ... // The DataFrame from the previous example.
 
 // DataFrames can be saved as Parquet files, maintaining the schema information.
-schemaPeople.saveAsParquetFile("people.parquet");
+schemaPeople.write().parquet("people.parquet");
 
 // Read in the Parquet file created above.  Parquet files are self-describing so the schema is preserved.
 // The result of loading a parquet file is also a DataFrame.
-DataFrame parquetFile = sqlContext.parquetFile("people.parquet");
+DataFrame parquetFile = sqlContext.read().parquet("people.parquet");
 
 //Parquet files can also be registered as tables and then used in SQL statements.
 parquetFile.registerTempTable("parquetFile");
@@ -995,11 +995,11 @@ List<String> teenagerNames = teenagers.javaRDD().map(new Function<Row, String>()
 schemaPeople # The DataFrame from the previous example.
 
 # DataFrames can be saved as Parquet files, maintaining the schema information.
-schemaPeople.saveAsParquetFile("people.parquet")
+schemaPeople.read.parquet("people.parquet")
 
 # Read in the Parquet file created above.  Parquet files are self-describing so the schema is preserved.
 # The result of loading a parquet file is also a DataFrame.
-parquetFile = sqlContext.parquetFile("people.parquet")
+parquetFile = sqlContext.write.parquet("people.parquet")
 
 # Parquet files can also be registered as tables and then used in SQL statements.
 parquetFile.registerTempTable("parquetFile");
@@ -1087,9 +1087,9 @@ path
 
 {% endhighlight %}
 
-By passing `path/to/table` to either `SQLContext.parquetFile` or `SQLContext.load`, Spark SQL will
-automatically extract the partitioning information from the paths.  Now the schema of the returned
-DataFrame becomes:
+By passing `path/to/table` to either `SQLContext.read.parquet` or `SQLContext.read.load`, Spark SQL
+will automatically extract the partitioning information from the paths.
+Now the schema of the returned DataFrame becomes:
 
 {% highlight text %}
 
@@ -1122,15 +1122,15 @@ import sqlContext.implicits._
 
 // Create a simple DataFrame, stored into a partition directory
 val df1 = sparkContext.makeRDD(1 to 5).map(i => (i, i * 2)).toDF("single", "double")
-df1.saveAsParquetFile("data/test_table/key=1")
+df1.write.parquet("data/test_table/key=1")
 
 // Create another DataFrame in a new partition directory,
 // adding a new column and dropping an existing column
 val df2 = sparkContext.makeRDD(6 to 10).map(i => (i, i * 3)).toDF("single", "triple")
-df2.saveAsParquetFile("data/test_table/key=2")
+df2.write.parquet("data/test_table/key=2")
 
 // Read the partitioned table
-val df3 = sqlContext.parquetFile("data/test_table")
+val df3 = sqlContext.read.parquet("data/test_table")
 df3.printSchema()
 
 // The final schema consists of all 3 columns in the Parquet files together
@@ -1269,12 +1269,10 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`
 
 <div data-lang="scala"  markdown="1">
 Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame.
-This conversion can be done using one of two methods in a `SQLContext`:
+This conversion can be done using `SQLContext.read.json()` on either an RDD of String,
+or a JSON file.
 
-* `jsonFile` - loads data from a directory of JSON files where each line of the files is a JSON object.
-* `jsonRDD` - loads data from an existing RDD where each element of the RDD is a string containing a JSON object.
-
-Note that the file that is offered as _jsonFile_ is not a typical JSON file. Each
+Note that the file that is offered as _a json file_ is not a typical JSON file. Each
 line must contain a separate, self-contained valid JSON object. As a consequence,
 a regular multi-line JSON file will most often fail.
 
@@ -1285,8 +1283,7 @@ val sqlContext = new org.apache.spark.sql.SQLContext(sc)
 // A JSON dataset is pointed to by path.
 // The path can be either a single text file or a directory storing text files.
 val path = "examples/src/main/resources/people.json"
-// Create a DataFrame from the file(s) pointed to by path
-val people = sqlContext.jsonFile(path)
+val people = sqlContext.read.json(path)
 
 // The inferred schema can be visualized using the printSchema() method.
 people.printSchema()
@@ -1304,19 +1301,17 @@ val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age
 // an RDD[String] storing one JSON object per string.
 val anotherPeopleRDD = sc.parallelize(
   """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
-val anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD)
+val anotherPeople = sqlContext.read.json(anotherPeopleRDD)
 {% endhighlight %}
 
 </div>
 
 <div data-lang="java"  markdown="1">
 Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame.
-This conversion can be done using one of two methods in a `SQLContext` :
-
-* `jsonFile` - loads data from a directory of JSON files where each line of the files is a JSON object.
-* `jsonRDD` - loads data from an existing RDD where each element of the RDD is a string containing a JSON object.
+This conversion can be done using `SQLContext.read().json()` on either an RDD of String,
+or a JSON file.
 
-Note that the file that is offered as _jsonFile_ is not a typical JSON file. Each
+Note that the file that is offered as _a json file_ is not a typical JSON file. Each
 line must contain a separate, self-contained valid JSON object. As a consequence,
 a regular multi-line JSON file will most often fail.
 
@@ -1326,9 +1321,7 @@ SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
 
 // A JSON dataset is pointed to by path.
 // The path can be either a single text file or a directory storing text files.
-String path = "examples/src/main/resources/people.json";
-// Create a DataFrame from the file(s) pointed to by path
-DataFrame people = sqlContext.jsonFile(path);
+DataFrame people = sqlContext.read().json("examples/src/main/resources/people.json");
 
 // The inferred schema can be visualized using the printSchema() method.
 people.printSchema();
@@ -1347,18 +1340,15 @@ DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AN
 List<String> jsonData = Arrays.asList(
   "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
 JavaRDD<String> anotherPeopleRDD = sc.parallelize(jsonData);
-DataFrame anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD);
+DataFrame anotherPeople = sqlContext.read().json(anotherPeopleRDD);
 {% endhighlight %}
 </div>
 
 <div data-lang="python"  markdown="1">
 Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame.
-This conversion can be done using one of two methods in a `SQLContext`:
-
-* `jsonFile` - loads data from a directory of JSON files where each line of the files is a JSON object.
-* `jsonRDD` - loads data from an existing RDD where each element of the RDD is a string containing a JSON object.
+This conversion can be done using `SQLContext.read.json` on a JSON file.
 
-Note that the file that is offered as _jsonFile_ is not a typical JSON file. Each
+Note that the file that is offered as _a json file_ is not a typical JSON file. Each
 line must contain a separate, self-contained valid JSON object. As a consequence,
 a regular multi-line JSON file will most often fail.
 
@@ -1369,9 +1359,7 @@ sqlContext = SQLContext(sc)
 
 # A JSON dataset is pointed to by path.
 # The path can be either a single text file or a directory storing text files.
-path = "examples/src/main/resources/people.json"
-# Create a DataFrame from the file(s) pointed to by path
-people = sqlContext.jsonFile(path)
+people = sqlContext.read.json("examples/src/main/resources/people.json")
 
 # The inferred schema can be visualized using the printSchema() method.
 people.printSchema()
@@ -1394,12 +1382,11 @@ anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD)
 </div>
 
 <div data-lang="r"  markdown="1">
-Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame.
-This conversion can be done using one of two methods in a `SQLContext`:
-
-* `jsonFile` - loads data from a directory of JSON files where each line of the files is a JSON object.
+Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame. using
+the `jsonFile` function, which loads data from a directory of JSON files where each line of the
+files is a JSON object.
 
-Note that the file that is offered as _jsonFile_ is not a typical JSON file. Each
+Note that the file that is offered as _a json file_ is not a typical JSON file. Each
 line must contain a separate, self-contained valid JSON object. As a consequence,
 a regular multi-line JSON file will most often fail.
 
@@ -1883,6 +1870,25 @@ options.
 
 ## Upgrading from Spark SQL 1.3 to 1.4
 
+#### DataFrame data reader/writer interface
+
+Based on user feedback, we created a new, more fluid API for reading data in (`SQLContext.read`)
+and writing data out (`DataFrame.write`), 
+and deprecated the old APIs (e.g. `SQLContext.parquetFile`, `SQLContext.jsonFile`).
+
+See the API docs for `SQLContext.read` (
+  <a href="api/scala/index.html#org.apache.spark.sql.SQLContext@read:DataFrameReader">Scala</a>,
+  <a href="api/java/org/apache/spark/sql/SQLContext.html#read()">Java</a>,
+  <a href="api/python/pyspark.sql.html#pyspark.sql.SQLContext.read">Python</a>
+) and `DataFrame.write` (
+  <a href="api/scala/index.html#org.apache.spark.sql.DataFrame@write:DataFrameWriter">Scala</a>,
+  <a href="api/java/org/apache/spark/sql/DataFrame.html#write()">Java</a>,
+  <a href="api/python/pyspark.sql.html#pyspark.sql.DataFrame.write">Python</a>
+) more information.
+
+
+#### DataFrame.groupBy retains grouping columns
+
 Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`.
 
 <div class="codetabs">

From f7fe9e474417a68635a5ed1aa819d81a9be40895 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sun, 31 May 2015 12:56:41 +0800
Subject: [PATCH 277/525] [SQL] [MINOR] Fixes a minor comment mistake in
 IsolatedClientLoader

Author: Cheng Lian <lian@databricks.com>

Closes #6521 from liancheng/classloader-comment-fix and squashes the following commits:

fc09606 [Cheng Lian] Addresses @srowen's comment
59945c5 [Cheng Lian] Fixes a minor comment mistake in IsolatedClientLoader
---
 .../spark/sql/hive/client/IsolatedClientLoader.scala   | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index 196a3d836cab2..16851fdd71a98 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -90,14 +90,14 @@ private[hive] object IsolatedClientLoader {
  *    `ClientInterface`, unless `isolationOn` is set to `false`.
  *
  * @param version The version of hive on the classpath.  used to pick specific function signatures
- *                that are not compatibile accross versions.
+ *                that are not compatible across versions.
  * @param execJars A collection of jar files that must include hive and hadoop.
  * @param config   A set of options that will be added to the HiveConf of the constructed client.
  * @param isolationOn When true, custom versions of barrier classes will be constructed.  Must be
  *                    true unless loading the version of hive that is on Sparks classloader.
- * @param rootClassLoader The system root classloader.  Must not know about hive classes.
- * @param baseClassLoader The spark classloader that is used to load shared classes.
- *
+ * @param rootClassLoader The system root classloader.
+ * @param baseClassLoader The spark classloader that is used to load shared classes.  Must not know
+ *                        about Hive classes.
  */
 private[hive] class IsolatedClientLoader(
     val version: HiveVersion,
@@ -110,7 +110,7 @@ private[hive] class IsolatedClientLoader(
     val barrierPrefixes: Seq[String] = Seq.empty)
   extends Logging {
 
-  // Check to make sure that the root classloader does not know about Hive.
+  // Check to make sure that the base classloader does not know about Hive.
   assert(Try(baseClassLoader.loadClass("org.apache.hive.HiveConf")).isFailure)
 
   /** All jars used by the hive specific classloader. */

From 084fef76e90116c6465cd6fad7c0197c3e4d4313 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 30 May 2015 23:36:32 -0700
Subject: [PATCH 278/525] [SPARK-7976] Add style checker to disallow overriding
 finalize.

Author: Reynold Xin <rxin@databricks.com>

Closes #6528 from rxin/style-finalizer and squashes the following commits:

a2211ca [Reynold Xin] [SPARK-7976] Enable NoFinalizeChecker.
---
 core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala | 2 ++
 scalastyle-config.xml                                           | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index a77bf42ce1d38..51388f01a31a3 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -843,6 +843,7 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort:
  * An Wrapper for Python Broadcast, which is written into disk by Python. It also will
  * write the data into disk after deserialization, then Python can read it from disks.
  */
+// scalastyle:off no.finalize
 private[spark] class PythonBroadcast(@transient var path: String) extends Serializable {
 
   /**
@@ -884,3 +885,4 @@ private[spark] class PythonBroadcast(@transient var path: String) extends Serial
     }
   }
 }
+// scalastyle:on no.finalize
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index a0098169a0248..072c48062ca75 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -96,7 +96,7 @@
  <!-- <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="true"></check> -->
  <!-- <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="true"></check> -->
  <!-- <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check> -->
- <!-- <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check> -->
+  <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
  <!-- <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check> -->
  <!-- <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check> -->
  <!-- <check level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> -->

From 8764dccebd44292ab6f6834640199aad451459c5 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sat, 30 May 2015 23:49:42 -0700
Subject: [PATCH 279/525] [SQL] [MINOR] Adds @deprecated Scaladoc entry for
 SchemaRDD

Author: Cheng Lian <lian@databricks.com>

Closes #6529 from liancheng/schemardd-deprecation-fix and squashes the following commits:

49765c2 [Cheng Lian] Adds @deprecated Scaladoc entry for SchemaRDD
---
 sql/core/src/main/scala/org/apache/spark/sql/package.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/package.scala
index 3f97a11ceb97d..4e94fd07a8771 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/package.scala
@@ -44,6 +44,7 @@ package object sql {
 
   /**
    * Type alias for [[DataFrame]]. Kept here for backward source compatibility for Scala.
+   * @deprecated As of 1.3.0, replaced by `DataFrame`.
    */
   @deprecated("1.3.0", "use DataFrame")
   type SchemaRDD = DataFrame

From 7896e99b2a0a160bd0b6c5c11cf40b6cbf4a65cf Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 00:05:55 -0700
Subject: [PATCH 280/525] [SPARK-7975] Add style checker to disallow overriding
 equals covariantly.

Author: Reynold Xin <rxin@databricks.com>

This patch had conflicts when merged, resolved by
Committer: Reynold Xin <rxin@databricks.com>

Closes #6527 from rxin/covariant-equals and squashes the following commits:

e7d7784 [Reynold Xin] [SPARK-7975] Enforce CovariantEqualsChecker
---
 scalastyle-config.xml                                           | 2 +-
 .../main/scala/org/apache/spark/sql/parquet/newParquet.scala    | 2 +-
 .../scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 072c48062ca75..3a984222167b0 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -97,7 +97,7 @@
  <!-- <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="true"></check> -->
  <!-- <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check> -->
   <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
- <!-- <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check> -->
+  <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
  <!-- <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check> -->
  <!-- <check level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> -->
  <!--  <parameters> -->
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 8b3e1b2b59bf6..e439a18ac43aa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -155,7 +155,7 @@ private[sql] class ParquetRelation2(
     meta
   }
 
-  override def equals(other: scala.Any): Boolean = other match {
+  override def equals(other: Any): Boolean = other match {
     case that: ParquetRelation2 =>
       val schemaEquality = if (shouldMergeSchemas) {
         this.shouldMergeSchemas == that.shouldMergeSchemas
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 47b85731587d5..ca1f49b546bd7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -596,7 +596,7 @@ private[hive] case class MetastoreRelation
 
   self: Product =>
 
-  override def equals(other: scala.Any): Boolean = other match {
+  override def equals(other: Any): Boolean = other match {
     case relation: MetastoreRelation =>
       databaseName == relation.databaseName &&
         tableName == relation.tableName &&

From 74fdc97c7206c6d715f128ef7c46055e0bb90760 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 00:16:22 -0700
Subject: [PATCH 281/525] [SPARK-3850] Trim trailing spaces for core.

Author: Reynold Xin <rxin@databricks.com>

Closes #6533 from rxin/whitespace-2 and squashes the following commits:

038314c [Reynold Xin] [SPARK-3850] Trim trailing spaces for core.
---
 .../scala/org/apache/spark/Aggregator.scala   |  4 +--
 .../scala/org/apache/spark/FutureAction.scala |  2 +-
 .../org/apache/spark/HeartbeatReceiver.scala  | 20 ++++++-------
 .../org/apache/spark/HttpFileServer.scala     |  4 +--
 .../scala/org/apache/spark/SparkConf.scala    | 12 ++++----
 .../scala/org/apache/spark/TestUtils.scala    |  2 +-
 .../apache/spark/api/java/JavaDoubleRDD.scala |  2 +-
 .../org/apache/spark/api/java/JavaRDD.scala   |  6 ++--
 .../apache/spark/api/python/PythonRDD.scala   |  4 +--
 .../org/apache/spark/api/r/RBackend.scala     |  4 +--
 .../apache/spark/api/r/RBackendHandler.scala  |  2 +-
 .../org/apache/spark/deploy/SparkSubmit.scala |  2 +-
 .../history/HistoryServerArguments.scala      |  2 +-
 .../master/ZooKeeperPersistenceEngine.scala   |  2 +-
 .../apache/spark/executor/TaskMetrics.scala   | 16 +++++------
 .../apache/spark/metrics/sink/Slf4jSink.scala |  4 +--
 .../apache/spark/metrics/sink/package.scala   |  2 +-
 .../apache/spark/rdd/AsyncRDDActions.scala    |  4 +--
 .../org/apache/spark/rdd/NewHadoopRDD.scala   |  2 +-
 .../apache/spark/rdd/PairRDDFunctions.scala   |  2 +-
 .../spark/scheduler/ReplayListenerBus.scala   |  4 +--
 .../org/apache/spark/scheduler/Task.scala     |  2 +-
 .../spark/scheduler/TaskSetManager.scala      |  4 +--
 .../CoarseGrainedSchedulerBackend.scala       |  2 +-
 .../mesos/MesosSchedulerBackendUtil.scala     |  4 +--
 .../spark/serializer/KryoSerializer.scala     |  2 +-
 .../hash/BlockStoreShuffleFetcher.scala       |  2 +-
 .../status/api/v1/OneStageResource.scala      |  2 +-
 .../storage/BlockManagerMasterEndpoint.scala  |  8 +++---
 .../spark/storage/DiskBlockManager.scala      |  2 +-
 .../spark/storage/TachyonBlockManager.scala   |  4 +--
 .../scala/org/apache/spark/ui/WebUI.scala     |  4 +--
 .../spark/ui/jobs/JobProgressListener.scala   |  2 +-
 .../spark/util/AsynchronousListenerBus.scala  |  2 +-
 .../org/apache/spark/util/SizeEstimator.scala |  4 +--
 .../collection/ExternalAppendOnlyMap.scala    |  4 +--
 .../util/collection/ExternalSorter.scala      |  2 +-
 .../scala/org/apache/spark/FailureSuite.scala |  6 ++--
 .../apache/spark/ImplicitOrderingSuite.scala  | 28 +++++++++----------
 .../org/apache/spark/SparkContextSuite.scala  | 10 +++----
 .../org/apache/spark/rdd/JdbcRDDSuite.scala   |  2 +-
 .../cluster/mesos/MemoryUtilsSuite.scala      |  4 +--
 .../mesos/MesosSchedulerBackendSuite.scala    |  4 +--
 .../serializer/KryoSerializerSuite.scala      |  6 ++--
 .../ProactiveClosureSerializationSuite.scala  | 18 ++++++------
 .../spark/util/ClosureCleanerSuite.scala      |  2 +-
 .../util/random/RandomSamplerSuite.scala      |  2 +-
 47 files changed, 117 insertions(+), 117 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala
index b8a5f5016860f..ceeb58075d345 100644
--- a/core/src/main/scala/org/apache/spark/Aggregator.scala
+++ b/core/src/main/scala/org/apache/spark/Aggregator.scala
@@ -34,8 +34,8 @@ case class Aggregator[K, V, C] (
     mergeValue: (C, V) => C,
     mergeCombiners: (C, C) => C) {
 
-  // When spilling is enabled sorting will happen externally, but not necessarily with an 
-  // ExternalSorter. 
+  // When spilling is enabled sorting will happen externally, but not necessarily with an
+  // ExternalSorter.
   private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true)
 
   @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0")
diff --git a/core/src/main/scala/org/apache/spark/FutureAction.scala b/core/src/main/scala/org/apache/spark/FutureAction.scala
index 91f9ef8ce7185..48792a958130c 100644
--- a/core/src/main/scala/org/apache/spark/FutureAction.scala
+++ b/core/src/main/scala/org/apache/spark/FutureAction.scala
@@ -150,7 +150,7 @@ class SimpleFutureAction[T] private[spark](jobWaiter: JobWaiter[_], resultFunc:
   }
 
   override def isCompleted: Boolean = jobWaiter.jobFinished
-  
+
   override def isCancelled: Boolean = _cancelled
 
   override def value: Option[Try[T]] = {
diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
index f2b024ff6cb67..6909015ff66e6 100644
--- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
+++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
@@ -29,7 +29,7 @@ import org.apache.spark.util.{ThreadUtils, Utils}
 
 /**
  * A heartbeat from executors to the driver. This is a shared message used by several internal
- * components to convey liveness or execution information for in-progress tasks. It will also 
+ * components to convey liveness or execution information for in-progress tasks. It will also
  * expire the hosts that have not heartbeated for more than spark.network.timeout.
  */
 private[spark] case class Heartbeat(
@@ -43,8 +43,8 @@ private[spark] case class Heartbeat(
  */
 private[spark] case object TaskSchedulerIsSet
 
-private[spark] case object ExpireDeadHosts 
-    
+private[spark] case object ExpireDeadHosts
+
 private[spark] case class HeartbeatResponse(reregisterBlockManager: Boolean)
 
 /**
@@ -62,18 +62,18 @@ private[spark] class HeartbeatReceiver(sc: SparkContext)
 
   // "spark.network.timeout" uses "seconds", while `spark.storage.blockManagerSlaveTimeoutMs` uses
   // "milliseconds"
-  private val slaveTimeoutMs = 
+  private val slaveTimeoutMs =
     sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", "120s")
-  private val executorTimeoutMs = 
+  private val executorTimeoutMs =
     sc.conf.getTimeAsSeconds("spark.network.timeout", s"${slaveTimeoutMs}ms") * 1000
-  
+
   // "spark.network.timeoutInterval" uses "seconds", while
   // "spark.storage.blockManagerTimeoutIntervalMs" uses "milliseconds"
-  private val timeoutIntervalMs = 
+  private val timeoutIntervalMs =
     sc.conf.getTimeAsMs("spark.storage.blockManagerTimeoutIntervalMs", "60s")
-  private val checkTimeoutIntervalMs = 
+  private val checkTimeoutIntervalMs =
     sc.conf.getTimeAsSeconds("spark.network.timeoutInterval", s"${timeoutIntervalMs}ms") * 1000
-  
+
   private var timeoutCheckingTask: ScheduledFuture[_] = null
 
   // "eventLoopThread" is used to run some pretty fast actions. The actions running in it should not
@@ -140,7 +140,7 @@ private[spark] class HeartbeatReceiver(sc: SparkContext)
       }
     }
   }
-  
+
   override def onStop(): Unit = {
     if (timeoutCheckingTask != null) {
       timeoutCheckingTask.cancel(true)
diff --git a/core/src/main/scala/org/apache/spark/HttpFileServer.scala b/core/src/main/scala/org/apache/spark/HttpFileServer.scala
index 7e706bcc42f04..7cf7bc0dc6810 100644
--- a/core/src/main/scala/org/apache/spark/HttpFileServer.scala
+++ b/core/src/main/scala/org/apache/spark/HttpFileServer.scala
@@ -50,8 +50,8 @@ private[spark] class HttpFileServer(
 
   def stop() {
     httpServer.stop()
-    
-    // If we only stop sc, but the driver process still run as a services then we need to delete 
+
+    // If we only stop sc, but the driver process still run as a services then we need to delete
     // the tmp dir, if not, it will create too many tmp dirs
     try {
       Utils.deleteRecursively(baseDir)
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 4b5bcb54aa873..46d72841dccce 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -227,7 +227,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
   def getSizeAsBytes(key: String, defaultValue: String): Long = {
     Utils.byteStringAsBytes(get(key, defaultValue))
   }
-  
+
   /**
    * Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no
    * suffix is provided then Kibibytes are assumed.
@@ -244,7 +244,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
   def getSizeAsKb(key: String, defaultValue: String): Long = {
     Utils.byteStringAsKb(get(key, defaultValue))
   }
-  
+
   /**
    * Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no
    * suffix is provided then Mebibytes are assumed.
@@ -261,7 +261,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
   def getSizeAsMb(key: String, defaultValue: String): Long = {
     Utils.byteStringAsMb(get(key, defaultValue))
   }
-  
+
   /**
    * Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no
    * suffix is provided then Gibibytes are assumed.
@@ -278,7 +278,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
   def getSizeAsGb(key: String, defaultValue: String): Long = {
     Utils.byteStringAsGb(get(key, defaultValue))
   }
-  
+
   /** Get a parameter as an Option */
   def getOption(key: String): Option[String] = {
     Option(settings.get(key)).orElse(getDeprecatedConfig(key, this))
@@ -480,7 +480,7 @@ private[spark] object SparkConf extends Logging {
           "spark.kryoserializer.buffer.mb was previously specified as '0.064'. Fractional values " +
           "are no longer accepted. To specify the equivalent now, one may use '64k'.")
     )
-    
+
     Map(configs.map { cfg => (cfg.key -> cfg) } : _*)
   }
 
@@ -508,7 +508,7 @@ private[spark] object SparkConf extends Logging {
     "spark.reducer.maxSizeInFlight" -> Seq(
       AlternateConfig("spark.reducer.maxMbInFlight", "1.4")),
     "spark.kryoserializer.buffer" ->
-        Seq(AlternateConfig("spark.kryoserializer.buffer.mb", "1.4", 
+        Seq(AlternateConfig("spark.kryoserializer.buffer.mb", "1.4",
           translation = s => s"${(s.toDouble * 1000).toInt}k")),
     "spark.kryoserializer.buffer.max" -> Seq(
       AlternateConfig("spark.kryoserializer.buffer.max.mb", "1.4")),
diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala
index fe6320b504e15..a1ebbecf93b7b 100644
--- a/core/src/main/scala/org/apache/spark/TestUtils.scala
+++ b/core/src/main/scala/org/apache/spark/TestUtils.scala
@@ -51,7 +51,7 @@ private[spark] object TestUtils {
       classpathUrls: Seq[URL] = Seq()): URL = {
     val tempDir = Utils.createTempDir()
     val files1 = for (name <- classNames) yield {
-      createCompiledClass(name, tempDir, toStringValue, classpathUrls = classpathUrls) 
+      createCompiledClass(name, tempDir, toStringValue, classpathUrls = classpathUrls)
     }
     val files2 = for ((childName, baseName) <- classNamesWithBase) yield {
       createCompiledClass(childName, tempDir, toStringValue, baseName, classpathUrls)
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
index 61af867b11b9c..a650df605b92e 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
@@ -137,7 +137,7 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
    */
   def sample(withReplacement: Boolean, fraction: JDouble): JavaDoubleRDD =
     sample(withReplacement, fraction, Utils.random.nextLong)
-    
+
   /**
    * Return a sampled subset of this RDD.
    */
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
index db4e996feb31c..ed312770ee131 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
@@ -101,7 +101,7 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
 
   /**
    * Return a sampled subset of this RDD.
-   * 
+   *
    * @param withReplacement can elements be sampled multiple times (replaced when sampled out)
    * @param fraction expected size of the sample as a fraction of this RDD's size
    *  without replacement: probability that each element is chosen; fraction must be [0, 1]
@@ -109,10 +109,10 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
    */
   def sample(withReplacement: Boolean, fraction: Double): JavaRDD[T] =
     sample(withReplacement, fraction, Utils.random.nextLong)
-    
+
   /**
    * Return a sampled subset of this RDD.
-   * 
+   *
    * @param withReplacement can elements be sampled multiple times (replaced when sampled out)
    * @param fraction expected size of the sample as a fraction of this RDD's size
    *  without replacement: probability that each element is chosen; fraction must be [0, 1]
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 51388f01a31a3..55a37f8c944b2 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -797,10 +797,10 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort:
 
   val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536)
 
-  /** 
+  /**
    * We try to reuse a single Socket to transfer accumulator updates, as they are all added
    * by the DAGScheduler's single-threaded actor anyway.
-   */ 
+   */
   @transient var socket: Socket = _
 
   def openSocket(): Socket = synchronized {
diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
index 0a91977928cee..d24c650d37bb0 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
@@ -44,11 +44,11 @@ private[spark] class RBackend {
     bossGroup = new NioEventLoopGroup(2)
     val workerGroup = bossGroup
     val handler = new RBackendHandler(this)
-  
+
     bootstrap = new ServerBootstrap()
       .group(bossGroup, workerGroup)
       .channel(classOf[NioServerSocketChannel])
-  
+
     bootstrap.childHandler(new ChannelInitializer[SocketChannel]() {
       def initChannel(ch: SocketChannel): Unit = {
         ch.pipeline()
diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
index 026a1b9380357..2e86984c66b3a 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
@@ -77,7 +77,7 @@ private[r] class RBackendHandler(server: RBackend)
     val reply = bos.toByteArray
     ctx.write(reply)
   }
-  
+
   override def channelReadComplete(ctx: ChannelHandlerContext): Unit = {
     ctx.flush()
   }
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index d1b32ea0778db..8cf4d58847d8e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -869,7 +869,7 @@ private[spark] object SparkSubmitUtils {
       md.addDependency(dd)
     }
   }
-  
+
   /** Add exclusion rules for dependencies already included in the spark-assembly */
   def addExclusionRules(
       ivySettings: IvySettings,
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
index a2a97a7877ce7..4692d22651c93 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
@@ -23,7 +23,7 @@ import org.apache.spark.util.Utils
 /**
  * Command-line parser for the master.
  */
-private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String]) 
+private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String])
   extends Logging {
   private var propertiesFile: String = null
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala
index 80db6d474b5c1..328d95a7a0c68 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala
@@ -32,7 +32,7 @@ import org.apache.spark.deploy.SparkCuratorUtil
 private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serialization: Serialization)
   extends PersistenceEngine
   with Logging {
-  
+
   private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
   private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)
 
diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
index d90ae405a0849..38b61d7242fce 100644
--- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
+++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
@@ -43,22 +43,22 @@ class TaskMetrics extends Serializable {
   private var _hostname: String = _
   def hostname: String = _hostname
   private[spark] def setHostname(value: String) = _hostname = value
-  
+
   /**
    * Time taken on the executor to deserialize this task
    */
   private var _executorDeserializeTime: Long = _
   def executorDeserializeTime: Long = _executorDeserializeTime
   private[spark] def setExecutorDeserializeTime(value: Long) = _executorDeserializeTime = value
-  
-  
+
+
   /**
    * Time the executor spends actually running the task (including fetching shuffle data)
    */
   private var _executorRunTime: Long = _
   def executorRunTime: Long = _executorRunTime
   private[spark] def setExecutorRunTime(value: Long) = _executorRunTime = value
-  
+
   /**
    * The number of bytes this task transmitted back to the driver as the TaskResult
    */
@@ -315,7 +315,7 @@ class ShuffleReadMetrics extends Serializable {
   def remoteBlocksFetched: Int = _remoteBlocksFetched
   private[spark] def incRemoteBlocksFetched(value: Int) = _remoteBlocksFetched += value
   private[spark] def decRemoteBlocksFetched(value: Int) = _remoteBlocksFetched -= value
-  
+
   /**
    * Number of local blocks fetched in this shuffle by this task
    */
@@ -333,7 +333,7 @@ class ShuffleReadMetrics extends Serializable {
   def fetchWaitTime: Long = _fetchWaitTime
   private[spark] def incFetchWaitTime(value: Long) = _fetchWaitTime += value
   private[spark] def decFetchWaitTime(value: Long) = _fetchWaitTime -= value
-  
+
   /**
    * Total number of remote bytes read from the shuffle by this task
    */
@@ -381,7 +381,7 @@ class ShuffleWriteMetrics extends Serializable {
   def shuffleBytesWritten: Long = _shuffleBytesWritten
   private[spark] def incShuffleBytesWritten(value: Long) = _shuffleBytesWritten += value
   private[spark] def decShuffleBytesWritten(value: Long) = _shuffleBytesWritten -= value
-  
+
   /**
    * Time the task spent blocking on writes to disk or buffer cache, in nanoseconds
    */
@@ -389,7 +389,7 @@ class ShuffleWriteMetrics extends Serializable {
   def shuffleWriteTime: Long = _shuffleWriteTime
   private[spark] def incShuffleWriteTime(value: Long) = _shuffleWriteTime += value
   private[spark] def decShuffleWriteTime(value: Long) = _shuffleWriteTime -= value
-  
+
   /**
    * Total number of records written to the shuffle by this task
    */
diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala
index e8b3074e8f1a6..11dfcfe2f04e1 100644
--- a/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala
@@ -26,9 +26,9 @@ import org.apache.spark.SecurityManager
 import org.apache.spark.metrics.MetricsSystem
 
 private[spark] class Slf4jSink(
-    val property: Properties, 
+    val property: Properties,
     val registry: MetricRegistry,
-    securityMgr: SecurityManager) 
+    securityMgr: SecurityManager)
   extends Sink {
   val SLF4J_DEFAULT_PERIOD = 10
   val SLF4J_DEFAULT_UNIT = "SECONDS"
diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/package.scala b/core/src/main/scala/org/apache/spark/metrics/sink/package.scala
index 90e3aa70b99ef..670e683663324 100644
--- a/core/src/main/scala/org/apache/spark/metrics/sink/package.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/sink/package.scala
@@ -20,4 +20,4 @@ package org.apache.spark.metrics
 /**
  * Sinks used in Spark's metrics system.
  */
-package object sink 
+package object sink
diff --git a/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala b/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala
index bbf1b83af0795..ca1eb1f4e4a9a 100644
--- a/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala
@@ -85,9 +85,9 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
             numPartsToTry = partsScanned * 4
           } else {
             // the left side of max is >=1 whenever partsScanned >= 2
-            numPartsToTry = Math.max(1, 
+            numPartsToTry = Math.max(1,
               (1.5 * num * partsScanned / results.size).toInt - partsScanned)
-            numPartsToTry = Math.min(numPartsToTry, partsScanned * 4) 
+            numPartsToTry = Math.min(numPartsToTry, partsScanned * 4)
           }
         }
 
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index 2ab967f4bb313..84456d6d868dc 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -196,7 +196,7 @@ class NewHadoopRDD[K, V](
   override def getPreferredLocations(hsplit: Partition): Seq[String] = {
     val split = hsplit.asInstanceOf[NewHadoopPartition].serializableHadoopSplit.value
     val locs = HadoopRDD.SPLIT_INFO_REFLECTIONS match {
-      case Some(c) => 
+      case Some(c) =>
         try {
           val infos = c.newGetLocationInfo.invoke(split).asInstanceOf[Array[AnyRef]]
           Some(HadoopRDD.convertSplitLocationInfo(infos))
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 004899f27b7a6..cfd3e26faf2b9 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -328,7 +328,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     reduceByKeyLocally(func)
   }
 
-  /** 
+  /**
    * Count the number of elements for each key, collecting the results to a local Map.
    *
    * Note that this method should only be used if the resulting map is expected to be small, as
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala
index 86f357abb8723..c6d957b65f3fb 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala
@@ -41,7 +41,7 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging {
    *
    * @param logData Stream containing event log data.
    * @param sourceName Filename (or other source identifier) from whence @logData is being read
-   * @param maybeTruncated Indicate whether log file might be truncated (some abnormal situations 
+   * @param maybeTruncated Indicate whether log file might be truncated (some abnormal situations
    *        encountered, log file might not finished writing) or not
    */
   def replay(
@@ -62,7 +62,7 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging {
             if (!maybeTruncated || lines.hasNext) {
               throw jpe
             } else {
-              logWarning(s"Got JsonParseException from log file $sourceName" + 
+              logWarning(s"Got JsonParseException from log file $sourceName" +
                 s" at line $lineNumber, the file might not have finished writing cleanly.")
             }
         }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
index 586d1e06204c1..15101c64f0503 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -125,7 +125,7 @@ private[spark] abstract class Task[T](val stageId: Int, var partitionId: Int) ex
     if (interruptThread && taskThread != null) {
       taskThread.interrupt()
     }
-  }  
+  }
 }
 
 /**
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
index d473e51abab80..673cd0e19eba2 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -861,9 +861,9 @@ private[spark] class TaskSetManager(
       case TaskLocality.RACK_LOCAL => "spark.locality.wait.rack"
       case _ => null
     }
-    
+
     if (localityWaitKey != null) {
-      conf.getTimeAsMs(localityWaitKey, defaultWait)  
+      conf.getTimeAsMs(localityWaitKey, defaultWait)
     } else {
       0L
     }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index c5bc6294a5577..fcad959540f5a 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -84,7 +84,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
     override def onStart() {
       // Periodically revive offers to allow delay scheduling to work
       val reviveIntervalMs = conf.getTimeAsMs("spark.scheduler.revive.interval", "1s")
- 
+
       reviveThread.scheduleAtFixedRate(new Runnable {
         override def run(): Unit = Utils.tryLogNonFatalError {
           Option(self).foreach(_.send(ReviveOffers))
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
index 2f2934c249eb0..e79c543a9de27 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
@@ -37,14 +37,14 @@ private[mesos] object MesosSchedulerBackendUtil extends Logging {
           .newBuilder()
           .setMode(Volume.Mode.RW)
         spec match {
-          case Array(container_path) => 
+          case Array(container_path) =>
             Some(vol.setContainerPath(container_path))
           case Array(container_path, "rw") =>
             Some(vol.setContainerPath(container_path))
           case Array(container_path, "ro") =>
             Some(vol.setContainerPath(container_path)
               .setMode(Volume.Mode.RO))
-          case Array(host_path, container_path) => 
+          case Array(host_path, container_path) =>
             Some(vol.setContainerPath(container_path)
               .setHostPath(host_path))
           case Array(host_path, container_path, "rw") =>
diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
index 3f909885dbd66..cd8a82347a1e9 100644
--- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
@@ -52,7 +52,7 @@ class KryoSerializer(conf: SparkConf)
   with Serializable {
 
   private val bufferSizeKb = conf.getSizeAsKb("spark.kryoserializer.buffer", "64k")
-  
+
   if (bufferSizeKb >= ByteUnit.GiB.toKiB(2)) {
     throw new IllegalArgumentException("spark.kryoserializer.buffer must be less than " +
       s"2048 mb, got: + ${ByteUnit.KiB.toMiB(bufferSizeKb)} mb.")
diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala
index 80374adc44296..597d46a3d2223 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala
@@ -80,7 +80,7 @@ private[hash] object BlockStoreShuffleFetcher extends Logging {
       blocksByAddress,
       serializer,
       // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility
-      SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024) 
+      SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024)
     val itr = blockFetcherItr.flatMap(unpackBlock)
 
     val completionIter = CompletionIterator[T, Iterator[T]](itr, {
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneStageResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneStageResource.scala
index fd24aea63a8a1..f9812f06cf527 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/OneStageResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneStageResource.scala
@@ -83,7 +83,7 @@ private[v1] class OneStageResource(ui: SparkUI) {
     withStageAttempt(stageId, stageAttemptId) { stage =>
       val tasks = stage.ui.taskData.values.map{AllStagesResource.convertTaskData}.toIndexedSeq
         .sorted(OneStageResource.ordering(sortBy))
-      tasks.slice(offset, offset + length)  
+      tasks.slice(offset, offset + length)
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
index 3afb4c3c02e2d..2cd8c5297b741 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
@@ -292,16 +292,16 @@ class BlockManagerMasterEndpoint(
       blockManagerIdByExecutor.get(id.executorId) match {
         case Some(oldId) =>
           // A block manager of the same executor already exists, so remove it (assumed dead)
-          logError("Got two different block manager registrations on same executor - " 
+          logError("Got two different block manager registrations on same executor - "
               + s" will replace old one $oldId with new one $id")
-          removeExecutor(id.executorId)  
+          removeExecutor(id.executorId)
         case None =>
       }
       logInfo("Registering block manager %s with %s RAM, %s".format(
         id.hostPort, Utils.bytesToString(maxMemSize), id))
-      
+
       blockManagerIdByExecutor(id.executorId) = id
-      
+
       blockManagerInfo(id) = new BlockManagerInfo(
         id, System.currentTimeMillis(), maxMemSize, slaveEndpoint)
     }
diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
index d441a4d31b954..91ef86389a0c3 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
@@ -151,7 +151,7 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon
     try {
       Utils.removeShutdownHook(shutdownHook)
     } catch {
-      case e: Exception => 
+      case e: Exception =>
         logError(s"Exception while removing shutdown hook.", e)
     }
     doStop()
diff --git a/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala
index fb4ba0eac9d9a..b53c86e89a273 100644
--- a/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala
@@ -100,7 +100,7 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log
     try {
       os.write(bytes.array())
     } catch {
-      case NonFatal(e) => 
+      case NonFatal(e) =>
         logWarning(s"Failed to put bytes of block $blockId into Tachyon", e)
         os.cancel()
     } finally {
@@ -114,7 +114,7 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log
     try {
       blockManager.dataSerializeStream(blockId, os, values)
     } catch {
-      case NonFatal(e) => 
+      case NonFatal(e) =>
         logWarning(s"Failed to put values of block $blockId into Tachyon", e)
         os.cancel()
     } finally {
diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala
index 594df15e9cc85..2c84e4485996e 100644
--- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala
+++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala
@@ -62,12 +62,12 @@ private[spark] abstract class WebUI(
     tab.pages.foreach(attachPage)
     tabs += tab
   }
-  
+
   def detachTab(tab: WebUITab) {
     tab.pages.foreach(detachPage)
     tabs -= tab
   }
-  
+
   def detachPage(page: WebUIPage) {
     pageToHandlers.remove(page).foreach(_.foreach(detachHandler))
   }
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
index 246e191d64776..f39e961772c46 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
@@ -119,7 +119,7 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging {
       "failedStages" -> failedStages.size
     )
   }
-  
+
   // These collections may grow arbitrarily, but once Spark becomes idle they should shrink back to
   // some bound based on the `spark.ui.retainedStages` and `spark.ui.retainedJobs` settings:
   private[spark] def getSizesOfSoftSizeLimitedCollections: Map[String, Int] = {
diff --git a/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala b/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala
index ce7887b76ff96..1861d38640102 100644
--- a/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala
+++ b/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala
@@ -40,7 +40,7 @@ private[spark] abstract class AsynchronousListenerBus[L <: AnyRef, E](name: Stri
   self =>
 
   private var sparkContext: SparkContext = null
-  
+
   /* Cap the capacity of the event queue so we get an explicit error (rather than
    * an OOM exception) if it's perpetually being added to more quickly than it's being drained. */
   private val EVENT_QUEUE_CAPACITY = 10000
diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
index f1f6b5e1f93d8..0180399c9dad5 100644
--- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
+++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
@@ -236,7 +236,7 @@ object SizeEstimator extends Logging {
         val s1 = sampleArray(array, state, rand, drawn, length)
         val s2 = sampleArray(array, state, rand, drawn, length)
         val size = math.min(s1, s2)
-        state.size += math.max(s1, s2) + 
+        state.size += math.max(s1, s2) +
           (size * ((length - ARRAY_SAMPLE_SIZE) / (ARRAY_SAMPLE_SIZE))).toLong
       }
     }
@@ -244,7 +244,7 @@ object SizeEstimator extends Logging {
 
   private def sampleArray(
       array: AnyRef,
-      state: SearchState, 
+      state: SearchState,
       rand: Random,
       drawn: OpenHashSet[Int],
       length: Int): Long = {
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
index df2d6ad3b41a4..1e4531ef395ae 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
@@ -89,9 +89,9 @@ class ExternalAppendOnlyMap[K, V, C](
 
   // Number of bytes spilled in total
   private var _diskBytesSpilled = 0L
-  
+
   // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
-  private val fileBufferSize = 
+  private val fileBufferSize =
     sparkConf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024
 
   // Write metrics for current spill
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
index ef2dbb7ff0ae0..757dec66c203b 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
@@ -117,7 +117,7 @@ private[spark] class ExternalSorter[K, V, C](
   private val serInstance = ser.newInstance()
 
   private val spillingEnabled = conf.getBoolean("spark.shuffle.spill", true)
-  
+
   // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
   private val fileBufferSize = conf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024
 
diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala
index b18067e68f5a1..a8c8c6f73fb5a 100644
--- a/core/src/test/scala/org/apache/spark/FailureSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala
@@ -117,7 +117,7 @@ class FailureSuite extends SparkFunSuite with LocalSparkContext {
       sc.parallelize(1 to 10, 2).map(x => a).count()
     }
     assert(thrown.getClass === classOf[SparkException])
-    assert(thrown.getMessage.contains("NotSerializableException") || 
+    assert(thrown.getMessage.contains("NotSerializableException") ||
       thrown.getCause.getClass === classOf[NotSerializableException])
 
     // Non-serializable closure in an earlier stage
@@ -125,7 +125,7 @@ class FailureSuite extends SparkFunSuite with LocalSparkContext {
       sc.parallelize(1 to 10, 2).map(x => (x, a)).partitionBy(new HashPartitioner(3)).count()
     }
     assert(thrown1.getClass === classOf[SparkException])
-    assert(thrown1.getMessage.contains("NotSerializableException") || 
+    assert(thrown1.getMessage.contains("NotSerializableException") ||
       thrown1.getCause.getClass === classOf[NotSerializableException])
 
     // Non-serializable closure in foreach function
@@ -133,7 +133,7 @@ class FailureSuite extends SparkFunSuite with LocalSparkContext {
       sc.parallelize(1 to 10, 2).foreach(x => println(a))
     }
     assert(thrown2.getClass === classOf[SparkException])
-    assert(thrown2.getMessage.contains("NotSerializableException") || 
+    assert(thrown2.getMessage.contains("NotSerializableException") ||
       thrown2.getCause.getClass === classOf[NotSerializableException])
 
     FailureSuiteState.clear()
diff --git a/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala b/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
index e47173f8a8b03..4399f25626472 100644
--- a/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ImplicitOrderingSuite.scala
@@ -27,11 +27,11 @@ class ImplicitOrderingSuite extends SparkFunSuite with LocalSparkContext {
 
     // These RDD methods are in the companion object so that the unserializable ScalaTest Engine
     // won't be reachable from the closure object
-    
+
     // Infer orderings after basic maps to particular types
     val basicMapExpectations = ImplicitOrderingSuite.basicMapExpectations(rdd)
     basicMapExpectations.map({case (met, explain) => assert(met, explain)})
-    
+
     // Infer orderings for other RDD methods
     val otherRDDMethodExpectations = ImplicitOrderingSuite.otherRDDMethodExpectations(rdd)
     otherRDDMethodExpectations.map({case (met, explain) => assert(met, explain)})
@@ -48,30 +48,30 @@ private object ImplicitOrderingSuite {
   class OrderedClass extends Ordered[OrderedClass] {
     override def compare(o: OrderedClass): Int = throw new UnsupportedOperationException
   }
-  
+
   def basicMapExpectations(rdd: RDD[Int]): List[(Boolean, String)] = {
-    List((rdd.map(x => (x, x)).keyOrdering.isDefined, 
+    List((rdd.map(x => (x, x)).keyOrdering.isDefined,
             "rdd.map(x => (x, x)).keyOrdering.isDefined"),
-          (rdd.map(x => (1, x)).keyOrdering.isDefined, 
+          (rdd.map(x => (1, x)).keyOrdering.isDefined,
             "rdd.map(x => (1, x)).keyOrdering.isDefined"),
-          (rdd.map(x => (x.toString, x)).keyOrdering.isDefined, 
+          (rdd.map(x => (x.toString, x)).keyOrdering.isDefined,
             "rdd.map(x => (x.toString, x)).keyOrdering.isDefined"),
-          (rdd.map(x => (null, x)).keyOrdering.isDefined, 
+          (rdd.map(x => (null, x)).keyOrdering.isDefined,
             "rdd.map(x => (null, x)).keyOrdering.isDefined"),
-          (rdd.map(x => (new NonOrderedClass, x)).keyOrdering.isEmpty, 
+          (rdd.map(x => (new NonOrderedClass, x)).keyOrdering.isEmpty,
             "rdd.map(x => (new NonOrderedClass, x)).keyOrdering.isEmpty"),
-          (rdd.map(x => (new ComparableClass, x)).keyOrdering.isDefined, 
+          (rdd.map(x => (new ComparableClass, x)).keyOrdering.isDefined,
             "rdd.map(x => (new ComparableClass, x)).keyOrdering.isDefined"),
-          (rdd.map(x => (new OrderedClass, x)).keyOrdering.isDefined, 
+          (rdd.map(x => (new OrderedClass, x)).keyOrdering.isDefined,
             "rdd.map(x => (new OrderedClass, x)).keyOrdering.isDefined"))
   }
-  
+
   def otherRDDMethodExpectations(rdd: RDD[Int]): List[(Boolean, String)] = {
-    List((rdd.groupBy(x => x).keyOrdering.isDefined, 
+    List((rdd.groupBy(x => x).keyOrdering.isDefined,
            "rdd.groupBy(x => x).keyOrdering.isDefined"),
-         (rdd.groupBy(x => new NonOrderedClass).keyOrdering.isEmpty, 
+         (rdd.groupBy(x => new NonOrderedClass).keyOrdering.isEmpty,
            "rdd.groupBy(x => new NonOrderedClass).keyOrdering.isEmpty"),
-         (rdd.groupBy(x => new ComparableClass).keyOrdering.isDefined, 
+         (rdd.groupBy(x => new ComparableClass).keyOrdering.isDefined,
            "rdd.groupBy(x => new ComparableClass).keyOrdering.isDefined"),
          (rdd.groupBy(x => new OrderedClass).keyOrdering.isDefined,
            "rdd.groupBy(x => new OrderedClass).keyOrdering.isDefined"),
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
index 93426822f704e..6838b35ab4cc8 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
@@ -71,22 +71,22 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext {
     var sc2: SparkContext = null
     SparkContext.clearActiveContext()
     val conf = new SparkConf().setAppName("test").setMaster("local")
-    
+
     sc = SparkContext.getOrCreate(conf)
-    
+
     assert(sc.getConf.get("spark.app.name").equals("test"))
     sc2 = SparkContext.getOrCreate(new SparkConf().setAppName("test2").setMaster("local"))
     assert(sc2.getConf.get("spark.app.name").equals("test"))
     assert(sc === sc2)
     assert(sc eq sc2)
-    
+
     // Try creating second context to confirm that it's still possible, if desired
     sc2 = new SparkContext(new SparkConf().setAppName("test3").setMaster("local")
         .set("spark.driver.allowMultipleContexts", "true"))
-    
+
     sc2.stop()
   }
-  
+
   test("BytesWritable implicit conversion is correct") {
     // Regression test for SPARK-3121
     val bytesWritable = new BytesWritable()
diff --git a/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
index a8466ed8c1dc2..08215a2bafc09 100644
--- a/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
@@ -82,7 +82,7 @@ class JdbcRDDSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkCont
     assert(rdd.count === 100)
     assert(rdd.reduce(_ + _) === 10100)
   }
-  
+
   test("large id overflow") {
     sc = new SparkContext("local", "test")
     val rdd = new JdbcRDD(
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
index d565132a06789..e72285d03d3ee 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
@@ -28,11 +28,11 @@ class MemoryUtilsSuite extends SparkFunSuite with MockitoSugar {
 
     val sc = mock[SparkContext]
     when(sc.conf).thenReturn(sparkConf)
-    
+
     // 384 > sc.executorMemory * 0.1 => 512 + 384 = 896
     when(sc.executorMemory).thenReturn(512)
     assert(MemoryUtils.calculateTotalMemory(sc) === 896)
-    
+
     // 384 < sc.executorMemory * 0.1 => 4096 + (4096 * 0.1) = 4505.6
     when(sc.executorMemory).thenReturn(4096)
     assert(MemoryUtils.calculateTotalMemory(sc) === 4505)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
index 6f4ff0814b8da..68df46a41ddc8 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
@@ -79,11 +79,11 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi
       .set("spark.mesos.executor.docker.image", "spark/mock")
       .set("spark.mesos.executor.docker.volumes", "/a,/b:/b,/c:/c:rw,/d:ro,/e:/e:ro")
       .set("spark.mesos.executor.docker.portmaps", "80:8080,53:53:tcp")
-     
+
     val listenerBus = mock[LiveListenerBus]
     listenerBus.post(
       SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
-                         
+
     val sc = mock[SparkContext]
     when(sc.executorMemory).thenReturn(100)
     when(sc.getSparkHome()).thenReturn(Option("/spark-home"))
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
index c32fe232cc27c..23a1fdb0f5009 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
@@ -36,7 +36,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext {
   test("SPARK-7392 configuration limits") {
     val kryoBufferProperty = "spark.kryoserializer.buffer"
     val kryoBufferMaxProperty = "spark.kryoserializer.buffer.max"
-    
+
     def newKryoInstance(
         conf: SparkConf,
         bufferSize: String = "64k",
@@ -46,7 +46,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext {
       kryoConf.set(kryoBufferMaxProperty, maxBufferSize)
       new KryoSerializer(kryoConf).newInstance()
     }
-    
+
     // test default values
     newKryoInstance(conf, "64k", "64m")
     // 2048m = 2097152k
@@ -69,7 +69,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext {
     // test configuration with mb is supported properly
     newKryoInstance(conf, "8m", "9m")
   }
-  
+
   test("basic types") {
     val ser = new KryoSerializer(conf).newInstance()
     def check[T: ClassTag](t: T) {
diff --git a/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala b/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
index 77d66864f755e..c657414e9e5c3 100644
--- a/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/ProactiveClosureSerializationSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.rdd.RDD
 /* A trivial (but unserializable) container for trivial functions */
 class UnserializableClass {
   def op[T](x: T): String = x.toString
-  
+
   def pred[T](x: T): Boolean = x.toString.length % 2 == 0
 }
 
@@ -45,7 +45,7 @@ class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkC
   // iterating over a map from transformation names to functions that perform that
   // transformation on a given RDD, creating one test case for each
 
-  for (transformation <- 
+  for (transformation <-
       Map("map" -> xmap _,
           "flatMap" -> xflatMap _,
           "filter" -> xfilter _,
@@ -58,24 +58,24 @@ class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkC
       val ex = intercept[SparkException] {
         xf(data, uc)
       }
-      assert(ex.getMessage.contains("Task not serializable"), 
+      assert(ex.getMessage.contains("Task not serializable"),
         s"RDD.$name doesn't proactively throw NotSerializableException")
     }
   }
 
-  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = 
+  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
     x.map(y => uc.op(y))
 
-  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = 
+  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
     x.flatMap(y => Seq(uc.op(y)))
 
-  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = 
+  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
     x.filter(y => uc.pred(y))
 
-  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = 
+  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
     x.mapPartitions(_.map(y => uc.op(y)))
 
-  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = 
+  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
     x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
-  
+
 }
diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
index a97a842f434fb..70cd27b04347d 100644
--- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
@@ -201,7 +201,7 @@ object TestObjectWithNestedReturns {
   def run(): Int = {
     withSpark(new SparkContext("local", "test")) { sc =>
       val nums = sc.parallelize(Array(1, 2, 3, 4))
-      nums.map {x => 
+      nums.map {x =>
         // this return is fine since it will not transfer control outside the closure
         def foo(): Int = { return 5; 1 }
         foo()
diff --git a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
index 2f1e6a39f4554..d6af0aebde733 100644
--- a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
@@ -78,7 +78,7 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers {
   }
 
   // Returns iterator over gap lengths between samples.
-  // This function assumes input data is integers sampled from the sequence of 
+  // This function assumes input data is integers sampled from the sequence of
   // increasing integers: {0, 1, 2, ...}.  This works because that is how I generate them,
   // and the samplers preserve their input order
   def gaps(data: Iterator[Int]): Iterator[Int] = {

From 564bc11e9827915c8652bc06f4bd591809dea4b1 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 00:47:56 -0700
Subject: [PATCH 282/525] [SPARK-3850] Trim trailing spaces for
 examples/streaming/yarn.

Author: Reynold Xin <rxin@databricks.com>

Closes #6530 from rxin/trim-whitespace-1 and squashes the following commits:

7b7b3a0 [Reynold Xin] Reset again.
dc14597 [Reynold Xin] Reset scalastyle.
cd556c4 [Reynold Xin] YARN, Kinesis, Flume.
4223fe1 [Reynold Xin] [SPARK-3850] Trim trailing spaces for examples/streaming.
---
 .../org/apache/spark/examples/LogQuery.scala  |  2 +-
 .../examples/mllib/DenseGaussianMixture.scala | 10 +++----
 .../examples/streaming/MQTTWordCount.scala    | 10 +++----
 .../streaming/flume/FlumeInputDStream.scala   | 10 +++----
 .../flume/FlumePollingInputDStream.scala      |  2 +-
 .../streaming/flume/FlumeStreamSuite.scala    |  2 +-
 .../spark/streaming/kafka/KafkaCluster.scala  |  2 +-
 .../spark/streaming/kafka/KafkaUtils.scala    |  8 +++---
 .../streaming/KinesisWordCountASL.scala       | 12 ++++----
 .../kinesis/KinesisCheckpointState.scala      |  8 +++---
 .../streaming/kinesis/KinesisReceiver.scala   |  8 +++---
 .../kinesis/KinesisRecordProcessor.scala      | 28 +++++++++----------
 .../org/apache/spark/graphx/EdgeSuite.scala   | 10 +++----
 .../streaming/receiver/BlockGenerator.scala   |  2 +-
 .../receiver/ReceivedBlockHandler.scala       |  2 +-
 .../spark/streaming/UISeleniumSuite.scala     |  4 ---
 .../streaming/util/WriteAheadLogSuite.scala   |  4 +--
 .../yarn/ClientDistributedCacheManager.scala  | 26 ++++++++---------
 .../deploy/yarn/YarnSparkHadoopUtil.scala     |  4 +--
 .../ClientDistributedCacheManagerSuite.scala  | 24 ++++++++--------
 20 files changed, 87 insertions(+), 91 deletions(-)

diff --git a/examples/src/main/scala/org/apache/spark/examples/LogQuery.scala b/examples/src/main/scala/org/apache/spark/examples/LogQuery.scala
index 32e02eab8b031..75c82117cbad2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/LogQuery.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/LogQuery.scala
@@ -22,7 +22,7 @@ import org.apache.spark.SparkContext._
 
 /**
  * Executes a roll up-style query against Apache logs.
- *  
+ *
  * Usage: LogQuery [logFile]
  */
 object LogQuery {
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
index 9a1aab036aa0f..f8c71ccabc43b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
@@ -41,22 +41,22 @@ object DenseGaussianMixture {
   private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) {
     val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example")
     val ctx = new SparkContext(conf)
-    
+
     val data = ctx.textFile(inputFile).map { line =>
       Vectors.dense(line.trim.split(' ').map(_.toDouble))
     }.cache()
-      
+
     val clusters = new GaussianMixture()
       .setK(k)
       .setConvergenceTol(convergenceTol)
       .setMaxIterations(maxIterations)
       .run(data)
-    
+
     for (i <- 0 until clusters.k) {
-      println("weight=%f\nmu=%s\nsigma=\n%s\n" format 
+      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
         (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma))
     }
-    
+
     println("Cluster labels (first <= 100):")
     val clusterLabels = clusters.predict(data)
     clusterLabels.take(100).foreach { x =>
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
index b336751d81616..813c8554f5193 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
@@ -40,7 +40,7 @@ object MQTTPublisher {
     StreamingExamples.setStreamingLogLevels()
 
     val Seq(brokerUrl, topic) = args.toSeq
-    
+
     var client: MqttClient = null
 
     try {
@@ -59,10 +59,10 @@ object MQTTPublisher {
           println(s"Published data. topic: ${msgtopic.getName()}; Message: $message")
         } catch {
           case e: MqttException if e.getReasonCode == MqttException.REASON_CODE_MAX_INFLIGHT =>
-            Thread.sleep(10) 
+            Thread.sleep(10)
             println("Queue is full, wait for to consume data from the message queue")
-        }  
-      }      
+        }
+      }
     } catch {
       case e: MqttException => println("Exception Caught: " + e)
     } finally {
@@ -107,7 +107,7 @@ object MQTTWordCount {
     val lines = MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2)
     val words = lines.flatMap(x => x.split(" "))
     val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
-    
+
     wordCounts.print()
     ssc.start()
     ssc.awaitTermination()
diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeInputDStream.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeInputDStream.scala
index 60e2994431b38..1e32a365a1eee 100644
--- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeInputDStream.scala
+++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeInputDStream.scala
@@ -152,9 +152,9 @@ class FlumeReceiver(
       val channelFactory = new NioServerSocketChannelFactory(Executors.newCachedThreadPool(),
                                                              Executors.newCachedThreadPool())
       val channelPipelineFactory = new CompressionChannelPipelineFactory()
-      
+
       new NettyServer(
-        responder, 
+        responder,
         new InetSocketAddress(host, port),
         channelFactory,
         channelPipelineFactory,
@@ -188,12 +188,12 @@ class FlumeReceiver(
 
   override def preferredLocation: Option[String] = Option(host)
 
-  /** A Netty Pipeline factory that will decompress incoming data from 
+  /** A Netty Pipeline factory that will decompress incoming data from
     * and the Netty client and compress data going back to the client.
     *
     * The compression on the return is required because Flume requires
-    * a successful response to indicate it can remove the event/batch 
-    * from the configured channel 
+    * a successful response to indicate it can remove the event/batch
+    * from the configured channel
     */
   private[streaming]
   class CompressionChannelPipelineFactory extends ChannelPipelineFactory {
diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala
index 92fa5b41be89e..583e7dca317ad 100644
--- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala
+++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala
@@ -110,7 +110,7 @@ private[streaming] class FlumePollingReceiver(
 }
 
 /**
- * A wrapper around the transceiver and the Avro IPC API. 
+ * A wrapper around the transceiver and the Avro IPC API.
  * @param transceiver The transceiver to use for communication with Flume
  * @param client The client that the callbacks are received on.
  */
diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
index 3d9daeb6e4363..c926359987d89 100644
--- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
+++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
@@ -138,7 +138,7 @@ class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers w
       val status = client.appendBatch(inputEvents.toList)
       status should be (avro.Status.OK)
     }
-    
+
     eventually(timeout(10 seconds), interval(100 milliseconds)) {
       val outputEvents = outputBuffer.flatten.map { _.event }
       outputEvents.foreach {
diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
index 6cf254a7b69cb..65d51d87f8486 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
@@ -113,7 +113,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
       r.flatMap { tm: TopicMetadata =>
         tm.partitionsMetadata.map { pm: PartitionMetadata =>
           TopicAndPartition(tm.topic, pm.partitionId)
-        }    
+        }
       }
     }
   }
diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
index 8be2707528d93..0b8a391a2c569 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
@@ -315,7 +315,7 @@ object KafkaUtils {
    * Points to note:
    *  - No receivers: This stream does not use any receiver. It directly queries Kafka
    *  - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked
-   *    by the stream itself. For interoperability with Kafka monitoring tools that depend on 
+   *    by the stream itself. For interoperability with Kafka monitoring tools that depend on
    *    Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
    *    You can access the offsets used in each batch from the generated RDDs (see
    *    [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
@@ -363,7 +363,7 @@ object KafkaUtils {
    * Points to note:
    *  - No receivers: This stream does not use any receiver. It directly queries Kafka
    *  - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked
-   *    by the stream itself. For interoperability with Kafka monitoring tools that depend on 
+   *    by the stream itself. For interoperability with Kafka monitoring tools that depend on
    *    Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
    *    You can access the offsets used in each batch from the generated RDDs (see
    *    [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
@@ -427,7 +427,7 @@ object KafkaUtils {
    * Points to note:
    *  - No receivers: This stream does not use any receiver. It directly queries Kafka
    *  - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked
-   *    by the stream itself. For interoperability with Kafka monitoring tools that depend on 
+   *    by the stream itself. For interoperability with Kafka monitoring tools that depend on
    *    Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
    *    You can access the offsets used in each batch from the generated RDDs (see
    *    [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
@@ -489,7 +489,7 @@ object KafkaUtils {
    * Points to note:
    *  - No receivers: This stream does not use any receiver. It directly queries Kafka
    *  - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked
-   *    by the stream itself. For interoperability with Kafka monitoring tools that depend on 
+   *    by the stream itself. For interoperability with Kafka monitoring tools that depend on
    *    Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
    *    You can access the offsets used in each batch from the generated RDDs (see
    *    [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
index 97c3476049289..be8b62d3cc6ba 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
@@ -119,7 +119,7 @@ object KinesisWordCountASL extends Logging {
     val batchInterval = Milliseconds(2000)
 
     // Kinesis checkpoint interval is the interval at which the DynamoDB is updated with information
-    // on sequence number of records that have been received. Same as batchInterval for this 
+    // on sequence number of records that have been received. Same as batchInterval for this
     // example.
     val kinesisCheckpointInterval = batchInterval
 
@@ -145,7 +145,7 @@ object KinesisWordCountASL extends Logging {
 
     // Map each word to a (word, 1) tuple so we can reduce by key to count the words
     val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
- 
+
     // Print the first 10 wordCounts
     wordCounts.print()
 
@@ -210,14 +210,14 @@ object KinesisWordProducerASL {
 
     val randomWords = List("spark", "you", "are", "my", "father")
     val totals = scala.collection.mutable.Map[String, Int]()
-  
+
     // Create the low-level Kinesis Client from the AWS Java SDK.
     val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain())
     kinesisClient.setEndpoint(endpoint)
 
     println(s"Putting records onto stream $stream and endpoint $endpoint at a rate of" +
         s" $recordsPerSecond records per second and $wordsPerRecord words per record")
-  
+
     // Iterate and put records onto the stream per the given recordPerSec and wordsPerRecord
     for (i <- 1 to 10) {
       // Generate recordsPerSec records to put onto the stream
@@ -255,8 +255,8 @@ object KinesisWordProducerASL {
   }
 }
 
-/** 
- *  Utility functions for Spark Streaming examples. 
+/**
+ *  Utility functions for Spark Streaming examples.
  *  This has been lifted from the examples/ project to remove the circular dependency.
  */
 private[streaming] object StreamingExamples extends Logging {
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala
index 1c9b0c218ae18..83a4537559512 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala
@@ -23,20 +23,20 @@ import org.apache.spark.util.{Clock, ManualClock, SystemClock}
 /**
  * This is a helper class for managing checkpoint clocks.
  *
- * @param checkpointInterval 
+ * @param checkpointInterval
  * @param currentClock.  Default to current SystemClock if none is passed in (mocking purposes)
  */
 private[kinesis] class KinesisCheckpointState(
-    checkpointInterval: Duration, 
+    checkpointInterval: Duration,
     currentClock: Clock = new SystemClock())
   extends Logging {
-  
+
   /* Initialize the checkpoint clock using the given currentClock + checkpointInterval millis */
   val checkpointClock = new ManualClock()
   checkpointClock.setTime(currentClock.getTimeMillis() + checkpointInterval.milliseconds)
 
   /**
-   * Check if it's time to checkpoint based on the current time and the derived time 
+   * Check if it's time to checkpoint based on the current time and the derived time
    *   for the next checkpoint
    *
    * @return true if it's time to checkpoint
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
index 7dd8bfdc2a6db..1a8a4cecc1141 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
@@ -44,12 +44,12 @@ case class SerializableAWSCredentials(accessKeyId: String, secretKey: String)
  * https://github.com/awslabs/amazon-kinesis-client
  * This is a custom receiver used with StreamingContext.receiverStream(Receiver) as described here:
  *   http://spark.apache.org/docs/latest/streaming-custom-receivers.html
- * Instances of this class will get shipped to the Spark Streaming Workers to run within a 
+ * Instances of this class will get shipped to the Spark Streaming Workers to run within a
  *   Spark Executor.
  *
  * @param appName  Kinesis application name. Kinesis Apps are mapped to Kinesis Streams
  *                 by the Kinesis Client Library.  If you change the App name or Stream name,
- *                 the KCL will throw errors.  This usually requires deleting the backing  
+ *                 the KCL will throw errors.  This usually requires deleting the backing
  *                 DynamoDB table with the same name this Kinesis application.
  * @param streamName   Kinesis stream name
  * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
@@ -87,7 +87,7 @@ private[kinesis] class KinesisReceiver(
    */
 
   /**
-   * workerId is used by the KCL should be based on the ip address of the actual Spark Worker 
+   * workerId is used by the KCL should be based on the ip address of the actual Spark Worker
    * where this code runs (not the driver's IP address.)
    */
   private var workerId: String = null
@@ -121,7 +121,7 @@ private[kinesis] class KinesisReceiver(
 
    /*
     *  RecordProcessorFactory creates impls of IRecordProcessor.
-    *  IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the 
+    *  IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the
     *  IRecordProcessor.processRecords() method.
     *  We're using our custom KinesisRecordProcessor in this case.
     */
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala
index f65e743c4e2a3..fe9e3a0c793e2 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala
@@ -35,9 +35,9 @@ import com.amazonaws.services.kinesis.model.Record
 /**
  * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor.
  * This implementation operates on the Array[Byte] from the KinesisReceiver.
- * The Kinesis Worker creates an instance of this KinesisRecordProcessor for each 
- *   shard in the Kinesis stream upon startup.  This is normally done in separate threads, 
- *   but the KCLs within the KinesisReceivers will balance themselves out if you create 
+ * The Kinesis Worker creates an instance of this KinesisRecordProcessor for each
+ *   shard in the Kinesis stream upon startup.  This is normally done in separate threads,
+ *   but the KCLs within the KinesisReceivers will balance themselves out if you create
  *   multiple Receivers.
  *
  * @param receiver Kinesis receiver
@@ -69,14 +69,14 @@ private[kinesis] class KinesisRecordProcessor(
    * and Spark Streaming's Receiver.store().
    *
    * @param batch list of records from the Kinesis stream shard
-   * @param checkpointer used to update Kinesis when this batch has been processed/stored 
+   * @param checkpointer used to update Kinesis when this batch has been processed/stored
    *   in the DStream
    */
   override def processRecords(batch: List[Record], checkpointer: IRecordProcessorCheckpointer) {
     if (!receiver.isStopped()) {
       try {
         /*
-         * Notes:  
+         * Notes:
          * 1) If we try to store the raw ByteBuffer from record.getData(), the Spark Streaming
          *    Receiver.store(ByteBuffer) attempts to deserialize the ByteBuffer using the
          *    internally-configured Spark serializer (kryo, etc).
@@ -84,19 +84,19 @@ private[kinesis] class KinesisRecordProcessor(
          *    ourselves from Spark's internal serialization strategy.
          * 3) For performance, the BlockGenerator is asynchronously queuing elements within its
          *    memory before creating blocks.  This prevents the small block scenario, but requires
-         *    that you register callbacks to know when a block has been generated and stored 
+         *    that you register callbacks to know when a block has been generated and stored
          *    (WAL is sufficient for storage) before can checkpoint back to the source.
         */
         batch.foreach(record => receiver.store(record.getData().array()))
-        
+
         logDebug(s"Stored:  Worker $workerId stored ${batch.size} records for shardId $shardId")
 
         /*
-         * Checkpoint the sequence number of the last record successfully processed/stored 
+         * Checkpoint the sequence number of the last record successfully processed/stored
          *   in the batch.
          * In this implementation, we're checkpointing after the given checkpointIntervalMillis.
-         * Note that this logic requires that processRecords() be called AND that it's time to 
-         *   checkpoint.  I point this out because there is no background thread running the 
+         * Note that this logic requires that processRecords() be called AND that it's time to
+         *   checkpoint.  I point this out because there is no background thread running the
          *   checkpointer.  Checkpointing is tested and trigger only when a new batch comes in.
          * If the worker is shutdown cleanly, checkpoint will happen (see shutdown() below).
          * However, if the worker dies unexpectedly, a checkpoint may not happen.
@@ -130,16 +130,16 @@ private[kinesis] class KinesisRecordProcessor(
       }
     } else {
       /* RecordProcessor has been stopped. */
-      logInfo(s"Stopped:  The Spark KinesisReceiver has stopped for workerId $workerId" + 
+      logInfo(s"Stopped:  The Spark KinesisReceiver has stopped for workerId $workerId" +
           s" and shardId $shardId.  No more records will be processed.")
     }
   }
 
   /**
    * Kinesis Client Library is shutting down this Worker for 1 of 2 reasons:
-   * 1) the stream is resharding by splitting or merging adjacent shards 
+   * 1) the stream is resharding by splitting or merging adjacent shards
    *     (ShutdownReason.TERMINATE)
-   * 2) the failed or latent Worker has stopped sending heartbeats for whatever reason 
+   * 2) the failed or latent Worker has stopped sending heartbeats for whatever reason
    *     (ShutdownReason.ZOMBIE)
    *
    * @param checkpointer used to perform a Kinesis checkpoint for ShutdownReason.TERMINATE
@@ -153,7 +153,7 @@ private[kinesis] class KinesisRecordProcessor(
        * Checkpoint to indicate that all records from the shard have been drained and processed.
        * It's now OK to read from the new shards that resulted from a resharding event.
        */
-      case ShutdownReason.TERMINATE => 
+      case ShutdownReason.TERMINATE =>
         KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100)
 
       /*
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/EdgeSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/EdgeSuite.scala
index 7629128010193..094a63472eaab 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/EdgeSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/EdgeSuite.scala
@@ -23,15 +23,15 @@ class EdgeSuite extends SparkFunSuite {
   test ("compare") {
     // decending order
     val testEdges: Array[Edge[Int]] = Array(
-      Edge(0x7FEDCBA987654321L, -0x7FEDCBA987654321L, 1), 
-      Edge(0x2345L, 0x1234L, 1), 
-      Edge(0x1234L, 0x5678L, 1), 
-      Edge(0x1234L, 0x2345L, 1), 
+      Edge(0x7FEDCBA987654321L, -0x7FEDCBA987654321L, 1),
+      Edge(0x2345L, 0x1234L, 1),
+      Edge(0x1234L, 0x5678L, 1),
+      Edge(0x1234L, 0x2345L, 1),
       Edge(-0x7FEDCBA987654321L, 0x7FEDCBA987654321L, 1)
     )
     // to ascending order
     val sortedEdges = testEdges.sorted(Edge.lexicographicOrdering[Int])
-    
+
     for (i <- 0 until testEdges.length) {
       assert(sortedEdges(i) == testEdges(testEdges.length - i - 1))
     }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
index 0588517a2de39..8d73593ab6375 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
@@ -191,7 +191,7 @@ private[streaming] class BlockGenerator(
     logError(message, t)
     listener.onError(message, t)
   }
-  
+
   private def pushBlock(block: Block) {
     listener.onPushBlock(block.id, block.buffer)
     logInfo("Pushed block " + block.id)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala
index 651b534ac1900..207d64d9414ee 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala
@@ -62,7 +62,7 @@ private[streaming] case class BlockManagerBasedStoreResult(blockId: StreamBlockI
 private[streaming] class BlockManagerBasedBlockHandler(
     blockManager: BlockManager, storageLevel: StorageLevel)
   extends ReceivedBlockHandler with Logging {
-  
+
   def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {
     val putResult: Seq[(BlockId, BlockStatus)] = block match {
       case ArrayBufferBlock(arrayBuffer) =>
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
index 021d2c95a4aad..cbc24aee4fa1e 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
@@ -28,9 +28,6 @@ import org.scalatest.time.SpanSugar._
 
 import org.apache.spark._
 
-
-
-
 /**
  * Selenium tests for the Spark Web UI.
  */
@@ -197,4 +194,3 @@ class UISeleniumSuite
     }
   }
 }
-  
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala
index 0acf7068ef4a4..325ff7c74c39d 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala
@@ -36,7 +36,7 @@ import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
 class WriteAheadLogSuite extends SparkFunSuite with BeforeAndAfter {
 
   import WriteAheadLogSuite._
-  
+
   val hadoopConf = new Configuration()
   var tempDir: File = null
   var testDir: String = null
@@ -359,7 +359,7 @@ object WriteAheadLogSuite {
     ): FileBasedWriteAheadLog = {
     if (manualClock.getTimeMillis() < 100000) manualClock.setTime(10000)
     val wal = new FileBasedWriteAheadLog(new SparkConf(), logDirectory, hadoopConf, 1, 1)
-    
+
     // Ensure that 500 does not get sorted after 2000, so put a high base value.
     data.foreach { item =>
       manualClock.advance(500)
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
index 4ca6c903fcf12..3d3a966960e9f 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
@@ -43,22 +43,22 @@ private[spark] class ClientDistributedCacheManager() extends Logging {
    * Add a resource to the list of distributed cache resources. This list can
    * be sent to the ApplicationMaster and possibly the executors so that it can
    * be downloaded into the Hadoop distributed cache for use by this application.
-   * Adds the LocalResource to the localResources HashMap passed in and saves 
+   * Adds the LocalResource to the localResources HashMap passed in and saves
    * the stats of the resources to they can be sent to the executors and verified.
    *
    * @param fs FileSystem
    * @param conf Configuration
    * @param destPath path to the resource
    * @param localResources localResource hashMap to insert the resource into
-   * @param resourceType LocalResourceType 
+   * @param resourceType LocalResourceType
    * @param link link presented in the distributed cache to the destination
-   * @param statCache cache to store the file/directory stats 
+   * @param statCache cache to store the file/directory stats
    * @param appMasterOnly Whether to only add the resource to the app master
    */
   def addResource(
       fs: FileSystem,
       conf: Configuration,
-      destPath: Path, 
+      destPath: Path,
       localResources: HashMap[String, LocalResource],
       resourceType: LocalResourceType,
       link: String,
@@ -74,15 +74,15 @@ private[spark] class ClientDistributedCacheManager() extends Logging {
     amJarRsrc.setSize(destStatus.getLen())
     if (link == null || link.isEmpty()) throw new Exception("You must specify a valid link name")
     localResources(link) = amJarRsrc
-    
+
     if (!appMasterOnly) {
       val uri = destPath.toUri()
       val pathURI = new URI(uri.getScheme(), uri.getAuthority(), uri.getPath(), null, link)
       if (resourceType == LocalResourceType.FILE) {
-        distCacheFiles(pathURI.toString()) = (destStatus.getLen().toString(), 
+        distCacheFiles(pathURI.toString()) = (destStatus.getLen().toString(),
           destStatus.getModificationTime().toString(), visibility.name())
       } else {
-        distCacheArchives(pathURI.toString()) = (destStatus.getLen().toString(), 
+        distCacheArchives(pathURI.toString()) = (destStatus.getLen().toString(),
           destStatus.getModificationTime().toString(), visibility.name())
       }
     }
@@ -96,11 +96,11 @@ private[spark] class ClientDistributedCacheManager() extends Logging {
     val (sizes, timeStamps, visibilities) = tupleValues.unzip3
     if (keys.size > 0) {
       env("SPARK_YARN_CACHE_FILES") = keys.reduceLeft[String] { (acc, n) => acc + "," + n }
-      env("SPARK_YARN_CACHE_FILES_TIME_STAMPS") = 
+      env("SPARK_YARN_CACHE_FILES_TIME_STAMPS") =
         timeStamps.reduceLeft[String] { (acc, n) => acc + "," + n }
-      env("SPARK_YARN_CACHE_FILES_FILE_SIZES") = 
+      env("SPARK_YARN_CACHE_FILES_FILE_SIZES") =
         sizes.reduceLeft[String] { (acc, n) => acc + "," + n }
-      env("SPARK_YARN_CACHE_FILES_VISIBILITIES") = 
+      env("SPARK_YARN_CACHE_FILES_VISIBILITIES") =
         visibilities.reduceLeft[String] { (acc, n) => acc + "," + n }
     }
   }
@@ -113,11 +113,11 @@ private[spark] class ClientDistributedCacheManager() extends Logging {
     val (sizes, timeStamps, visibilities) = tupleValues.unzip3
     if (keys.size > 0) {
       env("SPARK_YARN_CACHE_ARCHIVES") = keys.reduceLeft[String] { (acc, n) => acc + "," + n }
-      env("SPARK_YARN_CACHE_ARCHIVES_TIME_STAMPS") = 
+      env("SPARK_YARN_CACHE_ARCHIVES_TIME_STAMPS") =
         timeStamps.reduceLeft[String] { (acc, n) => acc + "," + n }
       env("SPARK_YARN_CACHE_ARCHIVES_FILE_SIZES") =
         sizes.reduceLeft[String] { (acc, n) => acc + "," + n }
-      env("SPARK_YARN_CACHE_ARCHIVES_VISIBILITIES") = 
+      env("SPARK_YARN_CACHE_ARCHIVES_VISIBILITIES") =
         visibilities.reduceLeft[String] { (acc, n) => acc + "," + n }
     }
   }
@@ -197,7 +197,7 @@ private[spark] class ClientDistributedCacheManager() extends Logging {
   def getFileStatus(fs: FileSystem, uri: URI, statCache: Map[URI, FileStatus]): FileStatus = {
     val stat = statCache.get(uri) match {
       case Some(existstat) => existstat
-      case None => 
+      case None =>
         val newStat = fs.getFileStatus(new Path(uri))
         statCache.put(uri, newStat)
         newStat
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
index 5e6531895c7ba..68d01c17ef720 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
@@ -144,9 +144,9 @@ class YarnSparkHadoopUtil extends SparkHadoopUtil {
 }
 
 object YarnSparkHadoopUtil {
-  // Additional memory overhead 
+  // Additional memory overhead
   // 10% was arrived at experimentally. In the interest of minimizing memory waste while covering
-  // the common cases. Memory overhead tends to grow with container size. 
+  // the common cases. Memory overhead tends to grow with container size.
 
   val MEMORY_OVERHEAD_FACTOR = 0.10
   val MEMORY_OVERHEAD_MIN = 384
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
index 43a7334db874c..804dfecde7867 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
@@ -41,12 +41,12 @@ import org.apache.spark.SparkFunSuite
 class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar {
 
   class MockClientDistributedCacheManager extends ClientDistributedCacheManager {
-    override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]): 
+    override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]):
         LocalResourceVisibility = {
       LocalResourceVisibility.PRIVATE
     }
   }
-  
+
   test("test getFileStatus empty") {
     val distMgr = new ClientDistributedCacheManager()
     val fs = mock[FileSystem]
@@ -61,7 +61,7 @@ class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar
     val distMgr = new ClientDistributedCacheManager()
     val fs = mock[FileSystem]
     val uri = new URI("/tmp/testing")
-    val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner", 
+    val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner",
       null, new Path("/tmp/testing"))
     when(fs.getFileStatus(new Path(uri))).thenReturn(new FileStatus())
     val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus](uri -> realFileStatus)
@@ -78,7 +78,7 @@ class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar
     val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
     when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
 
-    distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE, "link", 
+    distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE, "link",
       statCache, false)
     val resource = localResources("link")
     assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)
@@ -101,11 +101,11 @@ class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar
     assert(env.get("SPARK_YARN_CACHE_ARCHIVES_VISIBILITIES") === None)
 
     // add another one and verify both there and order correct
-    val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner", 
+    val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
       null, new Path("/tmp/testing2"))
     val destPath2 = new Path("file:///foo.invalid.com:8080/tmp/testing2")
     when(fs.getFileStatus(destPath2)).thenReturn(realFileStatus)
-    distMgr.addResource(fs, conf, destPath2, localResources, LocalResourceType.FILE, "link2", 
+    distMgr.addResource(fs, conf, destPath2, localResources, LocalResourceType.FILE, "link2",
       statCache, false)
     val resource2 = localResources("link2")
     assert(resource2.getVisibility() === LocalResourceVisibility.PRIVATE)
@@ -117,7 +117,7 @@ class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar
     val env2 = new HashMap[String, String]()
     distMgr.setDistFilesEnv(env2)
     val timestamps = env2("SPARK_YARN_CACHE_FILES_TIME_STAMPS").split(',')
-    val files = env2("SPARK_YARN_CACHE_FILES").split(',') 
+    val files = env2("SPARK_YARN_CACHE_FILES").split(',')
     val sizes = env2("SPARK_YARN_CACHE_FILES_FILE_SIZES").split(',')
     val visibilities = env2("SPARK_YARN_CACHE_FILES_VISIBILITIES") .split(',')
     assert(files(0) === "file:/foo.invalid.com:8080/tmp/testing#link")
@@ -141,7 +141,7 @@ class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar
     when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
 
     intercept[Exception] {
-      distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE, null, 
+      distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE, null,
         statCache, false)
     }
     assert(localResources.get("link") === None)
@@ -155,11 +155,11 @@ class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar
     val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
     val localResources = HashMap[String, LocalResource]()
     val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
-    val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner", 
+    val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
       null, new Path("/tmp/testing"))
     when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)
 
-    distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link", 
+    distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link",
       statCache, true)
     val resource = localResources("link")
     assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)
@@ -189,11 +189,11 @@ class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar
     val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
     val localResources = HashMap[String, LocalResource]()
     val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
-    val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner", 
+    val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
       null, new Path("/tmp/testing"))
     when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)
 
-    distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link", 
+    distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link",
       statCache, false)
     val resource = localResources("link")
     assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)

From 63a50be13d32b9e5f3aad8d1a6ba5362f17a252f Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 00:48:49 -0700
Subject: [PATCH 283/525] [SPARK-3850] Trim trailing spaces for SQL.

Author: Reynold Xin <rxin@databricks.com>

Closes #6535 from rxin/whitespace-sql and squashes the following commits:

de50316 [Reynold Xin] [SPARK-3850] Trim trailing spaces for SQL.
---
 .../sql/catalyst/CatalystTypeConverters.scala  |  2 +-
 .../catalyst/analysis/HiveTypeCoercion.scala   |  4 ++--
 .../sql/catalyst/expressions/SortOrder.scala   |  2 +-
 .../sql/catalyst/expressions/aggregates.scala  | 16 ++++++++--------
 .../sql/catalyst/expressions/arithmetic.scala  |  2 +-
 .../catalyst/expressions/complexTypes.scala    |  2 +-
 .../expressions/mathfuncs/binary.scala         |  2 +-
 .../sql/catalyst/expressions/random.scala      |  2 +-
 .../expressions/stringOperations.scala         |  6 +++---
 .../apache/spark/sql/types/StructType.scala    |  2 +-
 .../ExpressionEvaluationSuite.scala            |  4 ++--
 .../optimizer/CombiningLimitsSuite.scala       |  4 ++--
 .../optimizer/ConstantFoldingSuite.scala       |  2 +-
 .../optimizer/FilterPushdownSuite.scala        |  4 ++--
 .../catalyst/optimizer/OptimizeInSuite.scala   |  2 +-
 .../apache/spark/sql/types/DataTypeSuite.scala |  6 +++---
 .../org/apache/spark/sql/GroupedData.scala     |  2 +-
 .../org/apache/spark/sql/api/r/SQLUtils.scala  |  4 ++--
 .../sql/execution/GeneratedAggregate.scala     |  8 ++++----
 .../spark/sql/execution/basicOperators.scala   |  2 +-
 .../sql/execution/stat/FrequentItems.scala     |  6 +++---
 .../sql/execution/stat/StatFunctions.scala     |  2 +-
 .../scala/org/apache/spark/sql/functions.scala |  2 +-
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala    |  2 +-
 .../apache/spark/sql/jdbc/JDBCRelation.scala   |  6 +++---
 .../scala/org/apache/spark/sql/jdbc/jdbc.scala |  4 ++--
 .../spark/sql/sources/SqlNewHadoopRDD.scala    |  2 +-
 .../apache/spark/sql/DataFrameStatSuite.scala  |  2 +-
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala  |  6 +++---
 .../apache/spark/sql/jdbc/JDBCWriteSuite.scala | 12 ++++++------
 .../apache/spark/sql/hive/client/package.scala |  2 +-
 .../sql/hive/execution/HiveTableScan.scala     |  4 ++--
 .../hive/execution/ScriptTransformation.scala  | 18 +++++++++---------
 .../org/apache/spark/sql/hive/hiveUdfs.scala   | 10 +++++-----
 .../spark/sql/hive/CachedTableSuite.scala      |  2 +-
 .../sql/hive/InsertIntoHiveTableSuite.scala    |  2 +-
 .../spark/sql/hive/client/VersionsSuite.scala  |  6 +++---
 .../hive/execution/HiveTableScanSuite.scala    |  8 ++++----
 .../sql/hive/execution/HiveUdfSuite.scala      |  2 +-
 39 files changed, 88 insertions(+), 88 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 75a493b248f6e..1c0ddb5093d17 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -233,7 +233,7 @@ object CatalystTypeConverters {
     case other => other
   }
 
-  /** 
+  /**
    * Converts Catalyst types used internally in rows to standard Scala types
    * This method is slow, and for batch conversion you should be using converter
    * produced by createToScalaConverter.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 195418d6dfb1f..96d7b96e60ee9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -296,8 +296,8 @@ trait HiveTypeCoercion {
   object InConversion extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
       // Skip nodes who's children have not been resolved yet.
-      case e if !e.childrenResolved => e 
-      
+      case e if !e.childrenResolved => e
+
       case i @ In(a, b) if b.exists(_.dataType != a.dataType) =>
         i.makeCopy(Array(a, b.map(Cast(_, a.dataType))))
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
index 195eec8e5cdc4..99340a14c9ecc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
@@ -29,7 +29,7 @@ case object Descending extends SortDirection
  * An expression that can be used to sort a tuple.  This class extends expression primarily so that
  * transformations over expression will descend into its child.
  */
-case class SortOrder(child: Expression, direction: SortDirection) extends Expression 
+case class SortOrder(child: Expression, direction: SortDirection) extends Expression
     with trees.UnaryNode[Expression] {
 
   override def dataType: DataType = child.dataType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index 6c380d3084652..0266084a6d174 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -394,13 +394,13 @@ case class Sum(child: Expression) extends PartialAggregate with trees.UnaryNode[
  * Combining    PartitionLevel   InputData
  *                           <-- null
  * Zero     <-- Zero         <-- null
- *                              
+ *
  *          <-- null         <-- no data
- * null     <-- null         <-- no data 
+ * null     <-- null         <-- no data
  */
 case class CombineSum(child: Expression) extends AggregateExpression {
   def this() = this(null)
-  
+
   override def children: Seq[Expression] = child :: Nil
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
@@ -616,7 +616,7 @@ case class SumFunction(expr: Expression, base: AggregateExpression) extends Aggr
 
   private val sum = MutableLiteral(null, calcType)
 
-  private val addFunction = 
+  private val addFunction =
     Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum, zero))
 
   override def update(input: Row): Unit = {
@@ -634,7 +634,7 @@ case class SumFunction(expr: Expression, base: AggregateExpression) extends Aggr
 
 case class CombineSumFunction(expr: Expression, base: AggregateExpression)
   extends AggregateFunction {
-  
+
   def this() = this(null, null) // Required for serialization.
 
   private val calcType =
@@ -649,12 +649,12 @@ case class CombineSumFunction(expr: Expression, base: AggregateExpression)
 
   private val sum = MutableLiteral(null, calcType)
 
-  private val addFunction = 
+  private val addFunction =
     Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum, zero))
-  
+
   override def update(input: Row): Unit = {
     val result = expr.eval(input)
-    // partial sum result can be null only when no input rows present 
+    // partial sum result can be null only when no input rows present
     if(result != null) {
       sum.update(addFunction, input)
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 34c833b260dc0..f2299d5db6e9f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -180,7 +180,7 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
     case it: IntegralType => it.integral.asInstanceOf[Integral[Any]].quot
     case other => sys.error(s"Type $other does not support numeric operations")
   }
-  
+
   override def eval(input: Row): Any = {
     val evalE2 = right.eval(input)
     if (evalE2 == null || evalE2 == 0) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
index e7cd7131a9e56..6398b8f9e4ed7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.types._
 case class CreateArray(children: Seq[Expression]) extends Expression {
 
   override def foldable: Boolean = children.forall(_.foldable)
-  
+
   lazy val childTypes = children.map(_.dataType).distinct
 
   override lazy val resolved =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
index 890efc9f52ca3..01f62ba0442e9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.types._
  * @param f The math function.
  * @param name The short name of the function
  */
-abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String) 
+abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
   extends BinaryExpression with Serializable with ExpectsInputTypes { self: Product =>
 
   override def symbol: String = null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index de82c15680607..4f4f67a6e482c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -24,7 +24,7 @@ import org.apache.spark.util.random.XORShiftRandom
 
 /**
  * A Random distribution generating expression.
- * TODO: This can be made generic to generate any type of random distribution, or any type of  
+ * TODO: This can be made generic to generate any type of random distribution, or any type of
  * StructType.
  *
  * Since this expression is stateful, it cannot be a case object.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 83a44a12f0682..c4ef9c30907f1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -133,7 +133,7 @@ trait CaseConversionExpression extends ExpectsInputTypes {
  * A function that converts the characters of a string to uppercase.
  */
 case class Upper(child: Expression) extends UnaryExpression with CaseConversionExpression {
-  
+
   override def convert(v: UTF8String): UTF8String = v.toUpperCase()
 
   override def toString: String = s"Upper($child)"
@@ -143,7 +143,7 @@ case class Upper(child: Expression) extends UnaryExpression with CaseConversionE
  * A function that converts the characters of a string to lowercase.
  */
 case class Lower(child: Expression) extends UnaryExpression with CaseConversionExpression {
-  
+
   override def convert(v: UTF8String): UTF8String = v.toLowerCase()
 
   override def toString: String = s"Lower($child)"
@@ -223,7 +223,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
   @inline
   def slicePos(startPos: Int, sliceLen: Int, length: () => Int): (Int, Int) = {
     // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
-    // negative indices for start positions. If a start index i is greater than 0, it 
+    // negative indices for start positions. If a start index i is greater than 0, it
     // refers to element i-1 in the sequence. If a start index i is less than 0, it refers
     // to the -ith element before the end of the sequence. If a start index i is 0, it
     // refers to the first element.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index a4f30c825befb..193c08a4d0df7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -265,7 +265,7 @@ object StructType {
       case _ =>
         throw new SparkException(s"Failed to merge incompatible data types $left and $right")
     }
-  
+
   private[sql] def fieldsMap(fields: Array[StructField]): Map[String, StructField] = {
     import scala.collection.breakOut
     fields.map(s => (s.name, s))(breakOut)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index 10181366c2fcd..3f5a660f17e1d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -1209,7 +1209,7 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
   }
 
   /**
-   * Used for testing math functions for DataFrames. 
+   * Used for testing math functions for DataFrames.
    * @param c The DataFrame function
    * @param f The functions in scala.math
    * @param domain The set of values to run the function with
@@ -1217,7 +1217,7 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
    * @tparam T Generic type for primitives
    */
   def unaryMathFunctionEvaluation[@specialized(Int, Double, Float, Long) T](
-      c: Expression => Expression, 
+      c: Expression => Expression,
       f: T => T,
       domain: Iterable[T] = (-20 to 20).map(_ * 0.1),
       expectNull: Boolean = false): Unit = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala
index a30052b38fc11..06c592f4905a3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala
@@ -71,7 +71,7 @@ class CombiningLimitsSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
-  
+
   test("limits: combines two limits after ColumnPruning") {
     val originalQuery =
       testRelation
@@ -79,7 +79,7 @@ class CombiningLimitsSuite extends PlanTest {
         .limit(2)
         .select('a)
         .limit(5)
-        
+
     val optimized = Optimize.execute(originalQuery.analyze)
     val correctAnswer =
       testRelation
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala
index 5697c2272b8e8..ec3b2f1edfa05 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala
@@ -248,7 +248,7 @@ class ConstantFoldingSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
-  
+
   test("Constant folding test: Fold In(v, list) into true or false") {
     var originalQuery =
       testRelation
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
index ff25470bf0946..17dc9124749e8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -93,7 +93,7 @@ class FilterPushdownSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
-  
+
   test("column pruning for Project(ne, Limit)") {
     val originalQuery =
       testRelation
@@ -109,7 +109,7 @@ class FilterPushdownSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
-  
+
   // After this line is unimplemented.
   test("simple push down") {
     val originalQuery =
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala
index 11b0859d3f066..1d433275fed2e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala
@@ -57,7 +57,7 @@ class OptimizeInSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
-  
+
   test("OptimizedIn test: In clause not optimized in case filter has attributes") {
     val originalQuery =
       testRelation
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index 543cdefc5293b..261c4fcad24aa 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -71,7 +71,7 @@ class DataTypeSuite extends SparkFunSuite {
 
   test("fieldsMap returns map of name to StructField") {
     val struct = StructType(
-      StructField("a", LongType) :: 
+      StructField("a", LongType) ::
       StructField("b", FloatType) :: Nil)
 
     val mapped = StructType.fieldsMap(struct.fields)
@@ -90,7 +90,7 @@ class DataTypeSuite extends SparkFunSuite {
 
     val right = StructType(List())
     val merged = left.merge(right)
-    
+
     assert(merged === left)
   }
 
@@ -133,7 +133,7 @@ class DataTypeSuite extends SparkFunSuite {
 
     val right = StructType(
       StructField("b", LongType) :: Nil)
-    
+
     intercept[SparkException] {
       left.merge(right)
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index c4ceb0c173887..45b3e1bc627d5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -249,7 +249,7 @@ class GroupedData protected[sql](
   def mean(colNames: String*): DataFrame = {
     aggregateNumericColumns(colNames : _*)(Average)
   }
- 
+
   /**
    * Compute the max value for each numeric columns for each group.
    * The resulting [[DataFrame]] will also contain the grouping columns.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
index 423ecdff5804a..604f3124e23ae 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
@@ -106,7 +106,7 @@ private[r] object SQLUtils {
 
     dfCols.map { col =>
       colToRBytes(col)
-    } 
+    }
   }
 
   def convertRowsToColumns(localDF: Array[Row], numCols: Int): Array[Array[Any]] = {
@@ -121,7 +121,7 @@ private[r] object SQLUtils {
     val numRows = col.length
     val bos = new ByteArrayOutputStream()
     val dos = new DataOutputStream(bos)
-    
+
     SerDe.writeInt(dos, numRows)
 
     col.map { item =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 2ec7d4fbc92de..3e27c1bde2dfd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -138,15 +138,15 @@ case class GeneratedAggregate(
           case UnscaledValue(e) => e
           case _ => expr
         }
-        // partial sum result can be null only when no input rows present 
+        // partial sum result can be null only when no input rows present
         val updateFunction = If(
           IsNotNull(actualExpr),
           Coalesce(
             Add(
-              Coalesce(currentSum :: zero :: Nil), 
+              Coalesce(currentSum :: zero :: Nil),
               Cast(expr, calcType)) :: currentSum :: zero :: Nil),
           currentSum)
-          
+
         val result =
           expr.dataType match {
             case DecimalType.Fixed(_, _) =>
@@ -155,7 +155,7 @@ case class GeneratedAggregate(
           }
 
         AggregateEvaluation(currentSum :: Nil, initialValue :: Nil, updateFunction :: Nil, result)
-        
+
       case m @ Max(expr) =>
         val currentMax = AttributeReference("currentMax", expr.dataType, nullable = true)()
         val initialValue = Literal.create(null, expr.dataType)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index 6cb67b4bbbb65..a30ade86441ca 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -65,7 +65,7 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
  * :: DeveloperApi ::
  * Sample the dataset.
  * @param lowerBound Lower-bound of the sampling probability (usually 0.0)
- * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled 
+ * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled
  *                   will be ub - lb.
  * @param withReplacement Whether to sample with replacement.
  * @param seed the random seed
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
index fe8a81e3d0434..c41c21c0eeb50 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
@@ -62,7 +62,7 @@ private[sql] object FrequentItems extends Logging {
   }
 
   /**
-   * Finding frequent items for columns, possibly with false positives. Using the 
+   * Finding frequent items for columns, possibly with false positives. Using the
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * The `support` should be greater than 1e-4.
@@ -75,7 +75,7 @@ private[sql] object FrequentItems extends Logging {
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
   private[sql] def singlePassFreqItems(
-      df: DataFrame, 
+      df: DataFrame,
       cols: Seq[String],
       support: Double): DataFrame = {
     require(support >= 1e-4, s"support ($support) must be greater than 1e-4.")
@@ -88,7 +88,7 @@ private[sql] object FrequentItems extends Logging {
       val index = originalSchema.fieldIndex(name)
       (name, originalSchema.fields(index).dataType)
     }
-    
+
     val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
       seqOp = (counts, row) => {
         var i = 0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
index d22f5fd2d439c..b1a8204dd5f71 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 
 private[sql] object StatFunctions extends Logging {
-  
+
   /** Calculate the Pearson Correlation Coefficient for the given columns */
   private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = {
     val counts = collectStatisticalData(df, cols)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 6dc17bbb2e768..77327f2b84eaa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1299,7 +1299,7 @@ object functions {
    * @since 1.4.0
    */
   def toRadians(columnName: String): Column = toRadians(Column(columnName))
-    
+
 
   //////////////////////////////////////////////////////////////////////////////////////////////
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 2d8d950038e78..40b604d710dce 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -304,7 +304,7 @@ private[sql] class JDBCRDD(
 
   // Each JDBC-to-Catalyst conversion corresponds to a tag defined here so that
   // we don't have to potentially poke around in the Metadata once for every
-  // row.  
+  // row.
   // Is there a better way to do this?  I'd rather be using a type that
   // contains only the tags I define.
   abstract class JDBCConversion
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
index 09d6865457df6..30f9190d45bf8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
@@ -54,7 +54,7 @@ private[sql] object JDBCRelation {
     if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0))
     // Overflow and silliness can happen if you subtract then divide.
     // Here we get a little roundoff, but that's (hopefully) OK.
-    val stride: Long = (partitioning.upperBound / numPartitions 
+    val stride: Long = (partitioning.upperBound / numPartitions
                       - partitioning.lowerBound / numPartitions)
     var i: Int = 0
     var currentValue: Long = partitioning.lowerBound
@@ -140,10 +140,10 @@ private[sql] case class JDBCRelation(
       filters,
       parts)
   }
-  
+
   override def insert(data: DataFrame, overwrite: Boolean): Unit = {
     data.write
       .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
       .jdbc(url, table, properties)
-  }  
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
index f21dd29aca37f..dd8aaf6474895 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
@@ -240,10 +240,10 @@ package object jdbc {
         }
       }
     }
-    
+
     def getDriverClassName(url: String): String = DriverManager.getDriver(url) match {
       case wrapper: DriverWrapper => wrapper.wrapped.getClass.getCanonicalName
-      case driver => driver.getClass.getCanonicalName  
+      case driver => driver.getClass.getCanonicalName
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
index a74a98631da35..ebad0c1564ec0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
@@ -216,7 +216,7 @@ private[sql] class SqlNewHadoopRDD[K, V](
   override def getPreferredLocations(hsplit: SparkPartition): Seq[String] = {
     val split = hsplit.asInstanceOf[SqlNewHadoopPartition].serializableHadoopSplit.value
     val locs = HadoopRDD.SPLIT_INFO_REFLECTIONS match {
-      case Some(c) => 
+      case Some(c) =>
         try {
           val infos = c.newGetLocationInfo.invoke(split).asInstanceOf[Array[AnyRef]]
           Some(HadoopRDD.convertSplitLocationInfo(infos))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index add0fd58e28c8..78de89f0b9f39 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.test.TestSQLContext.implicits._
 
 class DataFrameStatSuite extends SparkFunSuite  {
-  
+
   val sqlCtx = TestSQLContext
   def toLetter(i: Int): String = (i + 97).toChar.toString
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index af279007c587e..e20c66cb2f1d7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -68,7 +68,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
         |USING org.apache.spark.sql.jdbc
         |OPTIONS (url '$url', dbtable 'TEST.PEOPLE', user 'testUser', password 'testPass')
       """.stripMargin.replaceAll("\n", " "))
- 
+
     sql(
       s"""
         |CREATE TEMPORARY TABLE fetchtwo
@@ -76,7 +76,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
         |OPTIONS (url '$url', dbtable 'TEST.PEOPLE', user 'testUser', password 'testPass',
         |         fetchSize '2')
       """.stripMargin.replaceAll("\n", " "))
- 
+
     sql(
       s"""
         |CREATE TEMPORARY TABLE parts
@@ -209,7 +209,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
     assert(ids(1) === 2)
     assert(ids(2) === 3)
   }
- 
+
   test("SELECT second field when fetchSize is two") {
     val ids = sql("SELECT THEID FROM fetchtwo").collect().map(x => x.getInt(0)).sortWith(_ < _)
     assert(ids.size === 3)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
index 3cd987b0b3383..2de8c1a6098e0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
@@ -36,12 +36,12 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
   properties.setProperty("user", "testUser")
   properties.setProperty("password", "testPass")
   properties.setProperty("rowId", "false")
-    
+
   before {
     Class.forName("org.h2.Driver")
     conn = DriverManager.getConnection(url)
     conn.prepareStatement("create schema test").executeUpdate()
-   
+
     conn1 = DriverManager.getConnection(url1, properties)
     conn1.prepareStatement("create schema test").executeUpdate()
     conn1.prepareStatement("drop table if exists test.people").executeUpdate()
@@ -53,20 +53,20 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
     conn1.prepareStatement(
       "create table test.people1 (name TEXT(32) NOT NULL, theid INTEGER NOT NULL)").executeUpdate()
     conn1.commit()
-     
+
     TestSQLContext.sql(
       s"""
         |CREATE TEMPORARY TABLE PEOPLE
         |USING org.apache.spark.sql.jdbc
         |OPTIONS (url '$url1', dbtable 'TEST.PEOPLE', user 'testUser', password 'testPass')
       """.stripMargin.replaceAll("\n", " "))
-    
+
     TestSQLContext.sql(
       s"""
         |CREATE TEMPORARY TABLE PEOPLE1
         |USING org.apache.spark.sql.jdbc
         |OPTIONS (url '$url1', dbtable 'TEST.PEOPLE1', user 'testUser', password 'testPass')
-      """.stripMargin.replaceAll("\n", " "))  
+      """.stripMargin.replaceAll("\n", " "))
   }
 
   after {
@@ -152,5 +152,5 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
     TestSQLContext.sql("INSERT OVERWRITE TABLE PEOPLE1 SELECT * FROM PEOPLE")
     assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).count)
     assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
-  } 
+  }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala
index 7db9200d47440..410d9881ac214 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala
@@ -29,5 +29,5 @@ package object client {
     case object v13 extends HiveVersion("0.13.1", false)
   }
   // scalastyle:on
-  
+
 }
\ No newline at end of file
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index 62dc4167b78dd..11ee5503146b9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -63,7 +63,7 @@ case class HiveTableScan(
     BindReferences.bindReference(pred, relation.partitionKeys)
   }
 
-  // Create a local copy of hiveconf,so that scan specific modifications should not impact 
+  // Create a local copy of hiveconf,so that scan specific modifications should not impact
   // other queries
   @transient
   private[this] val hiveExtraConf = new HiveConf(context.hiveconf)
@@ -72,7 +72,7 @@ case class HiveTableScan(
   addColumnMetadataToConf(hiveExtraConf)
 
   @transient
-  private[this] val hadoopReader = 
+  private[this] val hadoopReader =
     new HadoopTableReader(attributes, relation, context, hiveExtraConf)
 
   private[this] def castFromString(value: String, dataType: DataType) = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index 6f27a8626fc1e..fd623370cc407 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -62,7 +62,7 @@ case class ScriptTransformation(
       val inputStream = proc.getInputStream
       val outputStream = proc.getOutputStream
       val reader = new BufferedReader(new InputStreamReader(inputStream))
- 
+
       val (outputSerde, outputSoi) = ioschema.initOutputSerDe(output)
 
       val iterator: Iterator[Row] = new Iterator[Row] with HiveInspectors {
@@ -95,7 +95,7 @@ case class ScriptTransformation(
             val raw = outputSerde.deserialize(writable)
             val dataList = outputSoi.getStructFieldsDataAsList(raw)
             val fieldList = outputSoi.getAllStructFieldRefs()
-            
+
             var i = 0
             dataList.foreach( element => {
               if (element == null) {
@@ -117,7 +117,7 @@ case class ScriptTransformation(
           if (!hasNext) {
             throw new NoSuchElementException
           }
- 
+
           if (outputSerde == null) {
             val prevLine = curLine
             curLine = reader.readLine()
@@ -192,7 +192,7 @@ case class HiveScriptIOSchema (
   val inputRowFormatMap = inputRowFormat.toMap.withDefault((k) => defaultFormat(k))
   val outputRowFormatMap = outputRowFormat.toMap.withDefault((k) => defaultFormat(k))
 
-  
+
   def initInputSerDe(input: Seq[Expression]): (AbstractSerDe, ObjectInspector) = {
     val (columns, columnTypes) = parseAttrs(input)
     val serde = initSerDe(inputSerdeClass, columns, columnTypes, inputSerdeProps)
@@ -206,13 +206,13 @@ case class HiveScriptIOSchema (
   }
 
   def parseAttrs(attrs: Seq[Expression]): (Seq[String], Seq[DataType]) = {
-                                                
+
     val columns = attrs.map {
       case aref: AttributeReference => aref.name
       case e: NamedExpression => e.name
       case _ => null
     }
- 
+
     val columnTypes = attrs.map {
       case aref: AttributeReference => aref.dataType
       case e: NamedExpression => e.dataType
@@ -221,7 +221,7 @@ case class HiveScriptIOSchema (
 
     (columns, columnTypes)
   }
- 
+
   def initSerDe(serdeClassName: String, columns: Seq[String],
     columnTypes: Seq[DataType], serdeProps: Seq[(String, String)]): AbstractSerDe = {
 
@@ -240,7 +240,7 @@ case class HiveScriptIOSchema (
         (kv._1.split("'")(1), kv._2.split("'")(1))
       }).toMap + (serdeConstants.LIST_COLUMNS -> columns.mkString(","))
       propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames)
-    
+
       val properties = new Properties()
       properties.putAll(propsMap)
       serde.initialize(null, properties)
@@ -261,7 +261,7 @@ case class HiveScriptIOSchema (
       null
     }
   }
- 
+
   def initOutputputSoi(outputSerde: AbstractSerDe): StructObjectInspector = {
     if (outputSerde != null) {
       outputSerde.getObjectInspector().asInstanceOf[StructObjectInspector]
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index bb116e3ab7de7..64a49c83cbad1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -555,12 +555,12 @@ private[hive] case class HiveUdafFunction(
     } else {
       funcWrapper.createFunction[AbstractGenericUDAFResolver]()
     }
-  
+
   private val inspectors = exprs.map(toInspector).toArray
-    
-  private val function = { 
+
+  private val function = {
     val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors, false, false)
-    resolver.getEvaluator(parameterInfo) 
+    resolver.getEvaluator(parameterInfo)
   }
 
   private val returnInspector = function.init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors)
@@ -575,7 +575,7 @@ private[hive] case class HiveUdafFunction(
 
   @transient
   protected lazy val cached = new Array[AnyRef](exprs.length)
-  
+
   def update(input: Row): Unit = {
     val inputs = inputProjection(input)
     function.iterate(buffer, wrap(inputs, inspectors, cached))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
index 945596db80326..39d315aaeab57 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
@@ -57,7 +57,7 @@ class CachedTableSuite extends QueryTest {
     checkAnswer(
       sql("SELECT * FROM src s"),
       preCacheResults)
-    
+
     uncacheTable("src")
     assertCached(sql("SELECT * FROM src"), 0)
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
index 9cc4685499f19..aa5dbe2db6903 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
@@ -240,7 +240,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter {
     checkAnswer(sql("select key,value from table_with_partition where ds='1' "),
       testData.collect().toSeq
     )
-    
+
     // test difference type of field
     sql("ALTER TABLE table_with_partition CHANGE COLUMN key key BIGINT")
     checkAnswer(sql("select key,value from table_with_partition where ds='1' "),
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index 446a2f2d646e1..7eb4842726665 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -22,9 +22,9 @@ import org.apache.spark.sql.catalyst.util.quietly
 import org.apache.spark.util.Utils
 
 /**
- * A simple set of tests that call the methods of a hive ClientInterface, loading different version 
- * of hive from maven central.  These tests are simple in that they are mostly just testing to make 
- * sure that reflective calls are not throwing NoSuchMethod error, but the actually functionallity 
+ * A simple set of tests that call the methods of a hive ClientInterface, loading different version
+ * of hive from maven central.  These tests are simple in that they are mostly just testing to make
+ * sure that reflective calls are not throwing NoSuchMethod error, but the actually functionality
  * is not fully tested.
  */
 class VersionsSuite extends SparkFunSuite with Logging {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala
index 0ba4d11478211..2209fc2f30a3c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala
@@ -61,7 +61,7 @@ class HiveTableScanSuite extends HiveComparisonTest {
     TestHive.sql("select KEY from tb where VALUE='just_for_test' limit 5").collect()
     TestHive.sql("drop table tb")
   }
-  
+
   test("Spark-4077: timestamp query for null value") {
     TestHive.sql("DROP TABLE IF EXISTS timestamp_query_null")
     TestHive.sql(
@@ -71,11 +71,11 @@ class HiveTableScanSuite extends HiveComparisonTest {
         FIELDS TERMINATED BY ','
         LINES TERMINATED BY '\n'
       """.stripMargin)
-    val location = 
+    val location =
       Utils.getSparkClassLoader.getResource("data/files/issue-4077-data.txt").getFile()
-     
+
     TestHive.sql(s"LOAD DATA LOCAL INPATH '$location' INTO TABLE timestamp_query_null")
-    assert(TestHive.sql("SELECT time from timestamp_query_null limit 2").collect() 
+    assert(TestHive.sql("SELECT time from timestamp_query_null limit 2").collect()
       === Array(Row(java.sql.Timestamp.valueOf("2014-12-11 00:00:00")), Row(null)))
     TestHive.sql("DROP TABLE timestamp_query_null")
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala
index 7f49eac490572..ce5985888f540 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala
@@ -101,7 +101,7 @@ class HiveUdfSuite extends QueryTest {
     sql("DROP TEMPORARY FUNCTION IF EXISTS test_avg")
     TestHive.reset()
   }
-  
+
   test("SPARK-2693 udaf aggregates test") {
     checkAnswer(sql("SELECT percentile(key, 1) FROM src LIMIT 1"),
       sql("SELECT max(key) FROM src").collect().toSeq)

From 4b5f12bac939a2f47a3a61365b5325d849b7b51f Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 01:37:56 -0700
Subject: [PATCH 284/525] [SPARK-7979] Enforce structural type checker.

Author: Reynold Xin <rxin@databricks.com>

Closes #6536 from rxin/structural-type-checker and squashes the following commits:

f833151 [Reynold Xin] Fixed compilation.
633f9a1 [Reynold Xin] Fixed typo.
d1fa804 [Reynold Xin] [SPARK-7979] Enforce structural type checker.
---
 .../org/apache/spark/util/random/XORShiftRandomSuite.scala  | 2 +-
 .../apache/spark/examples/mllib/DecisionTreeRunner.scala    | 6 +++++-
 graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala | 4 +++-
 .../org/apache/spark/ml/classification/OneVsRest.scala      | 2 ++
 scalastyle-config.xml                                       | 3 +++
 5 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala b/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
index 6ca484ccd0c06..d26667bf720cf 100644
--- a/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
@@ -28,7 +28,7 @@ import scala.language.reflectiveCalls
 
 class XORShiftRandomSuite extends SparkFunSuite with Matchers {
 
-  def fixture: Object {val seed: Long; val hundMil: Int; val xorRand: XORShiftRandom} = new {
+  private def fixture = new {
     val seed = 1L
     val xorRand = new XORShiftRandom(seed)
     val hundMil = 1e8.toInt
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
index b0613632c9946..3381941673db8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
@@ -22,7 +22,6 @@ import scala.language.reflectiveCalls
 import scopt.OptionParser
 
 import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.evaluation.MulticlassMetrics
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -354,7 +353,11 @@ object DecisionTreeRunner {
 
   /**
    * Calculates the mean squared error for regression.
+   *
+   * This is just for demo purpose. In general, don't copy this code because it is NOT efficient
+   * due to the use of structural types, which leads to one reflection call per record.
    */
+  // scalastyle:off structural.type
   private[mllib] def meanSquaredError(
       model: { def predict(features: Vector): Double },
       data: RDD[LabeledPoint]): Double = {
@@ -363,4 +366,5 @@ object DecisionTreeRunner {
       err * err
     }.mean()
   }
+  // scalastyle:on structural.type
 }
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala
index cc70b396a8dd4..4611a3ace219b 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala
@@ -41,14 +41,16 @@ abstract class EdgeRDD[ED](
     @transient sc: SparkContext,
     @transient deps: Seq[Dependency[_]]) extends RDD[Edge[ED]](sc, deps) {
 
+  // scalastyle:off structural.type
   private[graphx] def partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])] forSome { type VD }
+  // scalastyle:on structural.type
 
   override protected def getPartitions: Array[Partition] = partitionsRDD.partitions
 
   override def compute(part: Partition, context: TaskContext): Iterator[Edge[ED]] = {
     val p = firstParent[(PartitionID, EdgePartition[ED, _])].iterator(part, context)
     if (p.hasNext) {
-      p.next._2.iterator.map(_.copy())
+      p.next()._2.iterator.map(_.copy())
     } else {
       Iterator.empty
     }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index b8c7f3c5bc3b9..7b726da388075 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -37,11 +37,13 @@ import org.apache.spark.storage.StorageLevel
  */
 private[ml] trait OneVsRestParams extends PredictorParams {
 
+  // scalastyle:off structural.type
   type ClassifierType = Classifier[F, E, M] forSome {
     type F
     type M <: ClassificationModel[F, M]
     type E <: Classifier[F, E, M]
   }
+  // scalastyle:on structural.type
 
   /**
    * param for the base binary classifier that we reduce multiclass classification into.
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 3a984222167b0..75ef1e964b5ac 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -114,6 +114,9 @@
  <!--   <parameter name="maximum"><![CDATA[10]]></parameter> -->
  <!--  </parameters> -->
  <!-- </check> -->
+
+  <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
+
  <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
  <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check>
  <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">

From d1d2def2f5f91e86f340656421170d1097f14854 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Sun, 31 May 2015 11:18:12 -0700
Subject: [PATCH 285/525] [MINOR] Add license for dagre-d3 and graphlib-dot

Add license for dagre-d3 and graphlib-dot

Author: zsxwing <zsxwing@gmail.com>

Closes #6539 from zsxwing/LICENSE and squashes the following commits:

82b0475 [zsxwing] Add license for dagre-d3 and graphlib-dot
---
 LICENSE | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/LICENSE b/LICENSE
index 9d1b00beff748..d0cd0dcb4bdb7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -853,6 +853,52 @@ and
 
 Vis.js may be distributed under either license.
 
+========================================================================
+For dagre-d3 (core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js):
+========================================================================
+Copyright (c) 2013 Chris Pettitt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+========================================================================
+For graphlib-dot (core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js):
+========================================================================
+Copyright (c) 2012-2013 Chris Pettitt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
 ========================================================================
 BSD-style licenses
 ========================================================================

From e1067d0ad1c32c678c23d76d7653b51770795831 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 11:35:30 -0700
Subject: [PATCH 286/525] [SPARK-3850] Trim trailing spaces for MLlib.

Author: Reynold Xin <rxin@databricks.com>

Closes #6534 from rxin/whitespace-mllib and squashes the following commits:

38926e3 [Reynold Xin] [SPARK-3850] Trim trailing spaces for MLlib.
---
 .../spark/ml/feature/StandardScaler.scala     | 10 +--
 .../ml/regression/LinearRegression.scala      |  2 +-
 .../mllib/api/python/PythonMLLibAPI.scala     |  4 +-
 .../mllib/clustering/GaussianMixture.scala    | 86 +++++++++----------
 .../clustering/GaussianMixtureModel.scala     | 22 ++---
 .../clustering/PowerIterationClustering.scala |  8 +-
 .../apache/spark/mllib/feature/Word2Vec.scala | 50 +++++------
 .../org/apache/spark/mllib/linalg/BLAS.scala  |  8 +-
 .../linalg/EigenValueDecomposition.scala      |  2 +-
 .../BinaryClassificationPMMLModelExport.scala | 10 +--
 .../mllib/pmml/export/PMMLModelExport.scala   |  4 +-
 .../pmml/export/PMMLModelExportFactory.scala  |  8 +-
 .../spark/mllib/random/RandomRDDs.scala       |  6 +-
 .../spark/mllib/recommendation/ALS.scala      |  2 +-
 .../mllib/regression/IsotonicRegression.scala | 10 +--
 .../distribution/MultivariateGaussian.scala   | 54 ++++++------
 .../mllib/tree/GradientBoostedTrees.scala     |  2 +-
 .../spark/mllib/tree/RandomForest.scala       |  2 +-
 .../org/apache/spark/mllib/util/MLUtils.scala |  2 +-
 .../evaluation/RegressionEvaluatorSuite.scala |  2 +-
 .../spark/ml/feature/BinarizerSuite.scala     |  2 +-
 .../clustering/GaussianMixtureSuite.scala     |  4 +-
 .../PowerIterationClusteringSuite.scala       |  2 +-
 .../apache/spark/mllib/linalg/BLASSuite.scala | 34 ++++----
 .../spark/mllib/linalg/VectorsSuite.scala     |  6 +-
 ...ryClassificationPMMLModelExportSuite.scala |  8 +-
 .../export/KMeansPMMLModelExportSuite.scala   |  2 +-
 .../export/PMMLModelExportFactorySuite.scala  | 10 +--
 .../MultivariateGaussianSuite.scala           | 14 +--
 .../spark/mllib/util/MLUtilsSuite.scala       |  2 +-
 30 files changed, 189 insertions(+), 189 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index fdd2494fc87a6..b0fd06d84fdb3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -35,13 +35,13 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with
 
   /**
    * Centers the data with mean before scaling.
-   * It will build a dense output, so this does not work on sparse input 
+   * It will build a dense output, so this does not work on sparse input
    * and will raise an exception.
    * Default: false
    * @group param
    */
   val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data with mean")
-  
+
   /**
    * Scales the data to unit standard deviation.
    * Default: true
@@ -68,13 +68,13 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM
 
   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)
-  
+
   /** @group setParam */
   def setWithMean(value: Boolean): this.type = set(withMean, value)
-  
+
   /** @group setParam */
   def setWithStd(value: Boolean): this.type = set(withStd, value)
-  
+
   override def fit(dataset: DataFrame): StandardScalerModel = {
     transformSchema(dataset.schema, logging = true)
     val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 7c40db1a40040..fe2a71a331694 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -321,7 +321,7 @@ private class LeastSquaresAggregator(
     }
     (weightsArray, -sum + labelMean / labelStd, weightsArray.length)
   }
-  
+
   private val effectiveWeightsVector = Vectors.dense(effectiveWeightsArray)
 
   private val gradientSumArray = Array.ofDim[Double](dim)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 65f30fdba7393..16f3131796709 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -399,7 +399,7 @@ private[python] class PythonMLLibAPI extends Serializable {
       val sigma = si.map(_.asInstanceOf[DenseMatrix])
       val gaussians = Array.tabulate(weight.length){
         i => new MultivariateGaussian(mean(i), sigma(i))
-      }      
+      }
       val model = new GaussianMixtureModel(weight, gaussians)
       model.predictSoft(data).map(Vectors.dense)
   }
@@ -494,7 +494,7 @@ private[python] class PythonMLLibAPI extends Serializable {
   def normalizeVector(p: Double, rdd: JavaRDD[Vector]): JavaRDD[Vector] = {
     new Normalizer(p).transform(rdd)
   }
-  
+
   /**
    * Java stub for StandardScaler.fit(). This stub returns a
    * handle to the Java object instead of the content of the Java object.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index e9a23e40cc790..70b0e40948e51 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -36,11 +36,11 @@ import org.apache.spark.util.Utils
  * independent Gaussian distributions with associated "mixing" weights
  * specifying each's contribution to the composite.
  *
- * Given a set of sample points, this class will maximize the log-likelihood 
- * for a mixture of k Gaussians, iterating until the log-likelihood changes by 
+ * Given a set of sample points, this class will maximize the log-likelihood
+ * for a mixture of k Gaussians, iterating until the log-likelihood changes by
  * less than convergenceTol, or until it has reached the max number of iterations.
  * While this process is generally guaranteed to converge, it is not guaranteed
- * to find a global optimum.  
+ * to find a global optimum.
  *
  * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
  *       This is due to high-dimensional data (a) making it difficult to cluster at all (based
@@ -53,24 +53,24 @@ import org.apache.spark.util.Utils
  */
 @Experimental
 class GaussianMixture private (
-    private var k: Int, 
-    private var convergenceTol: Double, 
+    private var k: Int,
+    private var convergenceTol: Double,
     private var maxIterations: Int,
     private var seed: Long) extends Serializable {
-  
+
   /**
    * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01,
    * maxIterations: 100, seed: random}.
    */
   def this() = this(2, 0.01, 100, Utils.random.nextLong())
-  
+
   // number of samples per cluster to use when initializing Gaussians
   private val nSamples = 5
-  
-  // an initializing GMM can be provided rather than using the 
+
+  // an initializing GMM can be provided rather than using the
   // default random starting point
   private var initialModel: Option[GaussianMixtureModel] = None
-  
+
   /** Set the initial GMM starting point, bypassing the random initialization.
    *  You must call setK() prior to calling this method, and the condition
    *  (model.k == this.k) must be met; failure will result in an IllegalArgumentException
@@ -83,37 +83,37 @@ class GaussianMixture private (
     }
     this
   }
-  
+
   /** Return the user supplied initial GMM, if supplied */
   def getInitialModel: Option[GaussianMixtureModel] = initialModel
-  
+
   /** Set the number of Gaussians in the mixture model.  Default: 2 */
   def setK(k: Int): this.type = {
     this.k = k
     this
   }
-  
+
   /** Return the number of Gaussians in the mixture model */
   def getK: Int = k
-  
+
   /** Set the maximum number of iterations to run. Default: 100 */
   def setMaxIterations(maxIterations: Int): this.type = {
     this.maxIterations = maxIterations
     this
   }
-  
+
   /** Return the maximum number of iterations to run */
   def getMaxIterations: Int = maxIterations
-  
+
   /**
-   * Set the largest change in log-likelihood at which convergence is 
+   * Set the largest change in log-likelihood at which convergence is
    * considered to have occurred.
    */
   def setConvergenceTol(convergenceTol: Double): this.type = {
     this.convergenceTol = convergenceTol
     this
   }
-  
+
   /**
    * Return the largest change in log-likelihood at which convergence is
    * considered to have occurred.
@@ -132,41 +132,41 @@ class GaussianMixture private (
   /** Perform expectation maximization */
   def run(data: RDD[Vector]): GaussianMixtureModel = {
     val sc = data.sparkContext
-    
+
     // we will operate on the data as breeze data
     val breezeData = data.map(_.toBreeze).cache()
-    
+
     // Get length of the input vectors
     val d = breezeData.first().length
-    
+
     // Determine initial weights and corresponding Gaussians.
     // If the user supplied an initial GMM, we use those values, otherwise
     // we start with uniform weights, a random mean from the data, and
     // diagonal covariance matrices using component variances
-    // derived from the samples    
+    // derived from the samples
     val (weights, gaussians) = initialModel match {
       case Some(gmm) => (gmm.weights, gmm.gaussians)
-      
+
       case None => {
         val samples = breezeData.takeSample(withReplacement = true, k * nSamples, seed)
-        (Array.fill(k)(1.0 / k), Array.tabulate(k) { i => 
+        (Array.fill(k)(1.0 / k), Array.tabulate(k) { i =>
           val slice = samples.view(i * nSamples, (i + 1) * nSamples)
-          new MultivariateGaussian(vectorMean(slice), initCovariance(slice)) 
+          new MultivariateGaussian(vectorMean(slice), initCovariance(slice))
         })
       }
     }
-    
-    var llh = Double.MinValue // current log-likelihood 
+
+    var llh = Double.MinValue // current log-likelihood
     var llhp = 0.0            // previous log-likelihood
-    
+
     var iter = 0
     while (iter < maxIterations && math.abs(llh-llhp) > convergenceTol) {
       // create and broadcast curried cluster contribution function
       val compute = sc.broadcast(ExpectationSum.add(weights, gaussians)_)
-      
+
       // aggregate the cluster contribution for all sample points
       val sums = breezeData.aggregate(ExpectationSum.zero(k, d))(compute.value, _ += _)
-      
+
       // Create new distributions based on the partial assignments
       // (often referred to as the "M" step in literature)
       val sumWeights = sums.weights.sum
@@ -179,22 +179,22 @@ class GaussianMixture private (
         gaussians(i) = new MultivariateGaussian(mu, sums.sigmas(i) / sums.weights(i))
         i = i + 1
       }
-   
+
       llhp = llh // current becomes previous
       llh = sums.logLikelihood // this is the freshly computed log-likelihood
       iter += 1
-    } 
-    
+    }
+
     new GaussianMixtureModel(weights, gaussians)
   }
-    
+
   /** Average of dense breeze vectors */
   private def vectorMean(x: IndexedSeq[BV[Double]]): BDV[Double] = {
     val v = BDV.zeros[Double](x(0).length)
     x.foreach(xi => v += xi)
-    v / x.length.toDouble 
+    v / x.length.toDouble
   }
-  
+
   /**
    * Construct matrix where diagonal entries are element-wise
    * variance of input vectors (computes biased variance)
@@ -210,14 +210,14 @@ class GaussianMixture private (
 // companion class to provide zero constructor for ExpectationSum
 private object ExpectationSum {
   def zero(k: Int, d: Int): ExpectationSum = {
-    new ExpectationSum(0.0, Array.fill(k)(0.0), 
+    new ExpectationSum(0.0, Array.fill(k)(0.0),
       Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d, d)))
   }
-  
+
   // compute cluster contributions for each input point
   // (U, T) => U for aggregation
   def add(
-      weights: Array[Double], 
+      weights: Array[Double],
       dists: Array[MultivariateGaussian])
       (sums: ExpectationSum, x: BV[Double]): ExpectationSum = {
     val p = weights.zip(dists).map {
@@ -235,7 +235,7 @@ private object ExpectationSum {
       i = i + 1
     }
     sums
-  }  
+  }
 }
 
 // Aggregation class for partial expectation results
@@ -244,9 +244,9 @@ private class ExpectationSum(
     val weights: Array[Double],
     val means: Array[BDV[Double]],
     val sigmas: Array[BreezeMatrix[Double]]) extends Serializable {
-  
+
   val k = weights.length
-  
+
   def +=(x: ExpectationSum): ExpectationSum = {
     var i = 0
     while (i < k) {
@@ -257,5 +257,5 @@ private class ExpectationSum(
     }
     logLikelihood += x.logLikelihood
     this
-  }  
+  }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 86353aed81156..5fc2cb1b62d33 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -34,10 +34,10 @@ import org.apache.spark.sql.{SQLContext, Row}
 /**
  * :: Experimental ::
  *
- * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points 
- * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are 
- * the respective mean and covariance for each Gaussian distribution i=1..k. 
- * 
+ * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points
+ * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are
+ * the respective mean and covariance for each Gaussian distribution i=1..k.
+ *
  * @param weights Weights for each Gaussian distribution in the mixture, where weights(i) is
  *                the weight for Gaussian i, and weights.sum == 1
  * @param gaussians Array of MultivariateGaussian where gaussians(i) represents
@@ -45,9 +45,9 @@ import org.apache.spark.sql.{SQLContext, Row}
  */
 @Experimental
 class GaussianMixtureModel(
-  val weights: Array[Double], 
+  val weights: Array[Double],
   val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable{
-  
+
   require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")
 
   override protected def formatVersion = "1.0"
@@ -64,20 +64,20 @@ class GaussianMixtureModel(
     val responsibilityMatrix = predictSoft(points)
     responsibilityMatrix.map(r => r.indexOf(r.max))
   }
-  
+
   /**
    * Given the input vectors, return the membership value of each vector
-   * to all mixture components. 
+   * to all mixture components.
    */
   def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = {
     val sc = points.sparkContext
     val bcDists = sc.broadcast(gaussians)
     val bcWeights = sc.broadcast(weights)
-    points.map { x => 
+    points.map { x =>
       computeSoftAssignments(x.toBreeze.toDenseVector, bcDists.value, bcWeights.value, k)
     }
   }
-  
+
   /**
    * Compute the partial assignments for each vector
    */
@@ -89,7 +89,7 @@ class GaussianMixtureModel(
     val p = weights.zip(dists).map {
       case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt)
     }
-    val pSum = p.sum 
+    val pSum = p.sum
     for (i <- 0 until k) {
       p(i) /= pSum
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index 1ed01c9d8ba0b..e7a243f854e33 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -121,7 +121,7 @@ class PowerIterationClustering private[clustering] (
   import org.apache.spark.mllib.clustering.PowerIterationClustering._
 
   /** Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100,
-   *  initMode: "random"}. 
+   *  initMode: "random"}.
    */
   def this() = this(k = 2, maxIterations = 100, initMode = "random")
 
@@ -243,7 +243,7 @@ object PowerIterationClustering extends Logging {
 
   /**
    * Generates random vertex properties (v0) to start power iteration.
-   * 
+   *
    * @param g a graph representing the normalized affinity matrix (W)
    * @return a graph with edges representing W and vertices representing a random vector
    *         with unit 1-norm
@@ -266,7 +266,7 @@ object PowerIterationClustering extends Logging {
    * Generates the degree vector as the vertex properties (v0) to start power iteration.
    * It is not exactly the node degrees but just the normalized sum similarities. Call it
    * as degree vector because it is used in the PIC paper.
-   * 
+   *
    * @param g a graph representing the normalized affinity matrix (W)
    * @return a graph with edges representing W and vertices representing the degree vector
    */
@@ -276,7 +276,7 @@ object PowerIterationClustering extends Logging {
     val v0 = g.vertices.mapValues(_ / sum)
     GraphImpl.fromExistingRDDs(VertexRDD(v0), g.edges)
   }
- 
+
   /**
    * Runs power iteration.
    * @param g input graph with edges representing the normalized affinity matrix (W) and vertices
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 466ae95859b82..51546d41c36a6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -42,7 +42,7 @@ import org.apache.spark.util.random.XORShiftRandom
 import org.apache.spark.sql.{SQLContext, Row}
 
 /**
- *  Entry in vocabulary 
+ *  Entry in vocabulary
  */
 private case class VocabWord(
   var word: String,
@@ -56,18 +56,18 @@ private case class VocabWord(
  * :: Experimental ::
  * Word2Vec creates vector representation of words in a text corpus.
  * The algorithm first constructs a vocabulary from the corpus
- * and then learns vector representation of words in the vocabulary. 
- * The vector representation can be used as features in 
+ * and then learns vector representation of words in the vocabulary.
+ * The vector representation can be used as features in
  * natural language processing and machine learning algorithms.
- * 
- * We used skip-gram model in our implementation and hierarchical softmax 
+ *
+ * We used skip-gram model in our implementation and hierarchical softmax
  * method to train the model. The variable names in the implementation
  * matches the original C implementation.
  *
- * For original C implementation, see https://code.google.com/p/word2vec/ 
- * For research papers, see 
+ * For original C implementation, see https://code.google.com/p/word2vec/
+ * For research papers, see
  * Efficient Estimation of Word Representations in Vector Space
- * and 
+ * and
  * Distributed Representations of Words and Phrases and their Compositionality.
  */
 @Experimental
@@ -79,7 +79,7 @@ class Word2Vec extends Serializable with Logging {
   private var numIterations = 1
   private var seed = Utils.random.nextLong()
   private var minCount = 5
-  
+
   /**
    * Sets vector size (default: 100).
    */
@@ -122,15 +122,15 @@ class Word2Vec extends Serializable with Logging {
     this
   }
 
-  /** 
-   * Sets minCount, the minimum number of times a token must appear to be included in the word2vec 
+  /**
+   * Sets minCount, the minimum number of times a token must appear to be included in the word2vec
    * model's vocabulary (default: 5).
    */
   def setMinCount(minCount: Int): this.type = {
     this.minCount = minCount
     this
   }
-  
+
   private val EXP_TABLE_SIZE = 1000
   private val MAX_EXP = 6
   private val MAX_CODE_LENGTH = 40
@@ -150,13 +150,13 @@ class Word2Vec extends Serializable with Logging {
       .map(x => VocabWord(
         x._1,
         x._2,
-        new Array[Int](MAX_CODE_LENGTH), 
-        new Array[Int](MAX_CODE_LENGTH), 
+        new Array[Int](MAX_CODE_LENGTH),
+        new Array[Int](MAX_CODE_LENGTH),
         0))
       .filter(_.cn >= minCount)
       .collect()
       .sortWith((a, b) => a.cn > b.cn)
-    
+
     vocabSize = vocab.length
     require(vocabSize > 0, "The vocabulary size should be > 0. You may need to check " +
       "the setting of minCount, which could be large enough to remove all your words in sentences.")
@@ -198,8 +198,8 @@ class Word2Vec extends Serializable with Logging {
     }
     var pos1 = vocabSize - 1
     var pos2 = vocabSize
-    
-    var min1i = 0 
+
+    var min1i = 0
     var min2i = 0
 
     a = 0
@@ -268,15 +268,15 @@ class Word2Vec extends Serializable with Logging {
     val words = dataset.flatMap(x => x)
 
     learnVocab(words)
-    
+
     createBinaryTree()
-    
+
     val sc = dataset.context
 
     val expTable = sc.broadcast(createExpTable())
     val bcVocab = sc.broadcast(vocab)
     val bcVocabHash = sc.broadcast(vocabHash)
-    
+
     val sentences: RDD[Array[Int]] = words.mapPartitions { iter =>
       new Iterator[Array[Int]] {
         def hasNext: Boolean = iter.hasNext
@@ -297,7 +297,7 @@ class Word2Vec extends Serializable with Logging {
         }
       }
     }
-    
+
     val newSentences = sentences.repartition(numPartitions).cache()
     val initRandom = new XORShiftRandom(seed)
 
@@ -402,7 +402,7 @@ class Word2Vec extends Serializable with Logging {
       }
     }
     newSentences.unpersist()
-    
+
     val word2VecMap = mutable.HashMap.empty[String, Array[Float]]
     var i = 0
     while (i < vocabSize) {
@@ -480,7 +480,7 @@ class Word2VecModel private[mllib] (
 
   /**
    * Transforms a word to its vector representation
-   * @param word a word 
+   * @param word a word
    * @return vector representation of word
    */
   def transform(word: String): Vector = {
@@ -495,7 +495,7 @@ class Word2VecModel private[mllib] (
   /**
    * Find synonyms of a word
    * @param word a word
-   * @param num number of synonyms to find  
+   * @param num number of synonyms to find
    * @return array of (word, cosineSimilarity)
    */
   def findSynonyms(word: String, num: Int): Array[(String, Double)] = {
@@ -506,7 +506,7 @@ class Word2VecModel private[mllib] (
   /**
    * Find synonyms of the vector representation of a word
    * @param vector vector representation of a word
-   * @param num number of synonyms to find  
+   * @param num number of synonyms to find
    * @return array of (word, cosineSimilarity)
    */
   def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index ec38529cf8fae..557119f7b1cd1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -228,7 +228,7 @@ private[spark] object BLAS extends Serializable with Logging {
     }
     _nativeBLAS
   }
- 
+
   /**
    * A := alpha * x * x^T^ + A
    * @param alpha a real scalar that will be multiplied to x * x^T^.
@@ -264,7 +264,7 @@ private[spark] object BLAS extends Serializable with Logging {
         j += 1
       }
       i += 1
-    }    
+    }
   }
 
   private def syr(alpha: Double, x: SparseVector, A: DenseMatrix) {
@@ -505,7 +505,7 @@ private[spark] object BLAS extends Serializable with Logging {
     nativeBLAS.dgemv(tStrA, mA, nA, alpha, A.values, mA, x.values, 1, beta,
       y.values, 1)
   }
- 
+
   /**
    * y := alpha * A * x + beta * y
    * For `DenseMatrix` A and `SparseVector` x.
@@ -557,7 +557,7 @@ private[spark] object BLAS extends Serializable with Logging {
       }
     }
   }
- 
+
   /**
    * y := alpha * A * x + beta * y
    * For `SparseMatrix` A and `SparseVector` x.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
index 866936aa4f118..ae3ba3099c878 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
@@ -81,7 +81,7 @@ private[mllib] object EigenValueDecomposition {
 
     require(n * ncv.toLong <= Integer.MAX_VALUE && ncv * (ncv.toLong + 8) <= Integer.MAX_VALUE,
       s"k = $k and/or n = $n are too large to compute an eigendecomposition")
-    
+
     var ido = new intW(0)
     var info = new intW(0)
     var resid = new Array[Double](n)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
index 34b447584e521..622b53a252ac5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
@@ -27,10 +27,10 @@ import org.apache.spark.mllib.regression.GeneralizedLinearModel
  * PMML Model Export for GeneralizedLinearModel class with binary ClassificationModel
  */
 private[mllib] class BinaryClassificationPMMLModelExport(
-    model : GeneralizedLinearModel, 
+    model : GeneralizedLinearModel,
     description : String,
     normalizationMethod : RegressionNormalizationMethodType,
-    threshold: Double) 
+    threshold: Double)
   extends PMMLModelExport {
 
   populateBinaryClassificationPMML()
@@ -72,7 +72,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
            .withUsageType(FieldUsageType.ACTIVE))
          regressionTableYES.withNumericPredictors(new NumericPredictor(fields(i), model.weights(i)))
        }
-       
+
        // add target field
        val targetField = FieldName.create("target")
        dataDictionary
@@ -80,9 +80,9 @@ private[mllib] class BinaryClassificationPMMLModelExport(
        miningSchema
          .withMiningFields(new MiningField(targetField)
          .withUsageType(FieldUsageType.TARGET))
-       
+
        dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)
-       
+
        pmml.setDataDictionary(dataDictionary)
        pmml.withModels(regressionModel)
      }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala
index ebdeae50bb32f..c5fdecd3ca17f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala
@@ -25,7 +25,7 @@ import scala.beans.BeanProperty
 import org.dmg.pmml.{Application, Header, PMML, Timestamp}
 
 private[mllib] trait PMMLModelExport {
-  
+
   /**
    * Holder of the exported model in PMML format
    */
@@ -33,7 +33,7 @@ private[mllib] trait PMMLModelExport {
   val pmml: PMML = new PMML
 
   setHeader(pmml)
-  
+
   private def setHeader(pmml: PMML): Unit = {
     val version = getClass.getPackage.getImplementationVersion
     val app = new Application().withName("Apache Spark MLlib").withVersion(version)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
index c16e83d6a067d..29bd689e1185a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
@@ -27,9 +27,9 @@ import org.apache.spark.mllib.regression.LinearRegressionModel
 import org.apache.spark.mllib.regression.RidgeRegressionModel
 
 private[mllib] object PMMLModelExportFactory {
-  
+
   /**
-   * Factory object to help creating the necessary PMMLModelExport implementation 
+   * Factory object to help creating the necessary PMMLModelExport implementation
    * taking as input the machine learning model (for example KMeansModel).
    */
   def createPMMLModelExport(model: Any): PMMLModelExport = {
@@ -44,7 +44,7 @@ private[mllib] object PMMLModelExportFactory {
         new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
       case svm: SVMModel =>
         new BinaryClassificationPMMLModelExport(
-          svm, "linear SVM", RegressionNormalizationMethodType.NONE, 
+          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
           svm.getThreshold.getOrElse(0.0))
       case logistic: LogisticRegressionModel =>
         if (logistic.numClasses == 2) {
@@ -60,5 +60,5 @@ private[mllib] object PMMLModelExportFactory {
           "PMML Export not supported for model: " + model.getClass.getName)
     }
   }
-  
+
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
index 7db5a14fd45a5..174d5e0f6c9f0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
@@ -234,7 +234,7 @@ object RandomRDDs {
    *
    * @param sc SparkContext used to create the RDD.
    * @param shape shape parameter (> 0) for the gamma distribution
-   * @param scale scale parameter (> 0) for the gamma distribution  
+   * @param scale scale parameter (> 0) for the gamma distribution
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
@@ -293,7 +293,7 @@ object RandomRDDs {
    *
    * @param sc SparkContext used to create the RDD.
    * @param mean mean for the log normal distribution
-   * @param std standard deviation for the log normal distribution  
+   * @param std standard deviation for the log normal distribution
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
@@ -671,7 +671,7 @@ object RandomRDDs {
    *
    * @param sc SparkContext used to create the RDD.
    * @param shape shape parameter (> 0) for the gamma distribution.
-   * @param scale scale parameter (> 0) for the gamma distribution. 
+   * @param scale scale parameter (> 0) for the gamma distribution.
    * @param numRows Number of Vectors in the RDD.
    * @param numCols Number of elements in each Vector.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index dddefe1944e9d..93290e6508529 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -175,7 +175,7 @@ class ALS private (
   /**
    * :: DeveloperApi ::
    * Sets storage level for final RDDs (user/product used in MatrixFactorizationModel). The default
-   * value is `MEMORY_AND_DISK`. Users can change it to a serialized storage, e.g. 
+   * value is `MEMORY_AND_DISK`. Users can change it to a serialized storage, e.g.
    * `MEMORY_AND_DISK_SER` and set `spark.rdd.compress` to `true` to reduce the space requirement,
    * at the cost of speed.
    */
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 96e50faca2b19..f3b46c75c05f3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -170,15 +170,15 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
     case class Data(boundary: Double, prediction: Double)
 
     def save(
-        sc: SparkContext, 
-        path: String, 
-        boundaries: Array[Double], 
-        predictions: Array[Double], 
+        sc: SparkContext,
+        path: String,
+        boundaries: Array[Double],
+        predictions: Array[Double],
         isotonic: Boolean): Unit = {
       val sqlContext = new SQLContext(sc)
 
       val metadata = compact(render(
-        ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ 
+        ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
           ("isotonic" -> isotonic)))
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path))
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
index cd6add9d60b0d..cf51b24ff777f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
@@ -29,102 +29,102 @@ import org.apache.spark.mllib.util.MLUtils
  * the event that the covariance matrix is singular, the density will be computed in a
  * reduced dimensional subspace under which the distribution is supported.
  * (see [[http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case]])
- * 
+ *
  * @param mu The mean vector of the distribution
  * @param sigma The covariance matrix of the distribution
  */
 @DeveloperApi
 class MultivariateGaussian (
-    val mu: Vector, 
+    val mu: Vector,
     val sigma: Matrix) extends Serializable {
 
   require(sigma.numCols == sigma.numRows, "Covariance matrix must be square")
   require(mu.size == sigma.numCols, "Mean vector length must match covariance matrix size")
-  
+
   private val breezeMu = mu.toBreeze.toDenseVector
-  
+
   /**
    * private[mllib] constructor
-   * 
+   *
    * @param mu The mean vector of the distribution
    * @param sigma The covariance matrix of the distribution
    */
   private[mllib] def this(mu: DBV[Double], sigma: DBM[Double]) = {
     this(Vectors.fromBreeze(mu), Matrices.fromBreeze(sigma))
   }
-  
+
   /**
    * Compute distribution dependent constants:
    *    rootSigmaInv = D^(-1/2)^ * U, where sigma = U * D * U.t
-   *    u = log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^) 
+   *    u = log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
    */
   private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants
-  
+
   /** Returns density of this multivariate Gaussian at given point, x */
   def pdf(x: Vector): Double = {
     pdf(x.toBreeze)
   }
-  
+
   /** Returns the log-density of this multivariate Gaussian at given point, x */
   def logpdf(x: Vector): Double = {
     logpdf(x.toBreeze)
   }
-  
+
   /** Returns density of this multivariate Gaussian at given point, x */
   private[mllib] def pdf(x: BV[Double]): Double = {
     math.exp(logpdf(x))
   }
-  
+
   /** Returns the log-density of this multivariate Gaussian at given point, x */
   private[mllib] def logpdf(x: BV[Double]): Double = {
     val delta = x - breezeMu
     val v = rootSigmaInv * delta
     u + v.t * v * -0.5
   }
-  
+
   /**
    * Calculate distribution dependent components used for the density function:
    *    pdf(x) = (2*pi)^(-k/2)^ * det(sigma)^(-1/2)^ * exp((-1/2) * (x-mu).t * inv(sigma) * (x-mu))
    * where k is length of the mean vector.
-   * 
-   * We here compute distribution-fixed parts 
+   *
+   * We here compute distribution-fixed parts
    *  log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
    * and
    *  D^(-1/2)^ * U, where sigma = U * D * U.t
-   *  
+   *
    * Both the determinant and the inverse can be computed from the singular value decomposition
    * of sigma.  Noting that covariance matrices are always symmetric and positive semi-definite,
    * we can use the eigendecomposition. We also do not compute the inverse directly; noting
-   * that 
-   * 
+   * that
+   *
    *    sigma = U * D * U.t
-   *    inv(Sigma) = U * inv(D) * U.t 
+   *    inv(Sigma) = U * inv(D) * U.t
    *               = (D^{-1/2}^ * U).t * (D^{-1/2}^ * U)
-   * 
+   *
    * and thus
-   * 
+   *
    *    -0.5 * (x-mu).t * inv(Sigma) * (x-mu) = -0.5 * norm(D^{-1/2}^ * U  * (x-mu))^2^
-   *  
-   * To guard against singular covariance matrices, this method computes both the 
+   *
+   * To guard against singular covariance matrices, this method computes both the
    * pseudo-determinant and the pseudo-inverse (Moore-Penrose).  Singular values are considered
    * to be non-zero only if they exceed a tolerance based on machine precision, matrix size, and
    * relation to the maximum singular value (same tolerance used by, e.g., Octave).
    */
   private def calculateCovarianceConstants: (DBM[Double], Double) = {
     val eigSym.EigSym(d, u) = eigSym(sigma.toBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t
-    
+
     // For numerical stability, values are considered to be non-zero only if they exceed tol.
     // This prevents any inverted value from exceeding (eps * n * max(d))^-1
     val tol = MLUtils.EPSILON * max(d) * d.length
-    
+
     try {
       // log(pseudo-determinant) is sum of the logs of all non-zero singular values
       val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum
-      
-      // calculate the root-pseudo-inverse of the diagonal matrix of singular values 
+
+      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
       // by inverting the square root of all non-zero values
       val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))
-    
+
       (pinvS * u, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
     } catch {
       case uex: UnsupportedOperationException =>
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index e3ddc7053693c..a835f96d5d0e3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -270,7 +270,7 @@ object GradientBoostedTrees extends Logging {
     logInfo(s"$timer")
 
     if (persistedInput) input.unpersist()
-    
+
     if (validate) {
       new GradientBoostedTreesModel(
         boostingStrategy.treeStrategy.algo,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 99d0e3cf2fd6d..069959976a188 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -474,7 +474,7 @@ object RandomForest extends Serializable with Logging {
       val (treeIndex, node) = nodeQueue.head
       // Choose subset of features for node (if subsampling).
       val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
-        Some(SamplingUtils.reservoirSampleAndCount(Range(0, 
+        Some(SamplingUtils.reservoirSampleAndCount(Range(0,
           metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong)._1)
       } else {
         None
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 681f4c618d302..541f3288b6c43 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -265,7 +265,7 @@ object MLUtils {
     }
     Vectors.fromBreeze(vector1)
   }
- 
+
   /**
    * Returns the squared Euclidean distance between two vectors. The following formula will be used
    * if it does not introduce too much numerical error:
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index 9da0618abd23c..36a1ac6b7996d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -38,7 +38,7 @@ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext
     val dataset = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
         6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
-    
+
     /**
      * Using the following R code to load the data, train the model and evaluate metrics.
      *
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
index d4631518e0f5b..7953bd0417191 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
@@ -47,7 +47,7 @@ class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("Binarize continuous features with setter") {
     val threshold: Double = 0.2
-    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) 
+    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
     val dataFrame: DataFrame = sqlContext.createDataFrame(
         data.zip(thresholdBinarized)).toDF("feature", "expected")
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
index a3b085e441491..b218d72f1268a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
@@ -46,7 +46,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
 
   }
-  
+
   test("two clusters") {
     val data = sc.parallelize(GaussianTestData.data)
 
@@ -62,7 +62,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext {
     val Ew = Array(1.0 / 3.0, 2.0 / 3.0)
     val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604))
     val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644)))
-    
+
     val gmm = new GaussianMixture()
       .setK(2)
       .setInitialModel(initialGmm)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
index 3903712879928..19e65f1b53ab5 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
@@ -56,7 +56,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon
       predictions(a.cluster) += a.id
     }
     assert(predictions.toSet == Set((0 to 3).toSet, (4 to 15).toSet))
- 
+
     val model2 = new PowerIterationClustering()
       .setK(2)
       .setInitializationMode("degree")
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
index bcc2e657f3fd4..b0f3f71113c57 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
@@ -139,7 +139,7 @@ class BLASSuite extends SparkFunSuite {
     syr(alpha, x, dA)
 
     assert(dA ~== expected absTol 1e-15)
- 
+
     val dB =
       new DenseMatrix(3, 4, Array(0.0, 1.2, 2.2, 3.1, 1.2, 3.2, 5.3, 4.6, 2.2, 5.3, 1.8, 3.0))
 
@@ -148,7 +148,7 @@ class BLASSuite extends SparkFunSuite {
         syr(alpha, x, dB)
       }
     }
- 
+
     val dC =
       new DenseMatrix(3, 3, Array(0.0, 1.2, 2.2, 1.2, 3.2, 5.3, 2.2, 5.3, 1.8))
 
@@ -157,7 +157,7 @@ class BLASSuite extends SparkFunSuite {
         syr(alpha, x, dC)
       }
     }
- 
+
     val y = new DenseVector(Array(0.0, 2.7, 3.5, 2.1, 1.5))
 
     withClue("Size of vector must match the rank of matrix") {
@@ -255,13 +255,13 @@ class BLASSuite extends SparkFunSuite {
     val dA =
       new DenseMatrix(4, 3, Array(0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0))
     val sA = new SparseMatrix(4, 3, Array(0, 1, 3, 4), Array(1, 0, 2, 3), Array(1.0, 2.0, 1.0, 3.0))
- 
+
     val dA2 =
       new DenseMatrix(4, 3, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0), true)
     val sA2 =
       new SparseMatrix(4, 3, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0),
         true)
- 
+
     val dx = new DenseVector(Array(1.0, 2.0, 3.0))
     val sx = dx.toSparse
     val expected = new DenseVector(Array(4.0, 1.0, 2.0, 9.0))
@@ -270,7 +270,7 @@ class BLASSuite extends SparkFunSuite {
     assert(sA.multiply(dx) ~== expected absTol 1e-15)
     assert(dA.multiply(sx) ~== expected absTol 1e-15)
     assert(sA.multiply(sx) ~== expected absTol 1e-15)
- 
+
     val y1 = new DenseVector(Array(1.0, 3.0, 1.0, 0.0))
     val y2 = y1.copy
     val y3 = y1.copy
@@ -287,7 +287,7 @@ class BLASSuite extends SparkFunSuite {
     val y14 = y1.copy
     val y15 = y1.copy
     val y16 = y1.copy
- 
+
     val expected2 = new DenseVector(Array(6.0, 7.0, 4.0, 9.0))
     val expected3 = new DenseVector(Array(10.0, 8.0, 6.0, 18.0))
 
@@ -295,42 +295,42 @@ class BLASSuite extends SparkFunSuite {
     gemv(1.0, sA, dx, 2.0, y2)
     gemv(1.0, dA, sx, 2.0, y3)
     gemv(1.0, sA, sx, 2.0, y4)
- 
+
     gemv(1.0, dA2, dx, 2.0, y5)
     gemv(1.0, sA2, dx, 2.0, y6)
     gemv(1.0, dA2, sx, 2.0, y7)
     gemv(1.0, sA2, sx, 2.0, y8)
- 
+
     gemv(2.0, dA, dx, 2.0, y9)
     gemv(2.0, sA, dx, 2.0, y10)
     gemv(2.0, dA, sx, 2.0, y11)
     gemv(2.0, sA, sx, 2.0, y12)
- 
+
     gemv(2.0, dA2, dx, 2.0, y13)
     gemv(2.0, sA2, dx, 2.0, y14)
     gemv(2.0, dA2, sx, 2.0, y15)
     gemv(2.0, sA2, sx, 2.0, y16)
- 
+
     assert(y1 ~== expected2 absTol 1e-15)
     assert(y2 ~== expected2 absTol 1e-15)
     assert(y3 ~== expected2 absTol 1e-15)
     assert(y4 ~== expected2 absTol 1e-15)
- 
+
     assert(y5 ~== expected2 absTol 1e-15)
     assert(y6 ~== expected2 absTol 1e-15)
     assert(y7 ~== expected2 absTol 1e-15)
     assert(y8 ~== expected2 absTol 1e-15)
- 
+
     assert(y9 ~== expected3 absTol 1e-15)
     assert(y10 ~== expected3 absTol 1e-15)
     assert(y11 ~== expected3 absTol 1e-15)
     assert(y12 ~== expected3 absTol 1e-15)
- 
+
     assert(y13 ~== expected3 absTol 1e-15)
     assert(y14 ~== expected3 absTol 1e-15)
     assert(y15 ~== expected3 absTol 1e-15)
     assert(y16 ~== expected3 absTol 1e-15)
- 
+
     withClue("columns of A don't match the rows of B") {
       intercept[Exception] {
         gemv(1.0, dA.transpose, dx, 2.0, y1)
@@ -345,12 +345,12 @@ class BLASSuite extends SparkFunSuite {
         gemv(1.0, sA.transpose, sx, 2.0, y1)
       }
     }
- 
+
     val dAT =
       new DenseMatrix(3, 4, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0))
     val sAT =
       new SparseMatrix(3, 4, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0))
- 
+
     val dATT = dAT.transpose
     val sATT = sAT.transpose
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index c6d29dcdb0f2b..c4ae0a16f7c04 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -214,13 +214,13 @@ class VectorsSuite extends SparkFunSuite {
 
       val squaredDist = breezeSquaredDistance(sparseVector1.toBreeze, sparseVector2.toBreeze)
 
-      // SparseVector vs. SparseVector 
-      assert(Vectors.sqdist(sparseVector1, sparseVector2) ~== squaredDist relTol 1E-8) 
+      // SparseVector vs. SparseVector
+      assert(Vectors.sqdist(sparseVector1, sparseVector2) ~== squaredDist relTol 1E-8)
       // DenseVector  vs. SparseVector
       assert(Vectors.sqdist(denseVector1, sparseVector2) ~== squaredDist relTol 1E-8)
       // DenseVector  vs. DenseVector
       assert(Vectors.sqdist(denseVector1, denseVector2) ~== squaredDist relTol 1E-8)
-    }    
+    }
   }
 
   test("foreachActive") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
index 7a724fc78b1d9..4c6e76e47419b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
@@ -53,13 +53,13 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
     // ensure logistic regression has normalization method set to LOGIT
     assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.LOGIT)
   }
-  
+
   test("linear SVM PMML export") {
     val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
     val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label)
-    
+
     val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel)
-    
+
     // assert that the PMML format is as expected
     assert(svmModelExport.isInstanceOf[PMMLModelExport])
     val pmml = svmModelExport.getPmml
@@ -80,5 +80,5 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
     // ensure linear SVM has normalization method set to NONE
     assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.NONE)
   }
-  
+
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
index a1a683559a54c..b3f9750afa730 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
@@ -45,5 +45,5 @@ class KMeansPMMLModelExportSuite extends SparkFunSuite {
     val pmmlClusteringModel = pmml.getModels.get(0).asInstanceOf[ClusteringModel]
     assert(pmmlClusteringModel.getNumberOfClusters === clusterCenters.length)
   }
-  
+
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
index 0d194005a30b2..af49450961750 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
@@ -60,25 +60,25 @@ class PMMLModelExportFactorySuite extends SparkFunSuite {
   test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport "
     + "when passing a LogisticRegressionModel or SVMModel") {
     val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
-    
+
     val logisticRegressionModel =
       new LogisticRegressionModel(linearInput(0).features, linearInput(0).label)
     val logisticRegressionModelExport =
       PMMLModelExportFactory.createPMMLModelExport(logisticRegressionModel)
     assert(logisticRegressionModelExport.isInstanceOf[BinaryClassificationPMMLModelExport])
-    
+
     val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label)
     val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel)
     assert(svmModelExport.isInstanceOf[BinaryClassificationPMMLModelExport])
   }
-  
+
   test("PMMLModelExportFactory throw IllegalArgumentException "
     + "when passing a Multinomial Logistic Regression") {
     /** 3 classes, 2 features */
     val multiclassLogisticRegressionModel = new LogisticRegressionModel(
-      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0, 
+      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
       numFeatures = 2, numClasses = 3)
-    
+
     intercept[IllegalArgumentException] {
       PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel)
     }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
index 703b623536315..aa60deb665aeb 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
@@ -26,39 +26,39 @@ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext
   test("univariate") {
     val x1 = Vectors.dense(0.0)
     val x2 = Vectors.dense(1.5)
-                     
+
     val mu = Vectors.dense(0.0)
     val sigma1 = Matrices.dense(1, 1, Array(1.0))
     val dist1 = new MultivariateGaussian(mu, sigma1)
     assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
     assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)
-    
+
     val sigma2 = Matrices.dense(1, 1, Array(4.0))
     val dist2 = new MultivariateGaussian(mu, sigma2)
     assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
     assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
   }
-  
+
   test("multivariate") {
     val x1 = Vectors.dense(0.0, 0.0)
     val x2 = Vectors.dense(1.0, 1.0)
-    
+
     val mu = Vectors.dense(0.0, 0.0)
     val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
     val dist1 = new MultivariateGaussian(mu, sigma1)
     assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
     assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)
-    
+
     val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
     val dist2 = new MultivariateGaussian(mu, sigma2)
     assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
     assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
   }
-  
+
   test("multivariate degenerate") {
     val x1 = Vectors.dense(0.0, 0.0)
     val x2 = Vectors.dense(1.0, 1.0)
-    
+
     val mu = Vectors.dense(0.0, 0.0)
     val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
     val dist = new MultivariateGaussian(mu, sigma)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
index 87b3661f77944..734b7babec7be 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -62,7 +62,7 @@ class MLUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {
       val fastSquaredDist3 =
         fastSquaredDistance(v2, norm2, v3, norm3, precision)
       assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m")
-      if (m > 10) { 
+      if (m > 10) {
         val v4 = Vectors.sparse(n, indices.slice(0, m - 10),
           indices.map(i => a(i) + 0.5).slice(0, m - 10))
         val norm4 = Vectors.norm(v4, 2.0)

From 0674700303da3e4737d73f5fabd2a925ec712f63 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Sun, 31 May 2015 11:51:49 -0700
Subject: [PATCH 287/525] [SPARK-7949] [MLLIB] [DOC] update document with some
 missing save/load

add save load for examples:
KMeansModel
PowerIterationClusteringModel
Word2VecModel
IsotonicRegressionModel

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #6498 from hhbyyh/docSaveLoad and squashes the following commits:

7f9f06d [Yuhao Yang] add missing imports
c604cad [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into docSaveLoad
1dd77cc [Yuhao Yang] update document with some missing save/load
---
 docs/mllib-clustering.md          | 28 ++++++++++++++++++++++++----
 docs/mllib-feature-extraction.md  |  6 +++++-
 docs/mllib-isotonic-regression.md | 10 +++++++++-
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index f41ca70952eb7..dac22f736e8cb 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -47,7 +47,7 @@ Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasin
 optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
 
 {% highlight scala %}
-import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
 import org.apache.spark.mllib.linalg.Vectors
 
 // Load and parse the data
@@ -62,6 +62,10 @@ val clusters = KMeans.train(parsedData, numClusters, numIterations)
 // Evaluate clustering by computing Within Set Sum of Squared Errors
 val WSSSE = clusters.computeCost(parsedData)
 println("Within Set Sum of Squared Errors = " + WSSSE)
+
+// Save and load model
+clusters.save(sc, "myModelPath")
+val sameModel = KMeansModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 
@@ -110,6 +114,10 @@ public class KMeansExample {
     // Evaluate clustering by computing Within Set Sum of Squared Errors
     double WSSSE = clusters.computeCost(parsedData.rdd());
     System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
+
+    // Save and load model
+    clusters.save(sc.sc(), "myModelPath");
+    KMeansModel sameModel = KMeansModel.load(sc.sc(), "myModelPath");
   }
 }
 {% endhighlight %}
@@ -124,7 +132,7 @@ Within Set Sum of Squared Error (WSSSE). You can reduce this error measure by in
 fact the optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
 
 {% highlight python %}
-from pyspark.mllib.clustering import KMeans
+from pyspark.mllib.clustering import KMeans, KMeansModel
 from numpy import array
 from math import sqrt
 
@@ -143,6 +151,10 @@ def error(point):
 
 WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
 print("Within Set Sum of Squared Error = " + str(WSSSE))
+
+# Save and load model
+clusters.save(sc, "myModelPath")
+sameModel = KMeansModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 
@@ -312,12 +324,12 @@ Calling `PowerIterationClustering.run` returns a
 which contains the computed clustering assignments.
 
 {% highlight scala %}
-import org.apache.spark.mllib.clustering.PowerIterationClustering
+import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
 import org.apache.spark.mllib.linalg.Vectors
 
 val similarities: RDD[(Long, Long, Double)] = ...
 
-val pic = new PowerIteartionClustering()
+val pic = new PowerIterationClustering()
   .setK(3)
   .setMaxIterations(20)
 val model = pic.run(similarities)
@@ -325,6 +337,10 @@ val model = pic.run(similarities)
 model.assignments.foreach { a =>
   println(s"${a.id} -> ${a.cluster}")
 }
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")
 {% endhighlight %}
 
 A full example that produces the experiment described in the PIC paper can be found under
@@ -360,6 +376,10 @@ PowerIterationClusteringModel model = pic.run(similarities);
 for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) {
   System.out.println(a.id() + " -> " + a.cluster());
 }
+
+// Save and load model
+model.save(sc.sc(), "myModelPath");
+PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc.sc(), "myModelPath");
 {% endhighlight %}
 </div>
 
diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 1f6ad8b13d730..4fe470a8de810 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -188,7 +188,7 @@ Here we assume the extracted file is `text8` and in same directory as you run th
 import org.apache.spark._
 import org.apache.spark.rdd._
 import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.Word2Vec
+import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
 
 val input = sc.textFile("text8").map(line => line.split(" ").toSeq)
 
@@ -201,6 +201,10 @@ val synonyms = model.findSynonyms("china", 40)
 for((synonym, cosineSimilarity) <- synonyms) {
   println(s"$synonym $cosineSimilarity")
 }
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = Word2VecModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 <div data-lang="python">
diff --git a/docs/mllib-isotonic-regression.md b/docs/mllib-isotonic-regression.md
index b521c2f27cd6e..5732bc4c7e79e 100644
--- a/docs/mllib-isotonic-regression.md
+++ b/docs/mllib-isotonic-regression.md
@@ -60,7 +60,7 @@ Model is created using the training set and a mean squared error is calculated f
 labels and real labels in the test set.
 
 {% highlight scala %}
-import org.apache.spark.mllib.regression.IsotonicRegression
+import org.apache.spark.mllib.regression.{IsotonicRegression, IsotonicRegressionModel}
 
 val data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt")
 
@@ -88,6 +88,10 @@ val predictionAndLabel = test.map { point =>
 // Calculate mean squared error between predicted and real labels.
 val meanSquaredError = predictionAndLabel.map{case(p, l) => math.pow((p - l), 2)}.mean()
 println("Mean Squared Error = " + meanSquaredError)
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = IsotonicRegressionModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 
@@ -150,6 +154,10 @@ Double meanSquaredError = new JavaDoubleRDD(predictionAndLabel.map(
 ).rdd()).mean();
 
 System.out.println("Mean Squared Error = " + meanSquaredError);
+
+// Save and load model
+model.save(sc.sc(), "myModelPath");
+IsotonicRegressionModel sameModel = IsotonicRegressionModel.load(sc.sc(), "myModelPath");
 {% endhighlight %}
 </div>
 </div>

From 866652c903d06d1cb4356283e0741119d84dcc21 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 14:23:42 -0700
Subject: [PATCH 288/525] [SPARK-3850] Turn style checker on for trailing
 whitespaces.

Author: Reynold Xin <rxin@databricks.com>

Closes #6541 from rxin/trailing-whitespace-on and squashes the following commits:

f72ebe4 [Reynold Xin] [SPARK-3850] Turn style checker on for trailing whitespaces.
---
 scalastyle-config.xml                                          | 3 +++
 .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala    | 2 +-
 .../org/apache/spark/sql/hive/execution/HiveQuerySuite.scala   | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 75ef1e964b5ac..f52b09551adde 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -50,6 +50,9 @@
  */]]></parameter>
   </parameters>
  </check>
+
+  <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
+
  <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
  <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
  <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="false"></check>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 5d106c1ac2674..b624eaa201ea4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -43,7 +43,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
 
   /**
    * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson
-   * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in 
+   * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in
    * MLlib's Statistics.
    *
    * @param col1 the name of the column
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 4af31d482ce42..440b7c87b0da2 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -57,7 +57,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
     // https://cwiki.apache.org/confluence/display/Hive/DeveloperGuide+UDTF
     sql(
       """
-        |CREATE TEMPORARY FUNCTION udtf_count2 
+        |CREATE TEMPORARY FUNCTION udtf_count2
         |AS 'org.apache.spark.sql.hive.execution.GenericUDTFCount2'
       """.stripMargin)
   }

From 46576ab303e50c54c3bd464f8939953efe644574 Mon Sep 17 00:00:00 2001
From: Sun Rui <rui.sun@intel.com>
Date: Sun, 31 May 2015 15:01:21 -0700
Subject: [PATCH 289/525] [SPARK-7227] [SPARKR] Support fillna / dropna in R
 DataFrame.

Author: Sun Rui <rui.sun@intel.com>

Closes #6183 from sun-rui/SPARK-7227 and squashes the following commits:

dd6f5b3 [Sun Rui] Rename readEnv() back to readMap(). Add alias na.omit() for dropna().
41cf725 [Sun Rui] [SPARK-7227][SPARKR] Support fillna / dropna in R DataFrame.
---
 R/pkg/NAMESPACE                               |   2 +
 R/pkg/R/DataFrame.R                           | 125 ++++++++++++++++++
 R/pkg/R/generics.R                            |  18 +++
 R/pkg/R/serialize.R                           |  10 +-
 R/pkg/inst/tests/test_sparkSQL.R              | 109 +++++++++++++++
 .../scala/org/apache/spark/api/r/SerDe.scala  |   6 +-
 6 files changed, 267 insertions(+), 3 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 411126a377950..f9447f6c3288d 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -19,9 +19,11 @@ exportMethods("arrange",
               "count",
               "describe",
               "distinct",
+              "dropna",
               "dtypes",
               "except",
               "explain",
+              "fillna",
               "filter",
               "first",
               "group_by",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index e79d324838fe3..0af5cb8881e35 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1429,3 +1429,128 @@ setMethod("describe",
             sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
             dataFrame(sdf)
           })
+
+#' dropna
+#'
+#' Returns a new DataFrame omitting rows with null values.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param how "any" or "all".
+#'            if "any", drop a row if it contains any nulls.
+#'            if "all", drop a row only if all its values are null.
+#'            if minNonNulls is specified, how is ignored.
+#' @param minNonNulls If specified, drop rows that have less than
+#'                    minNonNulls non-null values.
+#'                    This overwrites the how parameter.
+#' @param cols Optional list of column names to consider.
+#' @return A DataFrame
+#' 
+#' @rdname nafunctions
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' dropna(df)
+#' }
+setMethod("dropna",
+          signature(x = "DataFrame"),
+          function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+            how <- match.arg(how)
+            if (is.null(cols)) {
+              cols <- columns(x)
+            }
+            if (is.null(minNonNulls)) {
+              minNonNulls <- if (how == "any") { length(cols) } else { 1 }
+            }
+            
+            naFunctions <- callJMethod(x@sdf, "na")
+            sdf <- callJMethod(naFunctions, "drop",
+                               as.integer(minNonNulls), listToSeq(as.list(cols)))
+            dataFrame(sdf)
+          })
+
+#' @aliases dropna
+#' @export
+setMethod("na.omit",
+          signature(x = "DataFrame"),
+          function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+            dropna(x, how, minNonNulls, cols)
+          })
+
+#' fillna
+#'
+#' Replace null values.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param value Value to replace null values with.
+#'              Should be an integer, numeric, character or named list.
+#'              If the value is a named list, then cols is ignored and
+#'              value must be a mapping from column name (character) to 
+#'              replacement value. The replacement value must be an
+#'              integer, numeric or character.
+#' @param cols optional list of column names to consider.
+#'             Columns specified in cols that do not have matching data
+#'             type are ignored. For example, if value is a character, and 
+#'             subset contains a non-character column, then the non-character
+#'             column is simply ignored.
+#' @return A DataFrame
+#' 
+#' @rdname nafunctions
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' fillna(df, 1)
+#' fillna(df, list("age" = 20, "name" = "unknown"))
+#' }
+setMethod("fillna",
+          signature(x = "DataFrame"),
+          function(x, value, cols = NULL) {
+            if (!(class(value) %in% c("integer", "numeric", "character", "list"))) {
+              stop("value should be an integer, numeric, charactor or named list.")
+            }
+            
+            if (class(value) == "list") {
+              # Check column names in the named list
+              colNames <- names(value)
+              if (length(colNames) == 0 || !all(colNames != "")) {
+                stop("value should be an a named list with each name being a column name.")
+              }
+              
+              # Convert to the named list to an environment to be passed to JVM
+              valueMap <- new.env()
+              for (col in colNames) {
+                # Check each item in the named list is of valid type
+                v <- value[[col]]
+                if (!(class(v) %in% c("integer", "numeric", "character"))) {
+                  stop("Each item in value should be an integer, numeric or charactor.")
+                }
+                valueMap[[col]] <- v
+              }
+              
+              # When value is a named list, caller is expected not to pass in cols
+              if (!is.null(cols)) {
+                warning("When value is a named list, cols is ignored!")
+                cols <- NULL
+              }
+              
+              value <- valueMap
+            } else if (is.integer(value)) {
+              # Cast an integer to a numeric
+              value <- as.numeric(value)
+            }
+            
+            naFunctions <- callJMethod(x@sdf, "na")
+            sdf <- if (length(cols) == 0) {
+              callJMethod(naFunctions, "fill", value)
+            } else {
+              callJMethod(naFunctions, "fill", value, listToSeq(as.list(cols)))
+            }
+            dataFrame(sdf)
+          })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 1f4fc6adaca8d..12e09176c9f92 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -396,6 +396,20 @@ setGeneric("columns", function(x) {standardGeneric("columns") })
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
 
+#' @rdname nafunctions
+#' @export
+setGeneric("dropna",
+           function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { 
+             standardGeneric("dropna") 
+           })
+
+#' @rdname nafunctions
+#' @export
+setGeneric("na.omit",
+           function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { 
+             standardGeneric("na.omit") 
+           })
+
 #' @rdname schema
 #' @export
 setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
@@ -408,6 +422,10 @@ setGeneric("explain", function(x, ...) { standardGeneric("explain") })
 #' @export
 setGeneric("except", function(x, y) { standardGeneric("except") })
 
+#' @rdname nafunctions
+#' @export
+setGeneric("fillna", function(x, value, cols = NULL) { standardGeneric("fillna") })
+
 #' @rdname filter
 #' @export
 setGeneric("filter", function(x, condition) { standardGeneric("filter") })
diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
index c53d0a961016f..2081786e6f833 100644
--- a/R/pkg/R/serialize.R
+++ b/R/pkg/R/serialize.R
@@ -160,6 +160,14 @@ writeList <- function(con, arr) {
   }
 }
 
+# Used to pass arrays where the elements can be of different types
+writeGenericList <- function(con, list) {
+  writeInt(con, length(list))
+  for (elem in list) {
+    writeObject(con, elem)
+  }
+}
+  
 # Used to pass in hash maps required on Java side.
 writeEnv <- function(con, env) {
   len <- length(env)
@@ -168,7 +176,7 @@ writeEnv <- function(con, env) {
   if (len > 0) {
     writeList(con, as.list(ls(env)))
     vals <- lapply(ls(env), function(x) { env[[x]] })
-    writeList(con, as.list(vals))
+    writeGenericList(con, as.list(vals))
   }
 }
 
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 1857e636e8577..d2d82e791e876 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -32,6 +32,15 @@ jsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp")
 parquetPath <- tempfile(pattern="sparkr-test", fileext=".parquet")
 writeLines(mockLines, jsonPath)
 
+# For test nafunctions, like dropna(), fillna(),...
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+                 "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+                 "{\"name\":\"David\",\"age\":60,\"height\":null}",
+                 "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
+                 "{\"name\":null,\"age\":null,\"height\":null}")
+jsonPathNa <- tempfile(pattern="sparkr-test", fileext=".tmp")
+writeLines(mockLinesNa, jsonPathNa)
+
 test_that("infer types", {
   expect_equal(infer_type(1L), "integer")
   expect_equal(infer_type(1.0), "double")
@@ -765,5 +774,105 @@ test_that("describe() on a DataFrame", {
   expect_equal(collect(stats)[5, "age"], "30")
 })
 
+test_that("dropna() on a DataFrame", {
+  df <- jsonFile(sqlContext, jsonPathNa)
+  rows <- collect(df)
+
+  # drop with columns
+  
+  expected <- rows[!is.na(rows$name),]
+  actual <- collect(dropna(df, cols = "name"))
+  expect_true(identical(expected, actual))
+
+  expected <- rows[!is.na(rows$age),]
+  actual <- collect(dropna(df, cols = "age"))
+  row.names(expected) <- row.names(actual)
+  # identical on two dataframes does not work here. Don't know why.
+  # use identical on all columns as a workaround.
+  expect_true(identical(expected$age, actual$age))
+  expect_true(identical(expected$height, actual$height))
+  expect_true(identical(expected$name, actual$name))
+  
+  expected <- rows[!is.na(rows$age) & !is.na(rows$height),]
+  actual <- collect(dropna(df, cols = c("age", "height")))
+  expect_true(identical(expected, actual))
+
+  expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),]
+  actual <- collect(dropna(df))
+  expect_true(identical(expected, actual))
+  
+  # drop with how
+
+  expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),]
+  actual <- collect(dropna(df))
+  expect_true(identical(expected, actual))
+
+  expected <- rows[!is.na(rows$age) | !is.na(rows$height) | !is.na(rows$name),]
+  actual <- collect(dropna(df, "all"))
+  expect_true(identical(expected, actual))
+  
+  expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),]
+  actual <- collect(dropna(df, "any"))
+  expect_true(identical(expected, actual))
+
+  expected <- rows[!is.na(rows$age) & !is.na(rows$height),]
+  actual <- collect(dropna(df, "any", cols = c("age", "height")))
+  expect_true(identical(expected, actual))
+
+  expected <- rows[!is.na(rows$age) | !is.na(rows$height),]
+  actual <- collect(dropna(df, "all", cols = c("age", "height")))
+  expect_true(identical(expected, actual))
+  
+  # drop with threshold
+  
+  expected <- rows[as.integer(!is.na(rows$age)) + as.integer(!is.na(rows$height)) >= 2,]
+  actual <- collect(dropna(df, minNonNulls = 2, cols = c("age", "height")))
+  expect_true(identical(expected, actual))  
+
+  expected <- rows[as.integer(!is.na(rows$age)) + 
+                   as.integer(!is.na(rows$height)) +
+                   as.integer(!is.na(rows$name)) >= 3,]
+  actual <- collect(dropna(df, minNonNulls = 3, cols = c("name", "age", "height")))
+  expect_true(identical(expected, actual))
+})
+
+test_that("fillna() on a DataFrame", {
+  df <- jsonFile(sqlContext, jsonPathNa)
+  rows <- collect(df)
+  
+  # fill with value
+  
+  expected <- rows
+  expected$age[is.na(expected$age)] <- 50
+  expected$height[is.na(expected$height)] <- 50.6
+  actual <- collect(fillna(df, 50.6))
+  expect_true(identical(expected, actual))
+
+  expected <- rows
+  expected$name[is.na(expected$name)] <- "unknown"
+  actual <- collect(fillna(df, "unknown"))
+  expect_true(identical(expected, actual))
+
+  expected <- rows
+  expected$age[is.na(expected$age)] <- 50
+  actual <- collect(fillna(df, 50.6, "age"))
+  expect_true(identical(expected, actual))
+
+  expected <- rows
+  expected$name[is.na(expected$name)] <- "unknown"
+  actual <- collect(fillna(df, "unknown", c("age", "name")))
+  expect_true(identical(expected, actual))
+  
+  # fill with named list
+
+  expected <- rows
+  expected$age[is.na(expected$age)] <- 50
+  expected$height[is.na(expected$height)] <- 50.6
+  expected$name[is.na(expected$name)] <- "unknown"
+  actual <- collect(fillna(df, list("age" = 50, "height" = 50.6, "name" = "unknown")))
+  expect_true(identical(expected, actual))  
+})
+
 unlink(parquetPath)
 unlink(jsonPath)
+unlink(jsonPathNa)
diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
index 371dfe454d1a2..f8e3f1a79082e 100644
--- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
@@ -157,9 +157,11 @@ private[spark] object SerDe {
       val keysLen = readInt(in)
       val keys = (0 until keysLen).map(_ => readTypedObject(in, keysType))
 
-      val valuesType = readObjectType(in)
       val valuesLen = readInt(in)
-      val values = (0 until valuesLen).map(_ => readTypedObject(in, valuesType))
+      val values = (0 until valuesLen).map(_ => {
+        val valueType = readObjectType(in)
+        readTypedObject(in, valueType)
+      })
       mapAsJavaMap(keys.zip(values).toMap)
     } else {
       new java.util.HashMap[Object, Object]()

From 9126ea4d1c5c468f3662e76e0231b4d64c7c9699 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sun, 31 May 2015 15:17:05 -0700
Subject: [PATCH 290/525] [MINOR] Enable PySpark SQL readerwriter and window
 tests

PySpark SQL's `readerwriter` and `window` doctests weren't being run by our test runner script; this patch re-enables them.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6542 from JoshRosen/enable-more-pyspark-sql-tests and squashes the following commits:

9f46ce4 [Josh Rosen] Enable PySpark SQL readerwriter and window tests.
---
 python/run-tests | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/run-tests b/python/run-tests
index fcfb49556b7cf..17dda3eadac0c 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -76,6 +76,8 @@ function run_sql_tests() {
     run_test "pyspark.sql.dataframe"
     run_test "pyspark.sql.group"
     run_test "pyspark.sql.functions"
+    run_test "pyspark.sql.readwriter"
+    run_test "pyspark.sql.window"
     run_test "pyspark.sql.tests"
 }
 

From 6f006b5f5fca649ac51745212d8fd44b1609b9ae Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 18:04:57 -0700
Subject: [PATCH 291/525] [SPARK-7986] Split scalastyle config into 3 sections.

(1) rules that we enforce.
(2) rules that we would like to enforce, but haven't cleaned up the codebase to
    turn on yet (or we need to make the scalastyle rule more configurable).
(3) rules that we don't want to enforce.

Author: Reynold Xin <rxin@databricks.com>

Closes #6543 from rxin/scalastyle and squashes the following commits:

beefaab [Reynold Xin] [SPARK-7986] Split scalastyle config into 3 sections.
---
 scalastyle-config.xml | 290 +++++++++++++++++++++++++-----------------
 1 file changed, 174 insertions(+), 116 deletions(-)

diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index f52b09551adde..d6f927b6fa803 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -14,25 +14,41 @@
   ~ See the License for the specific language governing permissions and
   ~ limitations under the License.
   -->
-<!-- If you wish to turn off checking for a section of code, you can put a comment in the source
- before and after the section, with the following syntax: -->
-<!-- // scalastyle:off -->
-<!-- ... -->
-<!-- // naughty stuff -->
-<!-- ... -->
-<!-- // scalastyle:on -->
+<!--
+
+If you wish to turn off checking for a section of code, you can put a comment in the source
+before and after the section, with the following syntax:
+
+  // scalastyle:off
+  ...  // stuff that breaks the styles
+  // scalastyle:on
+
+You can also disable only one rule, by specifying its rule id, as specified in:
+  http://www.scalastyle.org/rules-0.7.0.html
+
+  // scalastyle:off no.finalize
+  override def finalize(): Unit = ...
+  // scalastyle:on no.finalize
+
+This file is divided into 3 sections:
+ (1) rules that we enforce.
+ (2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet
+     (or we need to make the scalastyle rule more configurable).
+ (3) rules that we don't want to enforce.
+-->
 
 <scalastyle>
- <name>Scalastyle standard configuration</name>
- <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
- <!-- <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="maxFileLength"><![CDATA[800]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
- <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
-  <parameters>
-   <parameter name="header"><![CDATA[/*
+  <name>Scalastyle standard configuration</name>
+
+  <!-- ================================================================================ -->
+  <!--                               rules we enforce                                   -->
+  <!-- ================================================================================ -->
+
+  <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
+
+  <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
+    <parameters>
+       <parameter name="header"><![CDATA[/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -48,122 +64,164 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */]]></parameter>
-  </parameters>
- </check>
+    </parameters>
+  </check>
+
+  <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
+
+  <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
 
   <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
 
- <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="false"></check>
- <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
-  <parameters>
-   <parameter name="maxLineLength"><![CDATA[100]]></parameter>
-   <parameter name="tabSize"><![CDATA[2]]></parameter>
-   <parameter name="ignoreImports">true</parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="false"></check>
- <!-- <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
- <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
-  <parameters>
-   <parameter name="maxParameters"><![CDATA[10]]></parameter>
-  </parameters>
- </check>
- <!-- <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="ignore"><![CDATA[-1,0,1,2,3]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
- <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check>
- <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check>
- <!-- <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="true"></check> -->
- <!-- <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="true"></check> -->
- <!-- <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check> -->
+  <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
+    <parameters>
+      <parameter name="maxLineLength"><![CDATA[100]]></parameter>
+      <parameter name="tabSize"><![CDATA[2]]></parameter>
+      <parameter name="ignoreImports">true</parameter>
+    </parameters>
+  </check>
+
+  <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
+    <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
+  </check>
+
+  <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
+    <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
+  </check>
+
+  <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
+    <parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters>
+  </check>
+
+  <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
+    <parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters>
+  </check>
+
   <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
+
   <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
- <!-- <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check> -->
- <!-- <check level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="regex"><![CDATA[println]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
- <!-- <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="maxTypes"><![CDATA[30]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
- <!-- <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="maximum"><![CDATA[10]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
 
   <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
 
- <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check>
- <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
-  <parameters>
-   <parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
-   <parameter name="doubleLineAllowed"><![CDATA[true]]></parameter>
-  </parameters>
- </check>
- <!-- <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="maxLength"><![CDATA[50]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
- <!-- <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
- <!-- <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="true"> -->
- <!--  <parameters> -->
- <!--   <parameter name="maxMethods"><![CDATA[30]]></parameter> -->
- <!--  </parameters> -->
- <!-- </check> -->
- <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
- <check level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
+  <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
+
+  <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
+    <parameters>
+      <parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
+      <parameter name="doubleLineAllowed"><![CDATA[true]]></parameter>
+    </parameters>
+  </check>
+
+  <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check>
+
+  <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
+
+  <check level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
+
+  <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
+
+  <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
    <parameters>
      <parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
    </parameters>
- </check>
+  </check>
+
   <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true">
     <parameters>
      <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
     </parameters>
   </check>
+
+  <!-- ??? usually shouldn't be checked into the code base. -->
   <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>
+
   <!-- As of SPARK-7558, all tests in Spark should extend o.a.s.SparkFunSuite instead of FunSuite directly -->
-  <check level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
-   <parameters>
-    <parameter name="regex">^FunSuite[A-Za-z]*$</parameter>
-   </parameters>
-   <customMessage>Tests must extend org.apache.spark.SparkFunSuite instead.</customMessage>
+  <check customId="funsuite" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
+    <parameters><parameter name="regex">^FunSuite[A-Za-z]*$</parameter></parameters>
+    <customMessage>Tests must extend org.apache.spark.SparkFunSuite instead.</customMessage>
+  </check>
+
+  <!-- ================================================================================ -->
+  <!--       rules we'd like to enforce, but haven't cleaned up the codebase yet        -->
+  <!-- ================================================================================ -->
+
+  <!-- SPARK-7977 We should turn this on, but we'd need to add whitelist to files that are using it first. -->
+  <check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="false">
+    <parameters><parameter name="regex">^println$</parameter></parameters>
+    <customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with
+      // scalastyle:off println
+      println(...)
+      // scalastyle:on println]]></customMessage>
+  </check>
+
+  <!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. -->
+  <!-- Ideally the following two rules should be configurable to rule out string interpolation. -->
+  <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check>
+  <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check>
+
+  <!-- This breaks symbolic method names so we don't turn it on. -->
+  <!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. -->
+  <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
+    <parameters>
+    <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
+    </parameters>
+  </check>
+
+  <!-- Should turn this on, but we have a few places that need to be fixed first -->
+  <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="false"></check>
+
+  <!-- ================================================================================ -->
+  <!--                               rules we don't want                                -->
+  <!-- ================================================================================ -->
+
+  <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false">
+    <parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters>
   </check>
+
+  <!-- We want the opposite of this: NewLineAtEofChecker -->
+  <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
+
+  <!-- This one complains about all kinds of random things. Disable. -->
+  <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check>
+
+  <!-- We use return quite a bit for control flows and guards -->
+  <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check>
+
+  <!-- We use null a lot in low level code and to interface with 3rd party code -->
+  <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check>
+
+  <!-- Doesn't seem super big deal here ... -->
+  <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check>
+
+  <!-- Doesn't seem super big deal here ... -->
+  <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false">
+    <parameters><parameter name="maxFileLength">800></parameter></parameters>
+  </check>
+
+  <!-- Doesn't seem super big deal here ... -->
+  <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false">
+    <parameters><parameter name="maxTypes">30</parameter></parameters>
+  </check>
+
+  <!-- Doesn't seem super big deal here ... -->
+  <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false">
+    <parameters><parameter name="maximum">10</parameter></parameters>
+  </check>
+
+  <!-- Doesn't seem super big deal here ... -->
+  <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false">
+    <parameters><parameter name="maxLength">50</parameter></parameters>
+  </check>
+
+  <!-- Not exactly feasible to enforce this right now. -->
+  <!-- It is also infrequent that somebody introduces a new class with a lot of methods. -->
+  <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false">
+    <parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters>
+  </check>
+
+  <!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... -->
+  <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
+    <parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters>
+  </check>
+
 </scalastyle>

From 91777a1c3ad3b3ec7b65d5a0413209a9baf6b36a Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Sun, 31 May 2015 19:55:57 -0700
Subject: [PATCH 292/525] [SPARK-7978] [SQL] [PYSPARK] DecimalType should not
 be singleton

Author: Davies Liu <davies@databricks.com>

Closes #6532 from davies/decimal and squashes the following commits:

c7fcbce [Davies Liu] Update tests.py
1425359 [Davies Liu] DecimalType should not be singleton
---
 python/pyspark/sql/tests.py |  9 +++++++++
 python/pyspark/sql/types.py | 18 ++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 5c53c3a8ed4f1..76384d31f1bf4 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -100,6 +100,15 @@ def test_data_type_eq(self):
         lt2 = pickle.loads(pickle.dumps(LongType()))
         self.assertEquals(lt, lt2)
 
+    # regression test for SPARK-7978
+    def test_decimal_type(self):
+        t1 = DecimalType()
+        t2 = DecimalType(10, 2)
+        self.assertTrue(t2 is not t1)
+        self.assertNotEqual(t1, t2)
+        t3 = DecimalType(8)
+        self.assertNotEqual(t2, t3)
+
 
 class SQLTests(ReusedPySparkTestCase):
 
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 9e7e9f04bc35d..b6ec6137c9180 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -97,8 +97,6 @@ class AtomicType(DataType):
     """An internal type used to represent everything that is not
     null, UDTs, arrays, structs, and maps."""
 
-    __metaclass__ = DataTypeSingleton
-
 
 class NumericType(AtomicType):
     """Numeric data types.
@@ -109,6 +107,8 @@ class IntegralType(NumericType):
     """Integral data types.
     """
 
+    __metaclass__ = DataTypeSingleton
+
 
 class FractionalType(NumericType):
     """Fractional data types.
@@ -119,26 +119,36 @@ class StringType(AtomicType):
     """String data type.
     """
 
+    __metaclass__ = DataTypeSingleton
+
 
 class BinaryType(AtomicType):
     """Binary (byte array) data type.
     """
 
+    __metaclass__ = DataTypeSingleton
+
 
 class BooleanType(AtomicType):
     """Boolean data type.
     """
 
+    __metaclass__ = DataTypeSingleton
+
 
 class DateType(AtomicType):
     """Date (datetime.date) data type.
     """
 
+    __metaclass__ = DataTypeSingleton
+
 
 class TimestampType(AtomicType):
     """Timestamp (datetime.datetime) data type.
     """
 
+    __metaclass__ = DataTypeSingleton
+
 
 class DecimalType(FractionalType):
     """Decimal (decimal.Decimal) data type.
@@ -172,11 +182,15 @@ class DoubleType(FractionalType):
     """Double data type, representing double precision floats.
     """
 
+    __metaclass__ = DataTypeSingleton
+
 
 class FloatType(FractionalType):
     """Float data type, representing single precision floats.
     """
 
+    __metaclass__ = DataTypeSingleton
+
 
 class ByteType(IntegralType):
     """Byte data type, i.e. a signed integer in a single byte.

From a0e46a0d2ad23ce6a64e6ebdf2ccc776208696b6 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sun, 31 May 2015 21:01:46 -0700
Subject: [PATCH 293/525] [SPARK-7952][SPARK-7984][SQL] equality check between
 boolean type and numeric type is broken.

The origin code has several problems:
* `true <=> 1` will return false as we didn't set a rule to handle it.
* `true = a` where `a` is not `Literal` and its value is 1, will return false as we only handle literal values.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6505 from cloud-fan/tmp1 and squashes the following commits:

77f0f39 [Wenchen Fan] minor fix
b6401ba [Wenchen Fan] add type coercion for CaseKeyWhen and address comments
ebc8c61 [Wenchen Fan] use SQLTestUtils and If
625973c [Wenchen Fan] improve
9ba2130 [Wenchen Fan] address comments
fc0d741 [Wenchen Fan] fix style
2846a04 [Wenchen Fan] fix 7952
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 101 +++++++++++++-----
 .../sql/catalyst/expressions/predicates.scala |   5 +-
 .../analysis/HiveTypeCoercionSuite.scala      |  55 ++++++++--
 .../ExpressionEvaluationSuite.scala           |   8 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  36 +++++--
 5 files changed, 158 insertions(+), 47 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 96d7b96e60ee9..edcc918bfe921 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -76,7 +76,7 @@ trait HiveTypeCoercion {
     WidenTypes ::
     PromoteStrings ::
     DecimalPrecision ::
-    BooleanComparisons ::
+    BooleanEqualization ::
     StringToIntegralCasts ::
     FunctionArgumentConversion ::
     CaseWhenCoercion ::
@@ -119,7 +119,7 @@ trait HiveTypeCoercion {
    * the appropriate numeric equivalent.
    */
   object ConvertNaNs extends Rule[LogicalPlan] {
-    val stringNaN = Literal("NaN")
+    private val stringNaN = Literal("NaN")
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
       case q: LogicalPlan => q transformExpressions {
@@ -349,17 +349,17 @@ trait HiveTypeCoercion {
     import scala.math.{max, min}
 
     // Conversion rules for integer types into fixed-precision decimals
-    val intTypeToFixed: Map[DataType, DecimalType] = Map(
+    private val intTypeToFixed: Map[DataType, DecimalType] = Map(
       ByteType -> DecimalType(3, 0),
       ShortType -> DecimalType(5, 0),
       IntegerType -> DecimalType(10, 0),
       LongType -> DecimalType(20, 0)
     )
 
-    def isFloat(t: DataType): Boolean = t == FloatType || t == DoubleType
+    private def isFloat(t: DataType): Boolean = t == FloatType || t == DoubleType
 
     // Conversion rules for float and double into fixed-precision decimals
-    val floatTypeToFixed: Map[DataType, DecimalType] = Map(
+    private val floatTypeToFixed: Map[DataType, DecimalType] = Map(
       FloatType -> DecimalType(7, 7),
       DoubleType -> DecimalType(15, 15)
     )
@@ -482,30 +482,66 @@ trait HiveTypeCoercion {
   }
 
   /**
-   * Changes Boolean values to Bytes so that expressions like true < false can be Evaluated.
+   * Changes numeric values to booleans so that expressions like true = 1 can be evaluated.
    */
-  object BooleanComparisons extends Rule[LogicalPlan] {
-    val trueValues = Seq(1, 1L, 1.toByte, 1.toShort, new java.math.BigDecimal(1)).map(Literal(_))
-    val falseValues = Seq(0, 0L, 0.toByte, 0.toShort, new java.math.BigDecimal(0)).map(Literal(_))
+  object BooleanEqualization extends Rule[LogicalPlan] {
+    private val trueValues = Seq(1.toByte, 1.toShort, 1, 1L, new java.math.BigDecimal(1))
+    private val falseValues = Seq(0.toByte, 0.toShort, 0, 0L, new java.math.BigDecimal(0))
+
+    private def buildCaseKeyWhen(booleanExpr: Expression, numericExpr: Expression) = {
+      CaseKeyWhen(numericExpr, Seq(
+        Literal(trueValues.head), booleanExpr,
+        Literal(falseValues.head), Not(booleanExpr),
+        Literal(false)))
+    }
+
+    private def transform(booleanExpr: Expression, numericExpr: Expression) = {
+      If(Or(IsNull(booleanExpr), IsNull(numericExpr)),
+        Literal.create(null, BooleanType),
+        buildCaseKeyWhen(booleanExpr, numericExpr))
+    }
+
+    private def transformNullSafe(booleanExpr: Expression, numericExpr: Expression) = {
+      CaseWhen(Seq(
+        And(IsNull(booleanExpr), IsNull(numericExpr)), Literal(true),
+        Or(IsNull(booleanExpr), IsNull(numericExpr)), Literal(false),
+        buildCaseKeyWhen(booleanExpr, numericExpr)
+      ))
+    }
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
       // Skip nodes who's children have not been resolved yet.
       case e if !e.childrenResolved => e
 
-      // Hive treats (true = 1) as true and (false = 0) as true.
-      case EqualTo(l @ BooleanType(), r) if trueValues.contains(r) => l
-      case EqualTo(l, r @ BooleanType()) if trueValues.contains(l) => r
-      case EqualTo(l @ BooleanType(), r) if falseValues.contains(r) => Not(l)
-      case EqualTo(l, r @ BooleanType()) if falseValues.contains(l) => Not(r)
-
-      // No need to change other EqualTo operators as that actually makes sense for boolean types.
-      case e: EqualTo => e
-      // No need to change the EqualNullSafe operators, too
-      case e: EqualNullSafe => e
-      // Otherwise turn them to Byte types so that there exists and ordering.
-      case p: BinaryComparison if p.left.dataType == BooleanType &&
-                                  p.right.dataType == BooleanType =>
-        p.makeCopy(Array(Cast(p.left, ByteType), Cast(p.right, ByteType)))
+      // Hive treats (true = 1) as true and (false = 0) as true,
+      // all other cases are considered as false.
+
+      // We may simplify the expression if one side is literal numeric values
+      case EqualTo(l @ BooleanType(), Literal(value, _: NumericType))
+        if trueValues.contains(value) => l
+      case EqualTo(l @ BooleanType(), Literal(value, _: NumericType))
+        if falseValues.contains(value) => Not(l)
+      case EqualTo(Literal(value, _: NumericType), r @ BooleanType())
+        if trueValues.contains(value) => r
+      case EqualTo(Literal(value, _: NumericType), r @ BooleanType())
+        if falseValues.contains(value) => Not(r)
+      case EqualNullSafe(l @ BooleanType(), Literal(value, _: NumericType))
+        if trueValues.contains(value) => And(IsNotNull(l), l)
+      case EqualNullSafe(l @ BooleanType(), Literal(value, _: NumericType))
+        if falseValues.contains(value) => And(IsNotNull(l), Not(l))
+      case EqualNullSafe(Literal(value, _: NumericType), r @ BooleanType())
+        if trueValues.contains(value) => And(IsNotNull(r), r)
+      case EqualNullSafe(Literal(value, _: NumericType), r @ BooleanType())
+        if falseValues.contains(value) => And(IsNotNull(r), Not(r))
+
+      case EqualTo(l @ BooleanType(), r @ NumericType()) =>
+        transform(l , r)
+      case EqualTo(l @ NumericType(), r @ BooleanType()) =>
+        transform(r, l)
+      case EqualNullSafe(l @ BooleanType(), r @ NumericType()) =>
+        transformNullSafe(l, r)
+      case EqualNullSafe(l @ NumericType(), r @ BooleanType()) =>
+        transformNullSafe(r, l)
     }
   }
 
@@ -606,7 +642,7 @@ trait HiveTypeCoercion {
     import HiveTypeCoercion._
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
-      case cw: CaseWhenLike if !cw.resolved && cw.childrenResolved && !cw.valueTypesEqual =>
+      case cw: CaseWhenLike if cw.childrenResolved && !cw.valueTypesEqual =>
         logDebug(s"Input values for null casting ${cw.valueTypes.mkString(",")}")
         val commonType = cw.valueTypes.reduce { (v1, v2) =>
           findTightestCommonType(v1, v2).getOrElse(sys.error(
@@ -625,6 +661,23 @@ trait HiveTypeCoercion {
           case CaseKeyWhen(key, _) =>
             CaseKeyWhen(key, transformedBranches)
         }
+
+      case ckw: CaseKeyWhen if ckw.childrenResolved && !ckw.resolved =>
+        val commonType = (ckw.key +: ckw.whenList).map(_.dataType).reduce { (v1, v2) =>
+          findTightestCommonType(v1, v2).getOrElse(sys.error(
+            s"Types in CASE WHEN must be the same or coercible to a common type: $v1 != $v2"))
+        }
+        val transformedBranches = ckw.branches.sliding(2, 2).map {
+          case Seq(when, then) if when.dataType != commonType =>
+            Seq(Cast(when, commonType), then)
+          case s => s
+        }.reduce(_ ++ _)
+        val transformedKey = if (ckw.key.dataType != commonType) {
+          Cast(ckw.key, commonType)
+        } else {
+          ckw.key
+        }
+        CaseKeyWhen(transformedKey, transformedBranches)
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index e2d1c8115e051..4f422d69c4382 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -366,7 +366,7 @@ trait CaseWhenLike extends Expression {
 
   // both then and else val should be considered.
   def valueTypes: Seq[DataType] = (thenList ++ elseValue).map(_.dataType)
-  def valueTypesEqual: Boolean = valueTypes.distinct.size <= 1
+  def valueTypesEqual: Boolean = valueTypes.distinct.size == 1
 
   override def dataType: DataType = {
     if (!resolved) {
@@ -442,7 +442,8 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
   override def children: Seq[Expression] = key +: branches
 
   override lazy val resolved: Boolean =
-    childrenResolved && valueTypesEqual
+    childrenResolved && valueTypesEqual &&
+    (key +: whenList).map(_.dataType).distinct.size == 1
 
   /** Written in imperative fashion for performance considerations. */
   override def eval(input: Row): Any = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index f0101f4a88f86..a0798428db094 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.analysis
 import org.apache.spark.sql.catalyst.plans.PlanTest
 
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project}
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LocalRelation, Project}
+import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types._
 
 class HiveTypeCoercionSuite extends PlanTest {
@@ -104,15 +105,16 @@ class HiveTypeCoercionSuite extends PlanTest {
     widenTest(ArrayType(IntegerType), StructType(Seq()), None)
   }
 
+  private def ruleTest(rule: Rule[LogicalPlan], initial: Expression, transformed: Expression) {
+    val testRelation = LocalRelation(AttributeReference("a", IntegerType)())
+    comparePlans(
+      rule(Project(Seq(Alias(initial, "a")()), testRelation)),
+      Project(Seq(Alias(transformed, "a")()), testRelation))
+  }
+
   test("coalesce casts") {
     val fac = new HiveTypeCoercion { }.FunctionArgumentConversion
-    def ruleTest(initial: Expression, transformed: Expression) {
-      val testRelation = LocalRelation(AttributeReference("a", IntegerType)())
-      comparePlans(
-        fac(Project(Seq(Alias(initial, "a")()), testRelation)),
-        Project(Seq(Alias(transformed, "a")()), testRelation))
-    }
-    ruleTest(
+    ruleTest(fac,
       Coalesce(Literal(1.0)
         :: Literal(1)
         :: Literal.create(1.0, FloatType)
@@ -121,7 +123,7 @@ class HiveTypeCoercionSuite extends PlanTest {
         :: Cast(Literal(1), DoubleType)
         :: Cast(Literal.create(1.0, FloatType), DoubleType)
         :: Nil))
-    ruleTest(
+    ruleTest(fac,
       Coalesce(Literal(1L)
         :: Literal(1)
         :: Literal(new java.math.BigDecimal("1000000000000000000000"))
@@ -131,4 +133,39 @@ class HiveTypeCoercionSuite extends PlanTest {
         :: Cast(Literal(new java.math.BigDecimal("1000000000000000000000")), DecimalType())
         :: Nil))
   }
+
+  test("type coercion for CaseKeyWhen") {
+    val cwc = new HiveTypeCoercion {}.CaseWhenCoercion
+    ruleTest(cwc,
+      CaseKeyWhen(Literal(1.toShort), Seq(Literal(1), Literal("a"))),
+      CaseKeyWhen(Cast(Literal(1.toShort), IntegerType), Seq(Literal(1), Literal("a")))
+    )
+    // Will remove exception expectation in PR#6405
+    intercept[RuntimeException] {
+      ruleTest(cwc,
+        CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a"))),
+        CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a")))
+      )
+    }
+  }
+
+  test("type coercion simplification for equal to") {
+    val be = new HiveTypeCoercion {}.BooleanEqualization
+    ruleTest(be,
+      EqualTo(Literal(true), Literal(1)),
+      Literal(true)
+    )
+    ruleTest(be,
+      EqualTo(Literal(true), Literal(0)),
+      Not(Literal(true))
+    )
+    ruleTest(be,
+      EqualNullSafe(Literal(true), Literal(1)),
+      And(IsNotNull(Literal(true)), Literal(true))
+    )
+    ruleTest(be,
+      EqualNullSafe(Literal(true), Literal(0)),
+      And(IsNotNull(Literal(true)), Not(Literal(true)))
+    )
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index 3f5a660f17e1d..b6927485f42bf 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -862,7 +862,7 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
     val c5 = 'a.string.at(4)
     val c6 = 'a.string.at(5)
 
-    val literalNull = Literal.create(null, BooleanType)
+    val literalNull = Literal.create(null, IntegerType)
     val literalInt = Literal(1)
     val literalString = Literal("a")
 
@@ -871,12 +871,12 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
     checkEvaluation(CaseKeyWhen(c2, Seq(literalInt, c4, c5)), "a", row)
     checkEvaluation(CaseKeyWhen(c2, Seq(c1, c4, c5)), "b", row)
     checkEvaluation(CaseKeyWhen(c4, Seq(literalString, c2, c3)), 1, row)
-    checkEvaluation(CaseKeyWhen(c4, Seq(c1, c3, c5, c2, Literal(3))), 3, row)
+    checkEvaluation(CaseKeyWhen(c4, Seq(c6, c3, c5, c2, Literal(3))), 3, row)
 
     checkEvaluation(CaseKeyWhen(literalInt, Seq(c2, c4, c5)), "a", row)
     checkEvaluation(CaseKeyWhen(literalString, Seq(c5, c2, c4, c3)), 2, row)
-    checkEvaluation(CaseKeyWhen(literalInt, Seq(c5, c2, c4, c3)), null, row)
-    checkEvaluation(CaseKeyWhen(literalNull, Seq(c5, c2, c1, c3)), 2, row)
+    checkEvaluation(CaseKeyWhen(c6, Seq(c5, c2, c4, c3)), null, row)
+    checkEvaluation(CaseKeyWhen(literalNull, Seq(c2, c5, c1, c6)), "c", row)
   }
 
   test("complex type") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index bf18bf854aa4a..63f7d314fb699 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.errors.DialectException
 import org.apache.spark.sql.execution.GeneratedAggregate
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.TestData._
-import org.apache.spark.sql.test.TestSQLContext
+import org.apache.spark.sql.test.{SQLTestUtils, TestSQLContext}
 import org.apache.spark.sql.test.TestSQLContext.{udf => _, _}
 
 import org.apache.spark.sql.types._
@@ -32,12 +32,12 @@ import org.apache.spark.sql.types._
 /** A SQL Dialect for testing purpose, and it can not be nested type */
 class MyDialect extends DefaultParserDialect
 
-class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
+class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   // Make sure the tables are loaded.
   TestData
 
-  import org.apache.spark.sql.test.TestSQLContext.implicits._
-  val sqlCtx = TestSQLContext
+  val sqlContext = TestSQLContext
+  import sqlContext.implicits._
 
   test("SPARK-6743: no columns from cache") {
     Seq(
@@ -915,7 +915,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
       Row(values(0).toInt, values(1), values(2).toBoolean, v4)
     }
 
-    val df1 = sqlCtx.createDataFrame(rowRDD1, schema1)
+    val df1 = createDataFrame(rowRDD1, schema1)
     df1.registerTempTable("applySchema1")
     checkAnswer(
       sql("SELECT * FROM applySchema1"),
@@ -945,7 +945,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
       Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4))
     }
 
-    val df2 = sqlCtx.createDataFrame(rowRDD2, schema2)
+    val df2 = createDataFrame(rowRDD2, schema2)
     df2.registerTempTable("applySchema2")
     checkAnswer(
       sql("SELECT * FROM applySchema2"),
@@ -970,7 +970,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
       Row(Row(values(0).toInt, values(2).toBoolean), scala.collection.mutable.Map(values(1) -> v4))
     }
 
-    val df3 = sqlCtx.createDataFrame(rowRDD3, schema2)
+    val df3 = createDataFrame(rowRDD3, schema2)
     df3.registerTempTable("applySchema3")
 
     checkAnswer(
@@ -1015,7 +1015,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
       .build()
     val schemaWithMeta = new StructType(Array(
       schema("id"), schema("name").copy(metadata = metadata), schema("age")))
-    val personWithMeta = sqlCtx.createDataFrame(person.rdd, schemaWithMeta)
+    val personWithMeta = createDataFrame(person.rdd, schemaWithMeta)
     def validateMetadata(rdd: DataFrame): Unit = {
       assert(rdd.schema("name").metadata.getString(docKey) == docValue)
     }
@@ -1331,4 +1331,24 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
 
     checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1))
   }
+
+  test("SPARK-7952: fix the equality check between boolean and numeric types") {
+    withTempTable("t") {
+      // numeric field i, boolean field j, result of i = j, result of i <=> j
+      Seq[(Integer, java.lang.Boolean, java.lang.Boolean, java.lang.Boolean)](
+        (1, true, true, true),
+        (0, false, true, true),
+        (2, true, false, false),
+        (2, false, false, false),
+        (null, true, null, false),
+        (null, false, null, false),
+        (0, null, null, false),
+        (1, null, null, false),
+        (null, null, null, true)
+      ).toDF("i", "b", "r1", "r2").registerTempTable("t")
+
+      checkAnswer(sql("select i = b from t"), sql("select r1 from t"))
+      checkAnswer(sql("select i <=> b from t"), sql("select r2 from t"))
+    }
+  }
 }

From 3c0156899dc1ec1f7dfe6d7c8af47fa6dc7d00bf Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 31 May 2015 23:55:45 -0700
Subject: [PATCH 294/525] Update README to include DataFrames and zinc.

Also cut trailing whitespaces.

Author: Reynold Xin <rxin@databricks.com>

Closes #6548 from rxin/readme and squashes the following commits:

630efc3 [Reynold Xin] Update README to include DataFrames and zinc.
---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 9c09d40e2bdae..380422ca00dbe 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,8 @@
 Spark is a fast and general cluster computing system for Big Data. It provides
 high-level APIs in Scala, Java, and Python, and an optimized engine that
 supports general computation graphs for data analysis. It also supports a
-rich set of higher-level tools including Spark SQL for SQL and structured
-data processing, MLlib for machine learning, GraphX for graph processing,
+rich set of higher-level tools including Spark SQL for SQL and DataFrames,
+MLlib for machine learning, GraphX for graph processing,
 and Spark Streaming for stream processing.
 
 <http://spark.apache.org/>
@@ -22,7 +22,7 @@ This README file only contains basic setup instructions.
 Spark is built using [Apache Maven](http://maven.apache.org/).
 To build Spark and its example programs, run:
 
-    mvn -DskipTests clean package
+    build/mvn -DskipTests clean package
 
 (You do not need to do this if you downloaded a pre-built package.)
 More detailed documentation is available from the project site, at
@@ -43,7 +43,7 @@ Try the following command, which should return 1000:
 Alternatively, if you prefer Python, you can use the Python shell:
 
     ./bin/pyspark
-    
+
 And run the following command, which should also return 1000:
 
     >>> sc.parallelize(range(1000)).count()
@@ -58,9 +58,9 @@ To run one of them, use `./bin/run-example <class> [params]`. For example:
 will run the Pi example locally.
 
 You can set the MASTER environment variable when running examples to submit
-examples to a cluster. This can be a mesos:// or spark:// URL, 
-"yarn-cluster" or "yarn-client" to run on YARN, and "local" to run 
-locally with one thread, or "local[N]" to run locally with N threads. You 
+examples to a cluster. This can be a mesos:// or spark:// URL,
+"yarn-cluster" or "yarn-client" to run on YARN, and "local" to run
+locally with one thread, or "local[N]" to run locally with N threads. You
 can also use an abbreviated class name if the class is in the `examples`
 package. For instance:
 
@@ -75,7 +75,7 @@ can be run using:
 
     ./dev/run-tests
 
-Please see the guidance on how to 
+Please see the guidance on how to
 [run tests for a module, or individual tests](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools).
 
 ## A Note About Hadoop Versions

From e7c7e51f2ec158d12a8429f753225c746f92d513 Mon Sep 17 00:00:00 2001
From: Nishkam Ravi <nravi@cloudera.com>
Date: Mon, 1 Jun 2015 21:34:41 +0100
Subject: [PATCH 295/525] [DOC] Minor modification to Streaming docs with
 regards to parallel data receiving

pwendell tdas

Author: Nishkam Ravi <nravi@cloudera.com>
Author: nishkamravi2 <nishkamravi@gmail.com>
Author: nravi <nravi@c1704.halxg.cloudera.com>

Closes #6544 from nishkamravi2/master_nravi and squashes the following commits:

46e8c03 [Nishkam Ravi] Slight modification to streaming docs
---
 docs/streaming-programming-guide.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md
index bd863d48d53e3..42b33947873b0 100644
--- a/docs/streaming-programming-guide.md
+++ b/docs/streaming-programming-guide.md
@@ -1946,10 +1946,10 @@ creates a single receiver (running on a worker machine) that receives a single s
 Receiving multiple data streams can therefore be achieved by creating multiple input DStreams
 and configuring them to receive different partitions of the data stream from the source(s).
 For example, a single Kafka input DStream receiving two topics of data can be split into two
-Kafka input streams, each receiving only one topic. This would run two receivers on two workers,
-thus allowing data to be received in parallel, and increasing overall throughput. These multiple
-DStream can be unioned together to create a single DStream. Then the transformations that was
-being applied on the single input DStream can applied on the unified stream. This is done as follows.
+Kafka input streams, each receiving only one topic. This would run two receivers,
+allowing data to be received in parallel, and increasing overall throughput. These multiple
+DStreams can be unioned together to create a single DStream. Then the transformations that were
+being applied on a single input DStream can be applied on the unified stream. This is done as follows.
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">

From b7ab0299b03ae833d5811f380e4594837879f8ae Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 1 Jun 2015 14:40:08 -0700
Subject: [PATCH 296/525] [SPARK-7497] [PYSPARK] [STREAMING] fix streaming
 flaky tests

Increase the duration and timeout in streaming python tests.

Author: Davies Liu <davies@databricks.com>

Closes #6239 from davies/flaky_tests and squashes the following commits:

d6aee8f [Davies Liu] fix window tests
26317f7 [Davies Liu] Merge branch 'master' of github.com:apache/spark into flaky_tests
7947db6 [Davies Liu] fix streaming flaky tests
---
 python/pyspark/streaming/tests.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index 33ea8c9293d74..46cb18b2e8ef9 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -41,8 +41,8 @@
 
 class PySparkStreamingTestCase(unittest.TestCase):
 
-    timeout = 4  # seconds
-    duration = .2
+    timeout = 10  # seconds
+    duration = .5
 
     @classmethod
     def setUpClass(cls):
@@ -379,13 +379,13 @@ def func(dstream):
 
 class WindowFunctionTests(PySparkStreamingTestCase):
 
-    timeout = 5
+    timeout = 15
 
     def test_window(self):
         input = [range(1), range(2), range(3), range(4), range(5)]
 
         def func(dstream):
-            return dstream.window(.6, .2).count()
+            return dstream.window(1.5, .5).count()
 
         expected = [[1], [3], [6], [9], [12], [9], [5]]
         self._test_func(input, func, expected)
@@ -394,7 +394,7 @@ def test_count_by_window(self):
         input = [range(1), range(2), range(3), range(4), range(5)]
 
         def func(dstream):
-            return dstream.countByWindow(.6, .2)
+            return dstream.countByWindow(1.5, .5)
 
         expected = [[1], [3], [6], [9], [12], [9], [5]]
         self._test_func(input, func, expected)
@@ -403,7 +403,7 @@ def test_count_by_window_large(self):
         input = [range(1), range(2), range(3), range(4), range(5), range(6)]
 
         def func(dstream):
-            return dstream.countByWindow(1, .2)
+            return dstream.countByWindow(2.5, .5)
 
         expected = [[1], [3], [6], [10], [15], [20], [18], [15], [11], [6]]
         self._test_func(input, func, expected)
@@ -412,7 +412,7 @@ def test_count_by_value_and_window(self):
         input = [range(1), range(2), range(3), range(4), range(5), range(6)]
 
         def func(dstream):
-            return dstream.countByValueAndWindow(1, .2)
+            return dstream.countByValueAndWindow(2.5, .5)
 
         expected = [[1], [2], [3], [4], [5], [6], [6], [6], [6], [6]]
         self._test_func(input, func, expected)
@@ -421,7 +421,7 @@ def test_group_by_key_and_window(self):
         input = [[('a', i)] for i in range(5)]
 
         def func(dstream):
-            return dstream.groupByKeyAndWindow(.6, .2).mapValues(list)
+            return dstream.groupByKeyAndWindow(1.5, .5).mapValues(list)
 
         expected = [[('a', [0])], [('a', [0, 1])], [('a', [0, 1, 2])], [('a', [1, 2, 3])],
                     [('a', [2, 3, 4])], [('a', [3, 4])], [('a', [4])]]

From 90c606925e7ec8f65f28e2290a0048f64af8c6a6 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Mon, 1 Jun 2015 15:05:14 -0700
Subject: [PATCH 297/525] [SPARK-7584] [MLLIB] User guide for VectorAssembler

This PR adds a section in the user guide for `VectorAssembler` with code examples in Python/Java/Scala. It also adds a unit test in Java.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6556 from mengxr/SPARK-7584 and squashes the following commits:

11313f6 [Xiangrui Meng] simplify Java example
0cd47f3 [Xiangrui Meng] update user guide
fd36292 [Xiangrui Meng] update Java unit test
ce61ca0 [Xiangrui Meng] add Java unit test for VectorAssembler
e399942 [Xiangrui Meng] scala/python example code
---
 docs/ml-features.md                           | 114 ++++++++++++++++++
 .../ml/feature/JavaVectorAssemblerSuite.java  |  78 ++++++++++++
 2 files changed, 192 insertions(+)
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 81f1b8823a8ce..9ee5696122717 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -964,5 +964,119 @@ DataFrame transformedData = transformer.transform(dataFrame);
 </div>
 </div>
 
+## VectorAssembler
+
+`VectorAssembler` is a transformer that combines a given list of columns into a single vector
+column.
+It is useful for combining raw features and features generated by different feature transformers
+into a single feature vector, in order to train ML models like logistic regression and decision
+trees.
+`VectorAssembler` accepts the following input column types: all numeric types, boolean type,
+and vector type.
+In each row, the values of the input columns will be concatenated into a vector in the specified
+order.
+
+**Examples**
+
+Assume that we have a DataFrame with the columns `id`, `hour`, `mobile`, `userFeatures`,
+and `clicked`:
+
+~~~
+ id | hour | mobile | userFeatures     | clicked
+----|------|--------|------------------|---------
+ 0  | 18   | 1.0    | [0.0, 10.0, 0.5] | 1.0
+~~~
+
+`userFeatures` is a vector column that contains three user features.
+We want to combine `hour`, `mobile`, and `userFeatures` into a single feature vector
+called `features` and use it to predict `clicked` or not.
+If we set `VectorAssembler`'s input columns to `hour`, `mobile`, and `userFeatures` and
+output column to `features`, after transformation we should get the following DataFrame:
+
+~~~
+ id | hour | mobile | userFeatures     | clicked | features
+----|------|--------|------------------|---------|-----------------------------
+ 0  | 18   | 1.0    | [0.0, 10.0, 0.5] | 1.0     | [18.0, 1.0, 0.0, 10.0, 0.5]
+~~~
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+[`VectorAssembler`](api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler) takes an array
+of input column names and an output column name.
+
+{% highlight scala %}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.ml.feature.VectorAssembler
+
+val dataset = sqlContext.createDataFrame(
+  Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
+).toDF("id", "hour", "mobile", "userFeatures", "clicked")
+val assembler = new VectorAssembler()
+  .setInputCols(Array("hour", "mobile", "userFeatures"))
+  .setOutputCol("features")
+val output = assembler.transform(dataset)
+println(output.select("features", "clicked").first())
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+[`VectorAssembler`](api/java/org/apache/spark/ml/feature/VectorAssembler.html) takes an array
+of input column names and an output column name.
+
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+StructType schema = createStructType(new StructField[] {
+  createStructField("id", IntegerType, false),
+  createStructField("hour", IntegerType, false),
+  createStructField("mobile", DoubleType, false),
+  createStructField("userFeatures", new VectorUDT(), false),
+  createStructField("clicked", DoubleType, false)
+});
+Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
+JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
+DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+
+VectorAssembler assembler = new VectorAssembler()
+  .setInputCols(new String[] {"hour", "mobile", "userFeatures"})
+  .setOutputCol("features");
+
+DataFrame output = assembler.transform(dataset);
+System.out.println(output.select("features", "clicked").first());
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+[`VectorAssembler`](api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) takes a list
+of input column names and an output column name.
+
+{% highlight python %}
+from pyspark.mllib.linalg import Vectors
+from pyspark.ml.feature import VectorAssembler
+
+dataset = sqlContext.createDataFrame(
+    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
+    ["id", "hour", "mobile", "userFeatures", "clicked"])
+assembler = VectorAssembler(
+    inputCols=["hour", "mobile", "userFeatures"],
+    outputCol="features")
+output = assembler.transform(dataset)
+print(output.select("features", "clicked").first())
+{% endhighlight %}
+</div>
+</div>
+
 # Feature Selectors
 
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java
new file mode 100644
index 0000000000000..b7c564caad3bd
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import java.util.Arrays;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+public class JavaVectorAssemblerSuite {
+  private transient JavaSparkContext jsc;
+  private transient SQLContext sqlContext;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaVectorAssemblerSuite");
+    sqlContext = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    jsc = null;
+  }
+
+  @Test
+  public void testVectorAssembler() {
+    StructType schema = createStructType(new StructField[] {
+      createStructField("id", IntegerType, false),
+      createStructField("x", DoubleType, false),
+      createStructField("y", new VectorUDT(), false),
+      createStructField("name", StringType, false),
+      createStructField("z", new VectorUDT(), false),
+      createStructField("n", LongType, false)
+    });
+    Row row = RowFactory.create(
+      0, 0.0, Vectors.dense(1.0, 2.0), "a",
+      Vectors.sparse(2, new int[] {1}, new double[] {3.0}), 10L);
+    JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
+    DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+    VectorAssembler assembler = new VectorAssembler()
+      .setInputCols(new String[] {"x", "y", "z", "n"})
+      .setOutputCol("features");
+    DataFrame output = assembler.transform(dataset);
+    Assert.assertEquals(
+      Vectors.sparse(6, new int[] {1, 2, 4, 5}, new double[] {1.0, 2.0, 3.0, 10.0}),
+      output.select("features").first().<Vector>getAs(0));
+  }
+}

From 2f9c7519d6a3f867100979b5e7ced3f72b7d9adc Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Mon, 1 Jun 2015 20:04:57 -0700
Subject: [PATCH 298/525] [SPARK-7958] [STREAMING] Handled exception in
 StreamingContext.start() to prevent leaking of actors

StreamingContext.start() can throw exception because DStream.validateAtStart() fails (say, checkpoint directory not set for StateDStream). But by then JobScheduler, JobGenerator, and ReceiverTracker has already started, along with their actors. But those cannot be shutdown because the only way to do that is call StreamingContext.stop() which cannot be called as the context has not been marked as ACTIVE.

The solution in this PR is to stop the internal scheduler if start throw exception, and mark the context as STOPPED.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6559 from tdas/SPARK-7958 and squashes the following commits:

20b2ec1 [Tathagata Das] Added synchronized
790b617 [Tathagata Das] Handled exception in StreamingContext.start()
---
 .../spark/streaming/StreamingContext.scala      | 17 +++++++++++++----
 .../streaming/scheduler/JobScheduler.scala      |  4 ++++
 .../spark/streaming/StreamingContextSuite.scala | 16 ++++++++++++++++
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 25842d502543e..624a31ddc2b89 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -23,6 +23,7 @@ import java.util.concurrent.atomic.{AtomicInteger, AtomicReference}
 import scala.collection.Map
 import scala.collection.mutable.Queue
 import scala.reflect.ClassTag
+import scala.util.control.NonFatal
 
 import akka.actor.{Props, SupervisorStrategy}
 import org.apache.hadoop.conf.Configuration
@@ -576,18 +577,26 @@ class StreamingContext private[streaming] (
   def start(): Unit = synchronized {
     state match {
       case INITIALIZED =>
-        validate()
         startSite.set(DStream.getCreationSite())
         sparkContext.setCallSite(startSite.get)
         StreamingContext.ACTIVATION_LOCK.synchronized {
           StreamingContext.assertNoOtherContextIsActive()
-          scheduler.start()
-          uiTab.foreach(_.attach())
-          state = StreamingContextState.ACTIVE
+          try {
+            validate()
+            scheduler.start()
+            state = StreamingContextState.ACTIVE
+          } catch {
+            case NonFatal(e) =>
+              logError("Error starting the context, marking it as stopped", e)
+              scheduler.stop(false)
+              state = StreamingContextState.STOPPED
+              throw e
+          }
           StreamingContext.setActiveContext(this)
         }
         shutdownHookRef = Utils.addShutdownHook(
           StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
+        uiTab.foreach(_.attach())
         logInfo("StreamingContext started")
       case ACTIVE =>
         logWarning("StreamingContext has already been started")
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala
index 1d1ddaaccf217..4af9b6d3b56ab 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala
@@ -126,6 +126,10 @@ class JobScheduler(val ssc: StreamingContext) extends Logging {
     eventLoop.post(ErrorReported(msg, e))
   }
 
+  def isStarted(): Boolean = synchronized {
+    eventLoop != null
+  }
+
   private def processEvent(event: JobSchedulerEvent) {
     try {
       event match {
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index d304c9a7328f3..819dd2ccfe915 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -151,6 +151,22 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
     assert(StreamingContext.getActive().isEmpty)
   }
 
+  test("start failure should stop internal components") {
+    ssc = new StreamingContext(conf, batchDuration)
+    val inputStream = addInputStream(ssc)
+    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
+      Some(values.sum + state.getOrElse(0))
+    }
+    inputStream.map(x => (x, 1)).updateStateByKey[Int](updateFunc)
+    // Require that the start fails because checkpoint directory was not set
+    intercept[Exception] {
+      ssc.start()
+    }
+    assert(ssc.getState() === StreamingContextState.STOPPED)
+    assert(ssc.scheduler.isStarted === false)
+  }
+
+
   test("start multiple times") {
     ssc = new StreamingContext(master, appName, batchDuration)
     addInputStream(ssc).register()

From 15d7c90aeb0d51851f7ebb4c75c9b249ad88dfeb Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Mon, 1 Jun 2015 19:39:03 -0700
Subject: [PATCH 299/525] [MINOR] [UI] Improve error message on log page

Currently if a bad log type if specified, then we get blank.
We should provide a more informative error message.
---
 .../spark/deploy/worker/ui/LogPage.scala      |  6 ++
 .../spark/deploy/worker/ui/LogPageSuite.scala | 70 +++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala

diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
index 88170d4df3053..dc2bee6f2bdca 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
@@ -29,6 +29,7 @@ import org.apache.spark.util.logging.RollingFileAppender
 private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with Logging {
   private val worker = parent.worker
   private val workDir = parent.workDir
+  private val supportedLogTypes = Set("stderr", "stdout")
 
   def renderLog(request: HttpServletRequest): String = {
     val defaultBytes = 100 * 1024
@@ -129,6 +130,11 @@ private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with
       offsetOption: Option[Long],
       byteLength: Int
     ): (String, Long, Long, Long) = {
+
+    if (!supportedLogTypes.contains(logType)) {
+      return ("Error: Log type must be one of " + supportedLogTypes.mkString(", "), 0, 0, 0)
+    }
+
     try {
       val files = RollingFileAppender.getSortedRolledOverFiles(logDirectory, logType)
       logDebug(s"Sorted log files of type $logType in $logDirectory:\n${files.mkString("\n")}")
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala
new file mode 100644
index 0000000000000..572360ddb95d4
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.worker.ui
+
+import java.io.{File, FileWriter}
+
+import org.mockito.Mockito.mock
+import org.scalatest.PrivateMethodTester
+
+import org.apache.spark.SparkFunSuite
+
+class LogPageSuite extends SparkFunSuite with PrivateMethodTester {
+
+  test("get logs simple") {
+    val webui = mock(classOf[WorkerWebUI])
+    val logPage = new LogPage(webui)
+
+    // Prepare some fake log files to read later
+    val out = "some stdout here"
+    val err = "some stderr here"
+    val tmpDir = new File(sys.props("java.io.tmpdir"))
+    val tmpOut = new File(tmpDir, "stdout")
+    val tmpErr = new File(tmpDir, "stderr")
+    val tmpRand = new File(tmpDir, "random")
+    write(tmpOut, out)
+    write(tmpErr, err)
+    write(tmpRand, "1 6 4 5 2 7 8")
+
+    // Get the logs. All log types other than "stderr" or "stdout" will be rejected
+    val getLog = PrivateMethod[(String, Long, Long, Long)]('getLog)
+    val (stdout, _, _, _) =
+      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "stdout", None, 100)
+    val (stderr, _, _, _) =
+      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "stderr", None, 100)
+    val (error1, _, _, _) =
+      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "random", None, 100)
+    val (error2, _, _, _) =
+      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "does-not-exist.txt", None, 100)
+    assert(stdout === out)
+    assert(stderr === err)
+    assert(error1.startsWith("Error"))
+    assert(error2.startsWith("Error"))
+  }
+
+  /** Write the specified string to the file. */
+  private def write(f: File, s: String): Unit = {
+    val writer = new FileWriter(f)
+    try {
+      writer.write(s)
+    } finally {
+      writer.close()
+    }
+  }
+
+}

From 6b44278ef7cd2a278dfa67e8393ef30775c72726 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Mon, 1 Jun 2015 21:01:14 -0700
Subject: [PATCH 300/525] [SPARK-8028] [SPARKR] Use addJar instead of setJars
 in SparkR

This prevents the spark.jars from being cleared while using `--packages` or `--jars`

cc pwendell davies brkyvz

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6568 from shivaram/SPARK-8028 and squashes the following commits:

3a9cf1f [Shivaram Venkataraman] Use addJar instead of setJars in SparkR This prevents the spark.jars from being cleared
---
 core/src/main/scala/org/apache/spark/api/r/RRDD.scala | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
index e020458888e4a..4dfa7325934ff 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
@@ -355,7 +355,6 @@ private[r] object RRDD {
 
     val sparkConf = new SparkConf().setAppName(appName)
                                    .setSparkHome(sparkHome)
-                                   .setJars(jars)
 
     // Override `master` if we have a user-specified value
     if (master != "") {
@@ -373,7 +372,11 @@ private[r] object RRDD {
       sparkConf.setExecutorEnv(name.asInstanceOf[String], value.asInstanceOf[String])
     }
 
-    new JavaSparkContext(sparkConf)
+    val jsc = new JavaSparkContext(sparkConf)
+    jars.foreach { jar =>
+      jsc.addJar(jar)
+    }
+    jsc
   }
 
   /**

From 6396cc0303ceabea53c4df436ffa50b82b7e233f Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 1 Jun 2015 21:11:19 -0700
Subject: [PATCH 301/525] [SPARK-7982][SQL] DataFrame.stat.crosstab should use
 0 instead of null for pairs that don't appear

Author: Reynold Xin <rxin@databricks.com>

Closes #6566 from rxin/crosstab and squashes the following commits:

e0ace1c [Reynold Xin] [SPARK-7982][SQL] DataFrame.stat.crosstab should use 0 instead of null for pairs that don't appear
---
 .../apache/spark/sql/execution/stat/StatFunctions.scala  | 9 ++++++---
 .../scala/org/apache/spark/sql/DataFrameStatSuite.scala  | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
index b1a8204dd5f71..93383e5a62f11 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.stat
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.{Column, DataFrame}
+import org.apache.spark.sql.{Row, Column, DataFrame}
 import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, Cast}
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
 import org.apache.spark.sql.functions._
@@ -116,7 +116,10 @@ private[sql] object StatFunctions extends Logging {
       s"exceed 1e4. Currently $columnSize")
     val table = counts.groupBy(_.get(0)).map { case (col1Item, rows) =>
       val countsRow = new GenericMutableRow(columnSize + 1)
-      rows.foreach { row =>
+      rows.foreach { (row: Row) =>
+        // row.get(0) is column 1
+        // row.get(1) is column 2
+        // row.get(3) is the frequency
         countsRow.setLong(distinctCol2.get(row.get(1)).get + 1, row.getLong(2))
       }
       // the value of col1 is the first value, the rest are the counts
@@ -126,6 +129,6 @@ private[sql] object StatFunctions extends Logging {
     val headerNames = distinctCol2.map(r => StructField(r._1.toString, LongType)).toSeq
     val schema = StructType(StructField(tableName, StringType) +: headerNames)
 
-    new DataFrame(df.sqlContext, LocalRelation(schema.toAttributes, table))
+    new DataFrame(df.sqlContext, LocalRelation(schema.toAttributes, table)).na.fill(0.0)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index 78de89f0b9f39..438f479459dfe 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -74,10 +74,10 @@ class DataFrameStatSuite extends SparkFunSuite  {
     val rows: Array[Row] = crosstab.collect().sortBy(_.getString(0))
     assert(rows(0).get(0).toString === "0")
     assert(rows(0).getLong(1) === 2L)
-    assert(rows(0).get(2) === null)
+    assert(rows(0).get(2) === 0L)
     assert(rows(1).get(0).toString === "1")
     assert(rows(1).getLong(1) === 1L)
-    assert(rows(1).get(2) === null)
+    assert(rows(1).get(2) === 0L)
     assert(rows(2).get(0).toString === "2")
     assert(rows(2).getLong(1) === 2L)
     assert(rows(2).getLong(2) === 1L)

From 89f642a0e8c3a6bc9149a0bb413f1a8939cb0283 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 1 Jun 2015 21:13:15 -0700
Subject: [PATCH 302/525] [SPARK-8026][SQL] Add Column.alias to Scala/Java
 DataFrame API

Author: Reynold Xin <rxin@databricks.com>

Closes #6565 from rxin/alias and squashes the following commits:

286d880 [Reynold Xin] [SPARK-8026][SQL] Add Column.alias to Scala/Java DataFrame API
---
 .../src/main/scala/org/apache/spark/sql/Column.scala | 12 ++++++++++++
 .../org/apache/spark/sql/ColumnExpressionSuite.scala |  6 ++++++
 2 files changed, 18 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index b49b1d327289f..d3efa83380d04 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -716,6 +716,18 @@ class Column(protected[sql] val expr: Expression) extends Logging {
    */
   def endsWith(literal: String): Column = this.endsWith(lit(literal))
 
+  /**
+   * Gives the column an alias. Same as `as`.
+   * {{{
+   *   // Renames colA to colB in select output.
+   *   df.select($"colA".alias("colB"))
+   * }}}
+   *
+   * @group expr_ops
+   * @since 1.4.0
+   */
+  def alias(alias: String): Column = as(alias)
+
   /**
    * Gives the column an alias.
    * {{{
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index d006b83fc075a..b8bb1bff9ea72 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -27,6 +27,12 @@ import org.apache.spark.sql.types._
 class ColumnExpressionSuite extends QueryTest {
   import org.apache.spark.sql.TestData._
 
+  test("alias") {
+    val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
+    assert(df.select(df("a").as("b")).columns.head === "b")
+    assert(df.select(df("a").alias("b")).columns.head === "b")
+  }
+
   test("single explode") {
     val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
     checkAnswer(

From cae9306c4f437c722baa57593fe83f4b7d82dbff Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Mon, 1 Jun 2015 21:21:45 -0700
Subject: [PATCH 303/525] [SPARK-8027] [SPARKR] Add maven profile to build R
 package docs

Also use that profile in create-release.sh

cc pwendell -- Note that this means that we need `knitr` and `roxygen` installed on the machines used for building the release. Let me know if you need help with that.

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6567 from shivaram/SPARK-8027 and squashes the following commits:

8dc8ecf [Shivaram Venkataraman] Add maven profile to build R package docs Also use that profile in create-release.sh
---
 core/pom.xml                         | 23 +++++++++++++++++++++++
 dev/create-release/create-release.sh | 16 ++++++++--------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/core/pom.xml b/core/pom.xml
index 5c02be831ce06..a02184222e9f0 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -481,6 +481,29 @@
         </plugins>
       </build>
     </profile>
+    <profile>
+      <id>sparkr-docs</id>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>exec-maven-plugin</artifactId>
+            <executions>
+              <execution>
+                <id>sparkr-pkg-docs</id>
+                <phase>compile</phase>
+                <goals>
+                  <goal>exec</goal>
+                </goals>
+              </execution>
+            </executions>
+            <configuration>
+              <executable>..${path.separator}R${path.separator}create-docs${script.extension}</executable>
+            </configuration>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
   </profiles>
 
 </project>
diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh
index 54274a83f6d66..0b14a618e755c 100755
--- a/dev/create-release/create-release.sh
+++ b/dev/create-release/create-release.sh
@@ -228,14 +228,14 @@ if [[ ! "$@" =~ --skip-package ]]; then
 
   # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds
   # share the same Zinc server.
-  make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" &
-  make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" &
-  make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
-  make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" &
-  make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" &
-  make_binary_release "mapr3" "-Pmapr3 -Psparkr -Phive -Phive-thriftserver" "3035" &
-  make_binary_release "mapr4" "-Pmapr4 -Psparkr -Pyarn -Phive -Phive-thriftserver" "3036" &
-  make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" &
+  make_binary_release "hadoop1" "-Psparkr -Psparkr-docs -Phadoop-1 -Phive -Phive-thriftserver" "3030" &
+  make_binary_release "hadoop1-scala2.11" "-Psparkr -Psparkr-docs -Phadoop-1 -Phive -Dscala-2.11" "3031" &
+  make_binary_release "cdh4" "-Psparkr -Psparkr-docs -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
+  make_binary_release "hadoop2.3" "-Psparkr -Psparkr-docs  -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" &
+  make_binary_release "hadoop2.4" "-Psparkr -Psparkr-docs -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" &
+  make_binary_release "mapr3" "-Pmapr3 -Psparkr -Psparkr-docs -Phive -Phive-thriftserver" "3035" &
+  make_binary_release "mapr4" "-Pmapr4 -Psparkr -Psparkr-docs -Pyarn -Phive -Phive-thriftserver" "3036" &
+  make_binary_release "hadoop2.4-without-hive" "-Psparkr -Psparkr-docs -Phadoop-2.4 -Pyarn" "3037" &
   wait
   rm -rf spark-$RELEASE_VERSION-bin-*/
 

From 4c868b9943a2d86107d1f15f8df9830aac36fb75 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 1 Jun 2015 21:29:39 -0700
Subject: [PATCH 304/525] [minor doc] Add exploratory data analysis warning for
 DataFrame.stat.freqItem API

Author: Reynold Xin <rxin@databricks.com>

Closes #6569 from rxin/freqItemsWarning and squashes the following commits:

7eec145 [Reynold Xin] [minor doc] Add exploratory data analysis warning for DataFrame.stat.freqItem API.
---
 python/pyspark/sql/dataframe.py                      |  3 +++
 .../apache/spark/sql/DataFrameStatFunctions.scala    | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 936487519a645..a82b6b87c413e 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1170,6 +1170,9 @@ def freqItems(self, cols, support=None):
         "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".
         :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.
 
+        This function is meant for exploratory data analysis, as we make no guarantee about the
+        backward compatibility of the schema of the resulting DataFrame.
+
         :param cols: Names of the columns to calculate frequent items for as a list or tuple of
             strings.
         :param support: The frequency with which to consider an item 'frequent'. Default is 1%.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index b624eaa201ea4..edb9ed7bba56a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -97,6 +97,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * The `support` should be greater than 1e-4.
    *
+   * This function is meant for exploratory data analysis, as we make no guarantee about the
+   * backward compatibility of the schema of the resulting [[DataFrame]].
+   *
    * @param cols the names of the columns to search frequent items in.
    * @param support The minimum frequency for an item to be considered `frequent`. Should be greater
    *                than 1e-4.
@@ -114,6 +117,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * Uses a `default` support of 1%.
    *
+   * This function is meant for exploratory data analysis, as we make no guarantee about the
+   * backward compatibility of the schema of the resulting [[DataFrame]].
+   *
    * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    *
@@ -128,6 +134,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    *
+   * This function is meant for exploratory data analysis, as we make no guarantee about the
+   * backward compatibility of the schema of the resulting [[DataFrame]].
+   *
    * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    *
@@ -143,6 +152,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * Uses a `default` support of 1%.
    *
+   * This function is meant for exploratory data analysis, as we make no guarantee about the
+   * backward compatibility of the schema of the resulting [[DataFrame]].
+   *
    * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    *

From 91f6be87bc5cff41ca7a9cca9fdcc4678a4e7086 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Mon, 1 Jun 2015 21:33:57 -0700
Subject: [PATCH 305/525] [SPARK-8020] Spark SQL in spark-defaults.conf make
 metadataHive get constructed too early

https://issues.apache.org/jira/browse/SPARK-8020

Author: Yin Huai <yhuai@databricks.com>

Closes #6563 from yhuai/SPARK-8020 and squashes the following commits:

4e5addc [Yin Huai] style
bf766c6 [Yin Huai] Failed test.
0398f5b [Yin Huai] First populate the SQLConf and then construct executionHive and metadataHive.
---
 .../org/apache/spark/sql/SQLContext.scala     | 25 +++++++++--
 .../spark/sql/hive/client/VersionsSuite.scala | 45 ++++++++++++++++++-
 2 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 7384b24c50b16..91e6385dec81b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -182,9 +182,28 @@ class SQLContext(@transient val sparkContext: SparkContext)
     conf.dialect
   }
 
-  sparkContext.getConf.getAll.foreach {
-    case (key, value) if key.startsWith("spark.sql") => setConf(key, value)
-    case _ =>
+  {
+    // We extract spark sql settings from SparkContext's conf and put them to
+    // Spark SQL's conf.
+    // First, we populate the SQLConf (conf). So, we can make sure that other values using
+    // those settings in their construction can get the correct settings.
+    // For example, metadataHive in HiveContext may need both spark.sql.hive.metastore.version
+    // and spark.sql.hive.metastore.jars to get correctly constructed.
+    val properties = new Properties
+    sparkContext.getConf.getAll.foreach {
+      case (key, value) if key.startsWith("spark.sql") => properties.setProperty(key, value)
+      case _ =>
+    }
+    // We directly put those settings to conf to avoid of calling setConf, which may have
+    // side-effects. For example, in HiveContext, setConf may cause executionHive and metadataHive
+    // get constructed. If we call setConf directly, the constructed metadataHive may have
+    // wrong settings, or the construction may fail.
+    conf.setConf(properties)
+    // After we have populated SQLConf, we call setConf to populate other confs in the subclass
+    // (e.g. hiveconf in HiveContext).
+    properties.foreach {
+      case (key, value) => setConf(key, value)
+    }
   }
 
   @transient
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index 7eb4842726665..deceb67d2b966 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -17,7 +17,8 @@
 
 package org.apache.spark.sql.hive.client
 
-import org.apache.spark.{Logging, SparkFunSuite}
+import org.apache.spark.sql.hive.HiveContext
+import org.apache.spark.{Logging, SparkConf, SparkContext, SparkFunSuite}
 import org.apache.spark.sql.catalyst.util.quietly
 import org.apache.spark.util.Utils
 
@@ -37,6 +38,48 @@ class VersionsSuite extends SparkFunSuite with Logging {
       "hive.metastore.warehouse.dir" -> warehousePath.toString)
   }
 
+  test("SPARK-8020: successfully create a HiveContext with metastore settings in Spark conf.") {
+    val sparkConf =
+      new SparkConf() {
+        // We are not really clone it. We need to keep the custom getAll.
+        override def clone: SparkConf = this
+
+        override def getAll: Array[(String, String)] = {
+          val allSettings = super.getAll
+          val metastoreVersion = get("spark.sql.hive.metastore.version")
+          val metastoreJars = get("spark.sql.hive.metastore.jars")
+
+          val others = allSettings.filterNot { case (key, _) =>
+            key == "spark.sql.hive.metastore.version" || key == "spark.sql.hive.metastore.jars"
+          }
+
+          // Put metastore.version to the first one. It is needed to trigger the exception
+          // caused by SPARK-8020. Other problems triggered by SPARK-8020
+          // (e.g. using Hive 0.13.1's metastore client to connect to the a 0.12 metastore)
+          // are not easy to test.
+          Array(
+            ("spark.sql.hive.metastore.version" -> metastoreVersion),
+            ("spark.sql.hive.metastore.jars" -> metastoreJars)) ++ others
+        }
+      }
+    sparkConf
+      .set("spark.sql.hive.metastore.version", "12")
+      .set("spark.sql.hive.metastore.jars", "maven")
+
+    val hiveContext = new HiveContext(
+      new SparkContext(
+        "local[2]",
+        "TestSQLContextInVersionsSuite",
+        sparkConf)) {
+
+      protected override def configure(): Map[String, String] = buildConf
+
+    }
+
+    // Make sure all metastore related lazy vals got created.
+    hiveContext.tables()
+  }
+
   test("success sanity check") {
     val badClient = IsolatedClientLoader.forVersion("13", buildConf()).client
     val db = new HiveDatabase("default", "")

From 75dda33f3e037d550c4ab55d438661070804c717 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 1 Jun 2015 21:35:55 -0700
Subject: [PATCH 306/525] Revert "[SPARK-8020] Spark SQL in spark-defaults.conf
 make metadataHive get constructed too early"

This reverts commit 91f6be87bc5cff41ca7a9cca9fdcc4678a4e7086.
---
 .../org/apache/spark/sql/SQLContext.scala     | 25 ++---------
 .../spark/sql/hive/client/VersionsSuite.scala | 45 +------------------
 2 files changed, 4 insertions(+), 66 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 91e6385dec81b..7384b24c50b16 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -182,28 +182,9 @@ class SQLContext(@transient val sparkContext: SparkContext)
     conf.dialect
   }
 
-  {
-    // We extract spark sql settings from SparkContext's conf and put them to
-    // Spark SQL's conf.
-    // First, we populate the SQLConf (conf). So, we can make sure that other values using
-    // those settings in their construction can get the correct settings.
-    // For example, metadataHive in HiveContext may need both spark.sql.hive.metastore.version
-    // and spark.sql.hive.metastore.jars to get correctly constructed.
-    val properties = new Properties
-    sparkContext.getConf.getAll.foreach {
-      case (key, value) if key.startsWith("spark.sql") => properties.setProperty(key, value)
-      case _ =>
-    }
-    // We directly put those settings to conf to avoid of calling setConf, which may have
-    // side-effects. For example, in HiveContext, setConf may cause executionHive and metadataHive
-    // get constructed. If we call setConf directly, the constructed metadataHive may have
-    // wrong settings, or the construction may fail.
-    conf.setConf(properties)
-    // After we have populated SQLConf, we call setConf to populate other confs in the subclass
-    // (e.g. hiveconf in HiveContext).
-    properties.foreach {
-      case (key, value) => setConf(key, value)
-    }
+  sparkContext.getConf.getAll.foreach {
+    case (key, value) if key.startsWith("spark.sql") => setConf(key, value)
+    case _ =>
   }
 
   @transient
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index deceb67d2b966..7eb4842726665 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.sql.hive.client
 
-import org.apache.spark.sql.hive.HiveContext
-import org.apache.spark.{Logging, SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.sql.catalyst.util.quietly
 import org.apache.spark.util.Utils
 
@@ -38,48 +37,6 @@ class VersionsSuite extends SparkFunSuite with Logging {
       "hive.metastore.warehouse.dir" -> warehousePath.toString)
   }
 
-  test("SPARK-8020: successfully create a HiveContext with metastore settings in Spark conf.") {
-    val sparkConf =
-      new SparkConf() {
-        // We are not really clone it. We need to keep the custom getAll.
-        override def clone: SparkConf = this
-
-        override def getAll: Array[(String, String)] = {
-          val allSettings = super.getAll
-          val metastoreVersion = get("spark.sql.hive.metastore.version")
-          val metastoreJars = get("spark.sql.hive.metastore.jars")
-
-          val others = allSettings.filterNot { case (key, _) =>
-            key == "spark.sql.hive.metastore.version" || key == "spark.sql.hive.metastore.jars"
-          }
-
-          // Put metastore.version to the first one. It is needed to trigger the exception
-          // caused by SPARK-8020. Other problems triggered by SPARK-8020
-          // (e.g. using Hive 0.13.1's metastore client to connect to the a 0.12 metastore)
-          // are not easy to test.
-          Array(
-            ("spark.sql.hive.metastore.version" -> metastoreVersion),
-            ("spark.sql.hive.metastore.jars" -> metastoreJars)) ++ others
-        }
-      }
-    sparkConf
-      .set("spark.sql.hive.metastore.version", "12")
-      .set("spark.sql.hive.metastore.jars", "maven")
-
-    val hiveContext = new HiveContext(
-      new SparkContext(
-        "local[2]",
-        "TestSQLContextInVersionsSuite",
-        sparkConf)) {
-
-      protected override def configure(): Map[String, String] = buildConf
-
-    }
-
-    // Make sure all metastore related lazy vals got created.
-    hiveContext.tables()
-  }
-
   test("success sanity check") {
     val badClient = IsolatedClientLoader.forVersion("13", buildConf()).client
     val db = new HiveDatabase("default", "")

From 7f74bb3bc6d29c53e67af6b6eec336f2d083322a Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Mon, 1 Jun 2015 21:36:49 -0700
Subject: [PATCH 307/525] [SPARK-8025][Streaming]Add JavaDoc style deprecation
 for deprecated Streaming methods

Scala `deprecated` annotation actually doesn't show up in JavaDoc.

Author: zsxwing <zsxwing@gmail.com>

Closes #6564 from zsxwing/SPARK-8025 and squashes the following commits:

2faa2bb [zsxwing] Add JavaDoc style deprecation for deprecated Streaming methods
---
 .../org/apache/spark/streaming/StreamingContext.scala     | 8 ++++++++
 .../spark/streaming/api/java/JavaStreamingContext.scala   | 7 +++++++
 .../org/apache/spark/streaming/dstream/DStream.scala      | 4 ++++
 3 files changed, 19 insertions(+)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 624a31ddc2b89..9cd9684d36404 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -271,6 +271,8 @@ class StreamingContext private[streaming] (
    * Create an input stream with any arbitrary user implemented receiver.
    * Find more details at: http://spark.apache.org/docs/latest/streaming-custom-receivers.html
    * @param receiver Custom implementation of Receiver
+   *
+   * @deprecated As of 1.0.0", replaced by `receiverStream`.
    */
   @deprecated("Use receiverStream", "1.0.0")
   def networkStream[T: ClassTag](receiver: Receiver[T]): ReceiverInputDStream[T] = {
@@ -617,6 +619,8 @@ class StreamingContext private[streaming] (
    * Wait for the execution to stop. Any exceptions that occurs during the execution
    * will be thrown in this thread.
    * @param timeout time to wait in milliseconds
+   *
+   * @deprecated As of 1.3.0, replaced by `awaitTerminationOrTimeout(Long)`.
    */
   @deprecated("Use awaitTerminationOrTimeout(Long) instead", "1.3.0")
   def awaitTermination(timeout: Long) {
@@ -741,6 +745,10 @@ object StreamingContext extends Logging {
     }
   }
 
+  /**
+   * @deprecated As of 1.3.0, replaced by implicit functions in the DStream companion object.
+   *             This is kept here only for backward compatibility.
+   */
   @deprecated("Replaced by implicit functions in the DStream companion object. This is " +
     "kept here only for backward compatibility.", "1.3.0")
   def toPairDStreamFunctions[K, V](stream: DStream[(K, V)])
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
index b639b94d5ca47..989e3a729ebc2 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
@@ -148,6 +148,9 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable {
   /** The underlying SparkContext */
   val sparkContext = new JavaSparkContext(ssc.sc)
 
+  /**
+   * @deprecated As of 0.9.0, replaced by `sparkContext`
+   */
   @deprecated("use sparkContext", "0.9.0")
   val sc: JavaSparkContext = sparkContext
 
@@ -619,6 +622,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable {
    * Wait for the execution to stop. Any exceptions that occurs during the execution
    * will be thrown in this thread.
    * @param timeout time to wait in milliseconds
+   * @deprecated As of 1.3.0, replaced by `awaitTerminationOrTimeout(Long)`.
    */
   @deprecated("Use awaitTerminationOrTimeout(Long) instead", "1.3.0")
   def awaitTermination(timeout: Long): Unit = {
@@ -677,6 +681,7 @@ object JavaStreamingContext {
    *
    * @param checkpointPath Checkpoint directory used in an earlier JavaStreamingContext program
    * @param factory        JavaStreamingContextFactory object to create a new JavaStreamingContext
+   * @deprecated As of 1.4.0, replaced by `getOrCreate` without JavaStreamingContextFactor.
    */
   @deprecated("use getOrCreate without JavaStreamingContextFactor", "1.4.0")
   def getOrCreate(
@@ -699,6 +704,7 @@ object JavaStreamingContext {
    * @param factory        JavaStreamingContextFactory object to create a new JavaStreamingContext
    * @param hadoopConf     Hadoop configuration if necessary for reading from any HDFS compatible
    *                       file system
+   * @deprecated As of 1.4.0, replaced by `getOrCreate` without JavaStreamingContextFactor.
    */
   @deprecated("use getOrCreate without JavaStreamingContextFactory", "1.4.0")
   def getOrCreate(
@@ -724,6 +730,7 @@ object JavaStreamingContext {
    *                       file system
    * @param createOnError  Whether to create a new JavaStreamingContext if there is an
    *                       error in reading checkpoint data.
+   * @deprecated As of 1.4.0, replaced by `getOrCreate` without JavaStreamingContextFactor.
    */
   @deprecated("use getOrCreate without JavaStreamingContextFactory", "1.4.0")
   def getOrCreate(
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
index 6efcc193bfccc..192aa6a139bcb 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
@@ -603,6 +603,8 @@ abstract class DStream[T: ClassTag] (
   /**
    * Apply a function to each RDD in this DStream. This is an output operator, so
    * 'this' DStream will be registered as an output stream and therefore materialized.
+   *
+   * @deprecated As of 0.9.0, replaced by `foreachRDD`.
    */
   @deprecated("use foreachRDD", "0.9.0")
   def foreach(foreachFunc: RDD[T] => Unit): Unit = ssc.withScope {
@@ -612,6 +614,8 @@ abstract class DStream[T: ClassTag] (
   /**
    * Apply a function to each RDD in this DStream. This is an output operator, so
    * 'this' DStream will be registered as an output stream and therefore materialized.
+   *
+   * @deprecated As of 0.9.0, replaced by `foreachRDD`.
    */
   @deprecated("use foreachRDD", "0.9.0")
   def foreach(foreachFunc: (RDD[T], Time) => Unit): Unit = ssc.withScope {

From e797dba58e8cafdd30683dd1e0263f00ce30ccc0 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Mon, 1 Jun 2015 21:40:17 -0700
Subject: [PATCH 308/525] [SPARK-7965] [SPARK-7972] [SQL] Handle expressions
 containing multiple window expressions and make parser match window frames in
 case insensitive way

JIRAs:
https://issues.apache.org/jira/browse/SPARK-7965
https://issues.apache.org/jira/browse/SPARK-7972

Author: Yin Huai <yhuai@databricks.com>

Closes #6524 from yhuai/7965-7972 and squashes the following commits:

c12c79c [Yin Huai] Add doc for returned value.
de64328 [Yin Huai] Address rxin's comments.
fc9b1ad [Yin Huai] wip
2996da4 [Yin Huai] scala style
20b65b7 [Yin Huai] Handle expressions containing multiple window expressions.
9568b21 [Yin Huai] case insensitive matches
41f633d [Yin Huai] Failed test case.
---
 .../sql/catalyst/analysis/Analyzer.scala      | 108 +++++++++++++-----
 .../org/apache/spark/sql/hive/HiveQl.scala    |  22 +++-
 .../sql/hive/execution/SQLQuerySuite.scala    |  36 ++++++
 3 files changed, 134 insertions(+), 32 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index df37889eedcf0..8e9fec70704e6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -633,10 +633,10 @@ class Analyzer(
    *    it into the plan tree.
    */
   object ExtractWindowExpressions extends Rule[LogicalPlan] {
-    def hasWindowFunction(projectList: Seq[NamedExpression]): Boolean =
+    private def hasWindowFunction(projectList: Seq[NamedExpression]): Boolean =
       projectList.exists(hasWindowFunction)
 
-    def hasWindowFunction(expr: NamedExpression): Boolean = {
+    private def hasWindowFunction(expr: NamedExpression): Boolean = {
       expr.find {
         case window: WindowExpression => true
         case _ => false
@@ -644,14 +644,24 @@ class Analyzer(
     }
 
     /**
-     * From a Seq of [[NamedExpression]]s, extract window expressions and
-     * other regular expressions.
+     * From a Seq of [[NamedExpression]]s, extract expressions containing window expressions and
+     * other regular expressions that do not contain any window expression. For example, for
+     * `col1, Sum(col2 + col3) OVER (PARTITION BY col4 ORDER BY col5)`, we will extract
+     * `col1`, `col2 + col3`, `col4`, and `col5` out and replace them appearances in
+     * the window expression as attribute references. So, the first returned value will be
+     * `[Sum(_w0) OVER (PARTITION BY _w1 ORDER BY _w2)]` and the second returned value will be
+     * [col1, col2 + col3 as _w0, col4 as _w1, col5 as _w2].
+     *
+     * @return (seq of expressions containing at lease one window expressions,
+     *          seq of non-window expressions)
      */
-    def extract(
+    private def extract(
         expressions: Seq[NamedExpression]): (Seq[NamedExpression], Seq[NamedExpression]) = {
-      // First, we simple partition the input expressions to two part, one having
-      // WindowExpressions and another one without WindowExpressions.
-      val (windowExpressions, regularExpressions) = expressions.partition(hasWindowFunction)
+      // First, we partition the input expressions to two part. For the first part,
+      // every expression in it contain at least one WindowExpression.
+      // Expressions in the second part do not have any WindowExpression.
+      val (expressionsWithWindowFunctions, regularExpressions) =
+        expressions.partition(hasWindowFunction)
 
       // Then, we need to extract those regular expressions used in the WindowExpression.
       // For example, when we have col1 - Sum(col2 + col3) OVER (PARTITION BY col4 ORDER BY col5),
@@ -660,8 +670,8 @@ class Analyzer(
       val extractedExprBuffer = new ArrayBuffer[NamedExpression]()
       def extractExpr(expr: Expression): Expression = expr match {
         case ne: NamedExpression =>
-          // If a named expression is not in regularExpressions, add extract it and replace it
-          // with an AttributeReference.
+          // If a named expression is not in regularExpressions, add it to
+          // extractedExprBuffer and replace it with an AttributeReference.
           val missingExpr =
             AttributeSet(Seq(expr)) -- (regularExpressions ++ extractedExprBuffer)
           if (missingExpr.nonEmpty) {
@@ -678,8 +688,9 @@ class Analyzer(
           withName.toAttribute
       }
 
-      // Now, we extract expressions from windowExpressions by using extractExpr.
-      val newWindowExpressions = windowExpressions.map {
+      // Now, we extract regular expressions from expressionsWithWindowFunctions
+      // by using extractExpr.
+      val newExpressionsWithWindowFunctions = expressionsWithWindowFunctions.map {
         _.transform {
           // Extracts children expressions of a WindowFunction (input parameters of
           // a WindowFunction).
@@ -705,37 +716,80 @@ class Analyzer(
         }.asInstanceOf[NamedExpression]
       }
 
-      (newWindowExpressions, regularExpressions ++ extractedExprBuffer)
-    }
+      (newExpressionsWithWindowFunctions, regularExpressions ++ extractedExprBuffer)
+    } // end of extract
 
     /**
      * Adds operators for Window Expressions. Every Window operator handles a single Window Spec.
      */
-    def addWindow(windowExpressions: Seq[NamedExpression], child: LogicalPlan): LogicalPlan = {
-      // First, we group window expressions based on their Window Spec.
-      val groupedWindowExpression = windowExpressions.groupBy { expr =>
-        val windowSpec = expr.collectFirst {
+    private def addWindow(
+        expressionsWithWindowFunctions: Seq[NamedExpression],
+        child: LogicalPlan): LogicalPlan = {
+      // First, we need to extract all WindowExpressions from expressionsWithWindowFunctions
+      // and put those extracted WindowExpressions to extractedWindowExprBuffer.
+      // This step is needed because it is possible that an expression contains multiple
+      // WindowExpressions with different Window Specs.
+      // After extracting WindowExpressions, we need to construct a project list to generate
+      // expressionsWithWindowFunctions based on extractedWindowExprBuffer.
+      // For example, for "sum(a) over (...) / sum(b) over (...)", we will first extract
+      // "sum(a) over (...)" and "sum(b) over (...)" out, and assign "_we0" as the alias to
+      // "sum(a) over (...)" and "_we1" as the alias to "sum(b) over (...)".
+      // Then, the projectList will be [_we0/_we1].
+      val extractedWindowExprBuffer = new ArrayBuffer[NamedExpression]()
+      val newExpressionsWithWindowFunctions = expressionsWithWindowFunctions.map {
+        // We need to use transformDown because we want to trigger
+        // "case alias @ Alias(window: WindowExpression, _)" first.
+        _.transformDown {
+          case alias @ Alias(window: WindowExpression, _) =>
+            // If a WindowExpression has an assigned alias, just use it.
+            extractedWindowExprBuffer += alias
+            alias.toAttribute
+          case window: WindowExpression =>
+            // If there is no alias assigned to the WindowExpressions. We create an
+            // internal column.
+            val withName = Alias(window, s"_we${extractedWindowExprBuffer.length}")()
+            extractedWindowExprBuffer += withName
+            withName.toAttribute
+        }.asInstanceOf[NamedExpression]
+      }
+
+      // Second, we group extractedWindowExprBuffer based on their Window Spec.
+      val groupedWindowExpressions = extractedWindowExprBuffer.groupBy { expr =>
+        val distinctWindowSpec = expr.collect {
           case window: WindowExpression => window.windowSpec
+        }.distinct
+
+        // We do a final check and see if we only have a single Window Spec defined in an
+        // expressions.
+        if (distinctWindowSpec.length == 0 ) {
+          failAnalysis(s"$expr does not have any WindowExpression.")
+        } else if (distinctWindowSpec.length > 1) {
+          // newExpressionsWithWindowFunctions only have expressions with a single
+          // WindowExpression. If we reach here, we have a bug.
+          failAnalysis(s"$expr has multiple Window Specifications ($distinctWindowSpec)." +
+            s"Please file a bug report with this error message, stack trace, and the query.")
+        } else {
+          distinctWindowSpec.head
         }
-        windowSpec.getOrElse(
-          failAnalysis(s"$windowExpressions does not have any WindowExpression."))
       }.toSeq
 
-      // For every Window Spec, we add a Window operator and set currentChild as the child of it.
+      // Third, for every Window Spec, we add a Window operator and set currentChild as the
+      // child of it.
       var currentChild = child
       var i = 0
-      while (i < groupedWindowExpression.size) {
-        val (windowSpec, windowExpressions) = groupedWindowExpression(i)
+      while (i < groupedWindowExpressions.size) {
+        val (windowSpec, windowExpressions) = groupedWindowExpressions(i)
         // Set currentChild to the newly created Window operator.
         currentChild = Window(currentChild.output, windowExpressions, windowSpec, currentChild)
 
-        // Move to next WindowExpression.
+        // Move to next Window Spec.
         i += 1
       }
 
-      // We return the top operator.
-      currentChild
-    }
+      // Finally, we create a Project to output currentChild's output
+      // newExpressionsWithWindowFunctions.
+      Project(currentChild.output ++ newExpressionsWithWindowFunctions, currentChild)
+    } // end of addWindow
 
     // We have to use transformDown at here to make sure the rule of
     // "Aggregate with Having clause" will be triggered.
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 253bf1125262e..a5ca3613c5e00 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -1561,6 +1561,10 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
          """.stripMargin)
   }
 
+  /* Case insensitive matches for Window Specification */
+  val PRECEDING = "(?i)preceding".r
+  val FOLLOWING = "(?i)following".r
+  val CURRENT = "(?i)current".r
   def nodesToWindowSpecification(nodes: Seq[ASTNode]): WindowSpec = nodes match {
     case Token(windowName, Nil) :: Nil =>
       // Refer to a window spec defined in the window clause.
@@ -1614,11 +1618,19 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
         } else {
           val frameType = rowFrame.map(_ => RowFrame).getOrElse(RangeFrame)
           def nodeToBoundary(node: Node): FrameBoundary = node match {
-            case Token("preceding", Token(count, Nil) :: Nil) =>
-              if (count == "unbounded") UnboundedPreceding else ValuePreceding(count.toInt)
-            case Token("following", Token(count, Nil) :: Nil) =>
-              if (count == "unbounded") UnboundedFollowing else ValueFollowing(count.toInt)
-            case Token("current", Nil) => CurrentRow
+            case Token(PRECEDING(), Token(count, Nil) :: Nil) =>
+              if (count.toLowerCase() == "unbounded") {
+                UnboundedPreceding
+              } else {
+                ValuePreceding(count.toInt)
+              }
+            case Token(FOLLOWING(), Token(count, Nil) :: Nil) =>
+              if (count.toLowerCase() == "unbounded") {
+                UnboundedFollowing
+              } else {
+                ValueFollowing(count.toInt)
+              }
+            case Token(CURRENT(), Nil) => CurrentRow
             case _ =>
               throw new NotImplementedError(
                 s"""No parse rules for the Window Frame Boundary based on Node ${node.getName}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 27863a60145d7..aba3becb1bce2 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -780,6 +780,42 @@ class SQLQuerySuite extends QueryTest {
       ).map(i => Row(i._1, i._2, i._3, i._4)))
   }
 
+  test("window function: multiple window expressions in a single expression") {
+    val nums = sparkContext.parallelize(1 to 10).map(x => (x, x % 2)).toDF("x", "y")
+    nums.registerTempTable("nums")
+
+    val expected =
+      Row(1, 1, 1, 55, 1, 57) ::
+      Row(0, 2, 3, 55, 2, 60) ::
+      Row(1, 3, 6, 55, 4, 65) ::
+      Row(0, 4, 10, 55, 6, 71) ::
+      Row(1, 5, 15, 55, 9, 79) ::
+      Row(0, 6, 21, 55, 12, 88) ::
+      Row(1, 7, 28, 55, 16, 99) ::
+      Row(0, 8, 36, 55, 20, 111) ::
+      Row(1, 9, 45, 55, 25, 125) ::
+      Row(0, 10, 55, 55, 30, 140) :: Nil
+
+    val actual = sql(
+      """
+        |SELECT
+        |  y,
+        |  x,
+        |  sum(x) OVER w1 AS running_sum,
+        |  sum(x) OVER w2 AS total_sum,
+        |  sum(x) OVER w3 AS running_sum_per_y,
+        |  ((sum(x) OVER w1) + (sum(x) OVER w2) + (sum(x) OVER w3)) as combined2
+        |FROM nums
+        |WINDOW w1 AS (ORDER BY x ROWS BETWEEN UnBOUNDED PRECEDiNG AND CuRRENT RoW),
+        |       w2 AS (ORDER BY x ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOuNDED FoLLOWING),
+        |       w3 AS (PARTITION BY y ORDER BY x ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+      """.stripMargin)
+
+    checkAnswer(actual, expected)
+
+    dropTempTable("nums")
+  }
+
   test("test case key when") {
     (1 to 5).map(i => (i, i.toString)).toDF("k", "v").registerTempTable("t")
     checkAnswer(

From b53a0116473a03607c5be3e4135151b4932acc06 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 1 Jun 2015 21:41:53 -0700
Subject: [PATCH 309/525] Fixed typo in the previous commit.

---
 .../scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 8e9fec70704e6..bc17169f35a46 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -647,7 +647,7 @@ class Analyzer(
      * From a Seq of [[NamedExpression]]s, extract expressions containing window expressions and
      * other regular expressions that do not contain any window expression. For example, for
      * `col1, Sum(col2 + col3) OVER (PARTITION BY col4 ORDER BY col5)`, we will extract
-     * `col1`, `col2 + col3`, `col4`, and `col5` out and replace them appearances in
+     * `col1`, `col2 + col3`, `col4`, and `col5` out and replace their appearances in
      * the window expression as attribute references. So, the first returned value will be
      * `[Sum(_w0) OVER (PARTITION BY _w1 ORDER BY _w2)]` and the second returned value will be
      * [col1, col2 + col3 as _w0, col4 as _w1, col5 as _w2].

From 0221c7f0efe2512f3ae3839b83aa8abb0806d516 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Mon, 1 Jun 2015 22:03:29 -0700
Subject: [PATCH 310/525] [SPARK-7582] [MLLIB] user guide for StringIndexer

This PR adds a Java unit test and user guide for `StringIndexer`. I put it before `OneHotEncoder` because they are closely related. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6561 from mengxr/SPARK-7582 and squashes the following commits:

4bba4f1 [Xiangrui Meng] fix example
ba1cd1b [Xiangrui Meng] fix style
7fa18d1 [Xiangrui Meng] add user guide for StringIndexer
136cb93 [Xiangrui Meng] add a Java unit test for StringIndexer
---
 docs/ml-features.md                           | 116 ++++++++++++++++++
 .../ml/feature/JavaStringIndexerSuite.java    |  77 ++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 9ee5696122717..f88c0248c1a8a 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -456,6 +456,122 @@ for expanded in polyDF.select("polyFeatures").take(3):
 </div>
 </div>
 
+## StringIndexer
+
+`StringIndexer` encodes a string column of labels to a column of label indices.
+The indices are in `[0, numLabels)`, ordered by label frequencies.
+So the most frequent label gets index `0`.
+If the input column is numeric, we cast it to string and index the string values.
+
+**Examples**
+
+Assume that we have the following DataFrame with columns `id` and `category`:
+
+~~~~
+ id | category
+----|----------
+ 0  | a
+ 1  | b
+ 2  | c
+ 3  | a
+ 4  | a
+ 5  | c
+~~~~
+
+`category` is a string column with three labels: "a", "b", and "c".
+Applying `StringIndexer` with `category` as the input column and `categoryIndex` as the output
+column, we should get the following:
+
+~~~~
+ id | category | categoryIndex
+----|----------|---------------
+ 0  | a        | 0.0
+ 1  | b        | 2.0
+ 2  | c        | 1.0
+ 3  | a        | 0.0
+ 4  | a        | 0.0
+ 5  | c        | 1.0
+~~~~
+
+"a" gets index `0` because it is the most frequent, followed by "c" with index `1` and "b" with
+index `2`.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+
+[`StringIndexer`](api/scala/index.html#org.apache.spark.ml.feature.StringIndexer) takes an input
+column name and an output column name.
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.StringIndexer
+
+val df = sqlContext.createDataFrame(
+  Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+).toDF("id", "category")
+val indexer = new StringIndexer()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex")
+val indexed = indexer.fit(df).transform(df)
+indexed.show()
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+[`StringIndexer`](api/java/org/apache/spark/ml/feature/StringIndexer.html) takes an input column
+name and an output column name.
+
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+  RowFactory.create(0, "a"),
+  RowFactory.create(1, "b"),
+  RowFactory.create(2, "c"),
+  RowFactory.create(3, "a"),
+  RowFactory.create(4, "a"),
+  RowFactory.create(5, "c")
+));
+StructType schema = new StructType(new StructField[] {
+  createStructField("id", DoubleType, false),
+  createStructField("category", StringType, false)
+});
+DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+StringIndexer indexer = new StringIndexer()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex");
+DataFrame indexed = indexer.fit(df).transform(df);
+indexed.show();
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+[`StringIndexer`](api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer) takes an input
+column name and an output column name.
+
+{% highlight python %}
+from pyspark.ml.feature import StringIndexer
+
+df = sqlContext.createDataFrame(
+    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
+    ["id", "category"])
+indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+indexed = indexer.fit(df).transform(df)
+indexed.show()
+{% endhighlight %}
+</div>
+</div>
+
 ## OneHotEncoder
 
 [One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features 
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java
new file mode 100644
index 0000000000000..35b18c5308f61
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import java.util.Arrays;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+public class JavaStringIndexerSuite {
+  private transient JavaSparkContext jsc;
+  private transient SQLContext sqlContext;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaStringIndexerSuite");
+    sqlContext = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    sqlContext = null;
+  }
+
+  @Test
+  public void testStringIndexer() {
+    StructType schema = createStructType(new StructField[] {
+      createStructField("id", IntegerType, false),
+      createStructField("label", StringType, false)
+    });
+    JavaRDD<Row> rdd = jsc.parallelize(
+      Arrays.asList(c(0, "a"), c(1, "b"), c(2, "c"), c(3, "a"), c(4, "a"), c(5, "c")));
+    DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+
+    StringIndexer indexer = new StringIndexer()
+      .setInputCol("label")
+      .setOutputCol("labelIndex");
+    DataFrame output = indexer.fit(dataset).transform(dataset);
+
+    Assert.assertArrayEquals(
+      new Row[] { c(0, 0.0), c(1, 2.0), c(2, 1.0), c(3, 0.0), c(4, 0.0), c(5, 1.0) },
+      output.orderBy("id").select("id", "labelIndex").collect());
+  }
+
+  /** An alias for RowFactory.create. */
+  private Row c(Object... values) {
+    return RowFactory.create(values);
+  }
+}

From bcb47ad7718b843fbd25cd1e228a7b7e6e5b8686 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 1 Jun 2015 23:12:29 -0700
Subject: [PATCH 311/525] [SPARK-6917] [SQL] DecimalType is not read back when
 non-native type exists

cc yhuai

Author: Davies Liu <davies@databricks.com>

Closes #6558 from davies/decimalType and squashes the following commits:

c877ca8 [Davies Liu] Update ParquetConverter.scala
48cc57c [Davies Liu] Update ParquetConverter.scala
b43845c [Davies Liu] add test
3b4a94f [Davies Liu] DecimalType is not read back when non-native type exists
---
 .../apache/spark/sql/parquet/ParquetConverter.scala |  4 +++-
 .../spark/sql/parquet/ParquetQuerySuite.scala       | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index 1b4196ab0be35..caa9f045537d0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -243,8 +243,10 @@ private[parquet] abstract class CatalystConverter extends GroupConverter {
   /**
    * Read a decimal value from a Parquet Binary into "dest". Only supports decimals that fit in
    * a long (i.e. precision <= 18)
+   *
+   * Returned value is needed by CatalystConverter, which doesn't reuse the Decimal object.
    */
-  protected[parquet] def readDecimal(dest: Decimal, value: Binary, ctype: DecimalType): Unit = {
+  protected[parquet] def readDecimal(dest: Decimal, value: Binary, ctype: DecimalType): Decimal = {
     val precision = ctype.precisionInfo.get.precision
     val scale = ctype.precisionInfo.get.scale
     val bytes = value.getBytes
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
index b98ba09ccfc2d..304936fb2be8e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.parquet
 
 import org.scalatest.BeforeAndAfterAll
 
+import org.apache.spark.sql.types._
 import org.apache.spark.sql.{SQLConf, QueryTest}
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.test.TestSQLContext
@@ -111,6 +112,18 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
         List(Row("same", "run_5", 100)))
     }
   }
+
+  test("SPARK-6917 DecimalType should work with non-native types") {
+    val data = (1 to 10).map(i => Row(Decimal(i, 18, 0), new java.sql.Timestamp(i)))
+    val schema = StructType(List(StructField("d", DecimalType(18, 0), false),
+      StructField("time", TimestampType, false)).toArray)
+    withTempPath { file =>
+      val df = sqlContext.createDataFrame(sparkContext.parallelize(data), schema)
+      df.write.parquet(file.getCanonicalPath)
+      val df2 = sqlContext.read.parquet(file.getCanonicalPath)
+      checkAnswer(df2, df.collect().toSeq)
+    }
+  }
 }
 
 class ParquetDataSourceOnQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll {

From 7b7f7b6c6fd903e2ecfc886d29eaa9df58adcfc3 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Tue, 2 Jun 2015 00:16:56 -0700
Subject: [PATCH 312/525] [SPARK-8020] [SQL] Spark SQL conf in
 spark-defaults.conf make metadataHive get constructed too early

https://issues.apache.org/jira/browse/SPARK-8020

Author: Yin Huai <yhuai@databricks.com>

Closes #6571 from yhuai/SPARK-8020-1 and squashes the following commits:

0398f5b [Yin Huai] First populate the SQLConf and then construct executionHive and metadataHive.
---
 .../org/apache/spark/sql/SQLContext.scala     | 25 ++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 7384b24c50b16..91e6385dec81b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -182,9 +182,28 @@ class SQLContext(@transient val sparkContext: SparkContext)
     conf.dialect
   }
 
-  sparkContext.getConf.getAll.foreach {
-    case (key, value) if key.startsWith("spark.sql") => setConf(key, value)
-    case _ =>
+  {
+    // We extract spark sql settings from SparkContext's conf and put them to
+    // Spark SQL's conf.
+    // First, we populate the SQLConf (conf). So, we can make sure that other values using
+    // those settings in their construction can get the correct settings.
+    // For example, metadataHive in HiveContext may need both spark.sql.hive.metastore.version
+    // and spark.sql.hive.metastore.jars to get correctly constructed.
+    val properties = new Properties
+    sparkContext.getConf.getAll.foreach {
+      case (key, value) if key.startsWith("spark.sql") => properties.setProperty(key, value)
+      case _ =>
+    }
+    // We directly put those settings to conf to avoid of calling setConf, which may have
+    // side-effects. For example, in HiveContext, setConf may cause executionHive and metadataHive
+    // get constructed. If we call setConf directly, the constructed metadataHive may have
+    // wrong settings, or the construction may fail.
+    conf.setConf(properties)
+    // After we have populated SQLConf, we call setConf to populate other confs in the subclass
+    // (e.g. hiveconf in HiveContext).
+    properties.foreach {
+      case (key, value) => setConf(key, value)
+    }
   }
 
   @transient

From 0f80990bfac1e9969644952d1d8edaf7d26fb436 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Tue, 2 Jun 2015 00:20:52 -0700
Subject: [PATCH 313/525] [SPARK-8023][SQL] Add "deterministic" attribute to
 Expression to avoid collapsing nondeterministic projects.

This closes #6570.

Author: Yin Huai <yhuai@databricks.com>
Author: Reynold Xin <rxin@databricks.com>

Closes #6573 from rxin/deterministic and squashes the following commits:

356cd22 [Reynold Xin] Added unit test for the optimizer.
da3fde1 [Reynold Xin] Merge pull request #6570 from yhuai/SPARK-8023
da56200 [Yin Huai] Comments.
e38f264 [Yin Huai] Comment.
f9d6a73 [Yin Huai] Add a deterministic method to Expression.
---
 .../sql/catalyst/expressions/Expression.scala |  8 ++
 .../sql/catalyst/expressions/random.scala     |  2 +
 .../sql/catalyst/optimizer/Optimizer.scala    | 11 ++-
 .../optimizer/ProjectCollapsingSuite.scala    | 73 +++++++++++++++++++
 .../spark/sql/ColumnExpressionSuite.scala     | 41 ++++++++++-
 .../org/apache/spark/sql/hive/hiveUdfs.scala  |  4 +
 6 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ProjectCollapsingSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index d19928784442e..adc6505d69cdf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -37,7 +37,15 @@ abstract class Expression extends TreeNode[Expression] {
    *  - A [[Cast]] or [[UnaryMinus]] is foldable if its child is foldable
    */
   def foldable: Boolean = false
+
+  /**
+   * Returns true when the current expression always return the same result for fixed input values.
+   */
+  // TODO: Need to define explicit input values vs implicit input values.
+  def deterministic: Boolean = true
+
   def nullable: Boolean
+
   def references: AttributeSet = AttributeSet(children.flatMap(_.references.iterator))
 
   /** Returns the result of evaluating this expression on a given input Row */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 4f4f67a6e482c..b2647124c4e49 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -38,6 +38,8 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
    */
   @transient protected lazy val rng = new XORShiftRandom(seed + TaskContext.get().partitionId())
 
+  override def deterministic: Boolean = false
+
   override def nullable: Boolean = false
 
   override def dataType: DataType = DoubleType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index c2818d957cc79..b25fb48f55e2b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -179,8 +179,17 @@ object ColumnPruning extends Rule[LogicalPlan] {
  * expressions into one single expression.
  */
 object ProjectCollapsing extends Rule[LogicalPlan] {
+
+  /** Returns true if any expression in projectList is non-deterministic. */
+  private def hasNondeterministic(projectList: Seq[NamedExpression]): Boolean = {
+    projectList.exists(expr => expr.find(!_.deterministic).isDefined)
+  }
+
   def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
-    case Project(projectList1, Project(projectList2, child)) =>
+    // We only collapse these two Projects if the child Project's expressions are all
+    // deterministic.
+    case Project(projectList1, Project(projectList2, child))
+         if !hasNondeterministic(projectList2) =>
       // Create a map of Aliases to their values from the child projection.
       // e.g., 'SELECT ... FROM (SELECT a + b AS c, d ...)' produces Map(c -> Alias(a + b, c)).
       val aliasMap = AttributeMap(projectList2.collect {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ProjectCollapsingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ProjectCollapsingSuite.scala
new file mode 100644
index 0000000000000..151654bffbd66
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ProjectCollapsingSuite.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions.Rand
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+
+
+class ProjectCollapsingSuite extends PlanTest {
+  object Optimize extends RuleExecutor[LogicalPlan] {
+    val batches =
+      Batch("Subqueries", FixedPoint(10), EliminateSubQueries) ::
+        Batch("ProjectCollapsing", Once, ProjectCollapsing) :: Nil
+  }
+
+  val testRelation = LocalRelation('a.int, 'b.int)
+
+  test("collapse two deterministic, independent projects into one") {
+    val query = testRelation
+      .select(('a + 1).as('a_plus_1), 'b)
+      .select('a_plus_1, ('b + 1).as('b_plus_1))
+
+    val optimized = Optimize.execute(query.analyze)
+    val correctAnswer = testRelation.select(('a + 1).as('a_plus_1), ('b + 1).as('b_plus_1)).analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("collapse two deterministic, dependent projects into one") {
+    val query = testRelation
+      .select(('a + 1).as('a_plus_1), 'b)
+      .select(('a_plus_1 + 1).as('a_plus_2), 'b)
+
+    val optimized = Optimize.execute(query.analyze)
+
+    val correctAnswer = testRelation.select(
+      (('a + 1).as('a_plus_1) + 1).as('a_plus_2),
+      'b).analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("do not collapse nondeterministic projects") {
+    val query = testRelation
+      .select(Rand(10).as('rand))
+      .select(('rand + 1).as('rand1), ('rand + 2).as('rand2))
+
+    val optimized = Optimize.execute(query.analyze)
+    val correctAnswer = query.analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index b8bb1bff9ea72..bfba379d9a518 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql
 
 import org.scalatest.Matchers._
 
+import org.apache.spark.sql.execution.Project
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.test.TestSQLContext.implicits._
@@ -452,13 +453,51 @@ class ColumnExpressionSuite extends QueryTest {
   }
 
   test("rand") {
-    val randCol = testData.select('key, rand(5L).as("rand"))
+    val randCol = testData.select($"key", rand(5L).as("rand"))
     randCol.columns.length should be (2)
     val rows = randCol.collect()
     rows.foreach { row =>
       assert(row.getDouble(1) <= 1.0)
       assert(row.getDouble(1) >= 0.0)
     }
+
+    def checkNumProjects(df: DataFrame, expectedNumProjects: Int): Unit = {
+      val projects = df.queryExecution.executedPlan.collect {
+        case project: Project => project
+      }
+      assert(projects.size === expectedNumProjects)
+    }
+
+    // We first create a plan with two Projects.
+    // Project [rand + 1 AS rand1, rand - 1 AS rand2]
+    //   Project [key, (Rand 5 + 1) AS rand]
+    //     LogicalRDD [key, value]
+    // Because Rand function is not deterministic, the column rand is not deterministic.
+    // So, in the optimizer, we will not collapse Project [rand + 1 AS rand1, rand - 1 AS rand2]
+    // and Project [key, Rand 5 AS rand]. The final plan still has two Projects.
+    val dfWithTwoProjects =
+      testData
+        .select($"key", (rand(5L) + 1).as("rand"))
+        .select(($"rand" + 1).as("rand1"), ($"rand" - 1).as("rand2"))
+    checkNumProjects(dfWithTwoProjects, 2)
+
+    // Now, we add one more project rand1 - rand2 on top of the query plan.
+    // Since rand1 and rand2 are deterministic (they basically apply +/- to the generated
+    // rand value), we can collapse rand1 - rand2 to the Project generating rand1 and rand2.
+    // So, the plan will be optimized from ...
+    // Project [(rand1 - rand2) AS (rand1 - rand2)]
+    //   Project [rand + 1 AS rand1, rand - 1 AS rand2]
+    //     Project [key, (Rand 5 + 1) AS rand]
+    //       LogicalRDD [key, value]
+    // to ...
+    // Project [((rand + 1 AS rand1) - (rand - 1 AS rand2)) AS (rand1 - rand2)]
+    //   Project [key, Rand 5 AS rand]
+    //     LogicalRDD [key, value]
+    val dfWithThreeProjects = dfWithTwoProjects.select($"rand1" - $"rand2")
+    checkNumProjects(dfWithThreeProjects, 2)
+    dfWithThreeProjects.collect().foreach { row =>
+      assert(row.getDouble(0) === 2.0 +- 0.0001)
+    }
   }
 
   test("randn") {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index 64a49c83cbad1..1658bb93b0b79 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -78,6 +78,8 @@ private[hive] case class HiveSimpleUdf(funcWrapper: HiveFunctionWrapper, childre
 
   type UDFType = UDF
 
+  override def deterministic: Boolean = isUDFDeterministic
+
   override def nullable: Boolean = true
 
   @transient
@@ -140,6 +142,8 @@ private[hive] case class HiveGenericUdf(funcWrapper: HiveFunctionWrapper, childr
   extends Expression with HiveInspectors with Logging {
   type UDFType = GenericUDF
 
+  override def deterministic: Boolean = isUDFDeterministic
+
   override def nullable: Boolean = true
 
   @transient

From 445647a1a36e1e24076a9fe506492fac462c66ad Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Tue, 2 Jun 2015 08:37:18 -0700
Subject: [PATCH 314/525] [SPARK-8021] [SQL] [PYSPARK] make Python read/write
 API consistent with Scala

add schema()/format()/options() for reader,  add mode()/format()/options()/partitionBy() for writer

cc rxin yhuai  pwendell

Author: Davies Liu <davies@databricks.com>

Closes #6578 from davies/readwrite and squashes the following commits:

720d293 [Davies Liu] address comments
b65dfa2 [Davies Liu] Update readwriter.py
1299ab6 [Davies Liu] make Python API consistent with Scala
---
 python/pyspark/sql/readwriter.py | 121 ++++++++++++++++++++++++-------
 1 file changed, 94 insertions(+), 27 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index b6fd413bec7db..d17d87419fe3d 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -43,6 +43,39 @@ def _df(self, jdf):
         from pyspark.sql.dataframe import DataFrame
         return DataFrame(jdf, self._sqlContext)
 
+    @since(1.4)
+    def format(self, source):
+        """
+        Specifies the input data source format.
+        """
+        self._jreader = self._jreader.format(source)
+        return self
+
+    @since(1.4)
+    def schema(self, schema):
+        """
+        Specifies the input schema. Some data sources (e.g. JSON) can
+        infer the input schema automatically from data. By specifying
+        the schema here, the underlying data source can skip the schema
+        inference step, and thus speed up data loading.
+
+        :param schema: a StructType object
+        """
+        if not isinstance(schema, StructType):
+            raise TypeError("schema should be StructType")
+        jschema = self._sqlContext._ssql_ctx.parseDataType(schema.json())
+        self._jreader = self._jreader.schema(jschema)
+        return self
+
+    @since(1.4)
+    def options(self, **options):
+        """
+        Adds input options for the underlying data source.
+        """
+        for k in options:
+            self._jreader = self._jreader.option(k, options[k])
+        return self
+
     @since(1.4)
     def load(self, path=None, format=None, schema=None, **options):
         """Loads data from a data source and returns it as a :class`DataFrame`.
@@ -52,20 +85,15 @@ def load(self, path=None, format=None, schema=None, **options):
         :param schema: optional :class:`StructType` for the input schema.
         :param options: all other string options
         """
-        jreader = self._jreader
         if format is not None:
-            jreader = jreader.format(format)
+            self.format(format)
         if schema is not None:
-            if not isinstance(schema, StructType):
-                raise TypeError("schema should be StructType")
-            jschema = self._sqlContext._ssql_ctx.parseDataType(schema.json())
-            jreader = jreader.schema(jschema)
-        for k in options:
-            jreader = jreader.option(k, options[k])
+            self.schema(schema)
+        self.options(**options)
         if path is not None:
-            return self._df(jreader.load(path))
+            return self._df(self._jreader.load(path))
         else:
-            return self._df(jreader.load())
+            return self._df(self._jreader.load())
 
     @since(1.4)
     def json(self, path, schema=None):
@@ -105,12 +133,9 @@ def json(self, path, schema=None):
          |    |-- field5: array (nullable = true)
          |    |    |-- element: integer (containsNull = true)
         """
-        if schema is None:
-            jdf = self._jreader.json(path)
-        else:
-            jschema = self._sqlContext._ssql_ctx.parseDataType(schema.json())
-            jdf = self._jreader.schema(jschema).json(path)
-        return self._df(jdf)
+        if schema is not None:
+            self.schema(schema)
+        return self._df(self._jreader.json(path))
 
     @since(1.4)
     def table(self, tableName):
@@ -194,6 +219,51 @@ def __init__(self, df):
         self._sqlContext = df.sql_ctx
         self._jwrite = df._jdf.write()
 
+    @since(1.4)
+    def mode(self, saveMode):
+        """
+        Specifies the behavior when data or table already exists. Options include:
+
+        * `append`: Append contents of this :class:`DataFrame` to existing data.
+        * `overwrite`: Overwrite existing data.
+        * `error`: Throw an exception if data already exists.
+        * `ignore`: Silently ignore this operation if data already exists.
+        """
+        self._jwrite = self._jwrite.mode(saveMode)
+        return self
+
+    @since(1.4)
+    def format(self, source):
+        """
+        Specifies the underlying output data source. Built-in options include
+        "parquet", "json", etc.
+        """
+        self._jwrite = self._jwrite.format(source)
+        return self
+
+    @since(1.4)
+    def options(self, **options):
+        """
+        Adds output options for the underlying data source.
+        """
+        for k in options:
+            self._jwrite = self._jwrite.option(k, options[k])
+        return self
+
+    @since(1.4)
+    def partitionBy(self, *cols):
+        """
+        Partitions the output by the given columns on the file system.
+        If specified, the output is laid out on the file system similar
+        to Hive's partitioning scheme.
+
+        :param cols: name of columns
+        """
+        if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
+            cols = cols[0]
+        self._jwrite = self._jwrite.partitionBy(_to_seq(self._sqlContext._sc, cols))
+        return self
+
     @since(1.4)
     def save(self, path=None, format=None, mode="error", **options):
         """
@@ -216,16 +286,15 @@ def save(self, path=None, format=None, mode="error", **options):
         :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
         :param options: all other string options
         """
-        jwrite = self._jwrite.mode(mode)
+        self.mode(mode).options(**options)
         if format is not None:
-            jwrite = jwrite.format(format)
-        for k in options:
-            jwrite = jwrite.option(k, options[k])
+            self.format(format)
         if path is None:
-            jwrite.save()
+            self._jwrite.save()
         else:
-            jwrite.save(path)
+            self._jwrite.save(path)
 
+    @since(1.4)
     def insertInto(self, tableName, overwrite=False):
         """
         Inserts the content of the :class:`DataFrame` to the specified table.
@@ -256,12 +325,10 @@ def saveAsTable(self, name, format=None, mode="error", **options):
         :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
         :param options: all other string options
         """
-        jwrite = self._jwrite.mode(mode)
+        self.mode(mode).options(**options)
         if format is not None:
-            jwrite = jwrite.format(format)
-        for k in options:
-            jwrite = jwrite.option(k, options[k])
-        return jwrite.saveAsTable(name)
+            self.format(format)
+        return self._jwrite.saveAsTable(name)
 
     @since(1.4)
     def json(self, path, mode="error"):

From bd97840d5ccc3f0bfde1e5cfc7abeac9681997ab Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Tue, 2 Jun 2015 08:51:00 -0700
Subject: [PATCH 315/525] [SPARK-7432] [MLLIB] fix flaky CrossValidator doctest

The new test uses CV to compare `maxIter=0` and `maxIter=1`, and validate on the evaluation result. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6572 from mengxr/SPARK-7432 and squashes the following commits:

c236bb8 [Xiangrui Meng] fix flacky cv doctest
---
 python/pyspark/ml/tuning.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 497841b6c8ce6..0bf988fd72f14 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -91,20 +91,19 @@ class CrossValidator(Estimator):
     >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
     >>> from pyspark.mllib.linalg import Vectors
     >>> dataset = sqlContext.createDataFrame(
-    ...     [(Vectors.dense([0.0, 1.0]), 0.0),
-    ...      (Vectors.dense([1.0, 2.0]), 1.0),
-    ...      (Vectors.dense([0.55, 3.0]), 0.0),
-    ...      (Vectors.dense([0.45, 4.0]), 1.0),
-    ...      (Vectors.dense([0.51, 5.0]), 1.0)] * 10,
+    ...     [(Vectors.dense([0.0]), 0.0),
+    ...      (Vectors.dense([0.4]), 1.0),
+    ...      (Vectors.dense([0.5]), 0.0),
+    ...      (Vectors.dense([0.6]), 1.0),
+    ...      (Vectors.dense([1.0]), 1.0)] * 10,
     ...     ["features", "label"])
     >>> lr = LogisticRegression()
-    >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1, 5]).build()
+    >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
     >>> evaluator = BinaryClassificationEvaluator()
     >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
-    >>> # SPARK-7432: The following test is flaky.
-    >>> # cvModel = cv.fit(dataset)
-    >>> # expected = lr.fit(dataset, {lr.maxIter: 5}).transform(dataset)
-    >>> # cvModel.transform(dataset).collect() == expected.collect()
+    >>> cvModel = cv.fit(dataset)
+    >>> evaluator.evaluate(cvModel.transform(dataset))
+    0.8333...
     """
 
     # a placeholder to make it appear in the generated doc

From 1bb5d716c0351cd0b4c11b397fd778f30db39bd9 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Wed, 3 Jun 2015 00:59:50 +0800
Subject: [PATCH 316/525] [SPARK-8037] [SQL] Ignores files whose name starts
 with dot in HadoopFsRelation

Author: Cheng Lian <lian@databricks.com>

Closes #6581 from liancheng/spark-8037 and squashes the following commits:

d08e97b [Cheng Lian] Ignores files whose name starts with dot in HadoopFsRelation
---
 .../spark/sql/sources/PartitioningUtils.scala |  2 +-
 .../apache/spark/sql/sources/interfaces.scala | 11 +++++++----
 .../ParquetPartitionDiscoverySuite.scala      | 19 ++++++++++++++++++-
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
index dafdf0f8b4564..c4c99de5a38dc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
@@ -187,7 +187,7 @@ private[sql] object PartitioningUtils {
       Seq.empty
     } else {
       assert(distinctPartitionsColNames.size == 1, {
-        val list = distinctPartitionsColNames.mkString("\t", "\n", "")
+        val list = distinctPartitionsColNames.mkString("\t", "\n\t", "")
         s"Conflicting partition column names detected:\n$list"
       })
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index b1b997c030a60..c4ffa8de52640 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -379,10 +379,10 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
     var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]
 
     def refresh(): Unit = {
-      // We don't filter files/directories whose name start with "_" or "." here, as specific data
-      // sources may take advantages over them (e.g. Parquet _metadata and _common_metadata files).
-      // But "_temporary" directories are explicitly ignored since failed tasks/jobs may leave
-      // partial/corrupted data files there.
+      // We don't filter files/directories whose name start with "_" except "_temporary" here, as
+      // specific data sources may take advantages over them (e.g. Parquet _metadata and
+      // _common_metadata files). "_temporary" directories are explicitly ignored since failed
+      // tasks/jobs may leave partial/corrupted data files there.
       def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
         if (status.getPath.getName.toLowerCase == "_temporary") {
           Set.empty
@@ -400,6 +400,9 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
         val fs = hdfsPath.getFileSystem(hadoopConf)
         val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
         Try(fs.getFileStatus(qualified)).toOption.toArray.flatMap(listLeafFilesAndDirs(fs, _))
+      }.filterNot { status =>
+        // SPARK-8037: Ignores files like ".DS_Store" and other hidden files/directories
+        status.getPath.getName.startsWith(".")
       }
 
       val files = statuses.filterNot(_.isDir)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index f231589e9674d..3b29979452ad9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -18,10 +18,11 @@ package org.apache.spark.sql.parquet
 
 import java.io.File
 import java.math.BigInteger
-import java.sql.{Timestamp, Date}
+import java.sql.Timestamp
 
 import scala.collection.mutable.ArrayBuffer
 
+import com.google.common.io.Files
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.catalyst.expressions.Literal
@@ -432,4 +433,20 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       checkAnswer(read.load(dir.toString).select(fields: _*), row)
     }
   }
+
+  test("SPARK-8037: Ignores files whose name starts with dot") {
+    withTempPath { dir =>
+      val df = (1 to 3).map(i => (i, i, i, i)).toDF("a", "b", "c", "d")
+
+      df.write
+        .format("parquet")
+        .partitionBy("b", "c", "d")
+        .save(dir.getCanonicalPath)
+
+      Files.touch(new File(s"${dir.getCanonicalPath}/b=1", ".DS_Store"))
+      Files.createParentDirs(new File(s"${dir.getCanonicalPath}/b=1/c=1/.foo/bar"))
+
+      checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df)
+    }
+  }
 }

From 0071bd8d31f13abfe73b9d141a818412d374dce0 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Tue, 2 Jun 2015 11:20:33 -0700
Subject: [PATCH 317/525] [SPARK-8015] [FLUME] Remove Guava dependency from
 flume-sink.

The minimal change would be to disable shading of Guava in the module,
and rely on the transitive dependency from other libraries instead. But
since Guava's use is so localized, I think it's better to just not use
it instead, so I replaced that code and removed all traces of Guava from
the module's build.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6555 from vanzin/SPARK-8015 and squashes the following commits:

c0ceea8 [Marcelo Vanzin] Add comments about dependency management.
c38228d [Marcelo Vanzin] Add guava dep in test scope.
b7a0349 [Marcelo Vanzin] Add libthrift exclusion.
6e0942d [Marcelo Vanzin] Add comment in pom.
2d79260 [Marcelo Vanzin] [SPARK-8015] [flume] Remove Guava dependency from flume-sink.
---
 external/flume-sink/pom.xml                   | 39 +++++++++++++++++++
 .../flume/sink/SparkAvroCallbackHandler.scala |  4 +-
 .../flume/sink/SparkSinkThreadFactory.scala   | 35 +++++++++++++++++
 .../streaming/flume/sink/SparkSinkSuite.scala |  6 +--
 4 files changed, 77 insertions(+), 7 deletions(-)
 create mode 100644 external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSinkThreadFactory.scala

diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index 1f3e619d97a24..71f2b6fe18bd1 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -42,15 +42,46 @@
     <dependency>
       <groupId>org.apache.flume</groupId>
       <artifactId>flume-ng-sdk</artifactId>
+      <exclusions>
+        <!-- Guava is excluded to avoid its use in this module. -->
+        <exclusion>
+          <groupId>com.google.guava</groupId>
+          <artifactId>guava</artifactId>
+        </exclusion>
+        <!--
+          Exclude libthrift since the flume poms seem to confuse sbt, which fails to find the
+          dependency.
+        -->
+        <exclusion>
+          <groupId>org.apache.thrift</groupId>
+          <artifactId>libthrift</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.flume</groupId>
       <artifactId>flume-ng-core</artifactId>
+      <exclusions>
+        <exclusion>
+          <groupId>com.google.guava</groupId>
+          <artifactId>guava</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.thrift</groupId>
+          <artifactId>libthrift</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.scala-lang</groupId>
       <artifactId>scala-library</artifactId>
     </dependency>
+    <dependency>
+      <!-- Add Guava in test scope since flume actually needs it. -->
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <!--
         Netty explicitly added in test as it has been excluded from
@@ -85,6 +116,14 @@
           </execution>
         </executions>
       </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <configuration>
+          <!-- Disable all relocations defined in the parent pom. -->
+          <relocations combine.self="override" />
+        </configuration>
+      </plugin>
     </plugins>
   </build>
 </project>
diff --git a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
index fd01807fc3ac4..dc2a4ab138e18 100644
--- a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
+++ b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
@@ -21,7 +21,6 @@ import java.util.concurrent.atomic.AtomicLong
 
 import scala.collection.mutable
 
-import com.google.common.util.concurrent.ThreadFactoryBuilder
 import org.apache.flume.Channel
 import org.apache.commons.lang3.RandomStringUtils
 
@@ -45,8 +44,7 @@ import org.apache.commons.lang3.RandomStringUtils
 private[flume] class SparkAvroCallbackHandler(val threads: Int, val channel: Channel,
   val transactionTimeout: Int, val backOffInterval: Int) extends SparkFlumeProtocol with Logging {
   val transactionExecutorOpt = Option(Executors.newFixedThreadPool(threads,
-    new ThreadFactoryBuilder().setDaemon(true)
-      .setNameFormat("Spark Sink Processor Thread - %d").build()))
+    new SparkSinkThreadFactory("Spark Sink Processor Thread - %d")))
   // Protected by `sequenceNumberToProcessor`
   private val sequenceNumberToProcessor = mutable.HashMap[CharSequence, TransactionProcessor]()
   // This sink will not persist sequence numbers and reuses them if it gets restarted.
diff --git a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSinkThreadFactory.scala b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSinkThreadFactory.scala
new file mode 100644
index 0000000000000..845fc8debda75
--- /dev/null
+++ b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSinkThreadFactory.scala
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.streaming.flume.sink
+
+import java.util.concurrent.ThreadFactory
+import java.util.concurrent.atomic.AtomicLong
+
+/**
+ * Thread factory that generates daemon threads with a specified name format.
+ */
+private[sink] class SparkSinkThreadFactory(nameFormat: String) extends ThreadFactory {
+
+  private val threadId = new AtomicLong()
+
+  override def newThread(r: Runnable): Thread = {
+    val t = new Thread(r, nameFormat.format(threadId.incrementAndGet()))
+    t.setDaemon(true)
+    t
+  }
+
+}
diff --git a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
index 605b3fe71017f..fa43629d49771 100644
--- a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
+++ b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala
@@ -24,7 +24,6 @@ import scala.collection.JavaConversions._
 import scala.concurrent.{ExecutionContext, Future}
 import scala.util.{Failure, Success}
 
-import com.google.common.util.concurrent.ThreadFactoryBuilder
 import org.apache.avro.ipc.NettyTransceiver
 import org.apache.avro.ipc.specific.SpecificRequestor
 import org.apache.flume.Context
@@ -194,9 +193,8 @@ class SparkSinkSuite extends FunSuite {
     count: Int): Seq[(NettyTransceiver, SparkFlumeProtocol.Callback)] = {
 
     (1 to count).map(_ => {
-      lazy val channelFactoryExecutor =
-        Executors.newCachedThreadPool(new ThreadFactoryBuilder().setDaemon(true).
-          setNameFormat("Flume Receiver Channel Thread - %d").build())
+      lazy val channelFactoryExecutor = Executors.newCachedThreadPool(
+        new SparkSinkThreadFactory("Flume Receiver Channel Thread - %d"))
       lazy val channelFactory =
         new NioClientSocketChannelFactory(channelFactoryExecutor, channelFactoryExecutor)
       val transceiver = new NettyTransceiver(address, channelFactory)

From ad06727fe985ca243ebdaaba55cd7d35a4749d0a Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Tue, 2 Jun 2015 12:38:14 -0700
Subject: [PATCH 318/525] [SPARK-7985] [ML] [MLlib] [Docs] Remove
 "fittingParamMap" references. Updating ML Doc "Estimator, Transformer, and
 Param" examples.

Updating ML Doc's *"Estimator, Transformer, and Param"* example to use `model.extractParamMap` instead of `model.fittingParamMap`, which no longer exists.

mengxr, I believe this addresses (part of) the *update documentation* TODO list item from [PR 5820](https://github.com/apache/spark/pull/5820).

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6514 from dusenberrymw/Fix_ML_Doc_Estimator_Transformer_Param_Example and squashes the following commits:

6366e1f [Mike Dusenberry] Updating instances of model.extractParamMap to model.parent.extractParamMap, since the Params of the parent Estimator could possibly differ from thos of the Model.
d850e0e [Mike Dusenberry] Removing all references to "fittingParamMap" throughout Spark, since it has been removed.
0480304 [Mike Dusenberry] Updating the ML Doc "Estimator, Transformer, and Param" Java example to use model.extractParamMap() instead of model.fittingParamMap(), which no longer exists.
7d34939 [Mike Dusenberry] Updating ML Doc "Estimator, Transformer, and Param" example to use model.extractParamMap instead of model.fittingParamMap, which no longer exists.
---
 docs/ml-guide.md                                          | 8 ++++----
 .../apache/spark/ml/classification/GBTClassifier.scala    | 2 +-
 .../spark/ml/classification/RandomForestClassifier.scala  | 2 +-
 .../org/apache/spark/ml/regression/GBTRegressor.scala     | 2 +-
 .../spark/ml/regression/RandomForestRegressor.scala       | 2 +-
 .../ml/classification/DecisionTreeClassifierSuite.scala   | 2 +-
 .../spark/ml/classification/GBTClassifierSuite.scala      | 2 +-
 .../ml/classification/RandomForestClassifierSuite.scala   | 2 +-
 .../spark/ml/regression/DecisionTreeRegressorSuite.scala  | 2 +-
 .../apache/spark/ml/regression/GBTRegressorSuite.scala    | 2 +-
 .../spark/ml/regression/RandomForestRegressorSuite.scala  | 2 +-
 11 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index c5f50ed7990f1..4eb622d4b95e8 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -207,7 +207,7 @@ val model1 = lr.fit(training.toDF)
 // we can view the parameters it used during fit().
 // This prints the parameter (name: value) pairs, where names are unique IDs for this
 // LogisticRegression instance.
-println("Model 1 was fit using parameters: " + model1.fittingParamMap)
+println("Model 1 was fit using parameters: " + model1.parent.extractParamMap)
 
 // We may alternatively specify parameters using a ParamMap,
 // which supports several methods for specifying parameters.
@@ -222,7 +222,7 @@ val paramMapCombined = paramMap ++ paramMap2
 // Now learn a new model using the paramMapCombined parameters.
 // paramMapCombined overrides all parameters set earlier via lr.set* methods.
 val model2 = lr.fit(training.toDF, paramMapCombined)
-println("Model 2 was fit using parameters: " + model2.fittingParamMap)
+println("Model 2 was fit using parameters: " + model2.parent.extractParamMap)
 
 // Prepare test data.
 val test = sc.parallelize(Seq(
@@ -289,7 +289,7 @@ LogisticRegressionModel model1 = lr.fit(training);
 // we can view the parameters it used during fit().
 // This prints the parameter (name: value) pairs, where names are unique IDs for this
 // LogisticRegression instance.
-System.out.println("Model 1 was fit using parameters: " + model1.fittingParamMap());
+System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());
 
 // We may alternatively specify parameters using a ParamMap.
 ParamMap paramMap = new ParamMap();
@@ -305,7 +305,7 @@ ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
 // Now learn a new model using the paramMapCombined parameters.
 // paramMapCombined overrides all parameters set earlier via lr.set* methods.
 LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
-System.out.println("Model 2 was fit using parameters: " + model2.fittingParamMap());
+System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
 
 // Prepare test documents.
 List<LabeledPoint> localTest = Lists.newArrayList(
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index d8592eb2d947d..62f4b51f770e9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -208,7 +208,7 @@ private[ml] object GBTClassificationModel {
     require(oldModel.algo == OldAlgo.Classification, "Cannot convert GradientBoostedTreesModel" +
       s" with algo=${oldModel.algo} (old API) to GBTClassificationModel (new API).")
     val newTrees = oldModel.trees.map { tree =>
-      // parent, fittingParamMap for each tree is null since there are no good ways to set these.
+      // parent for each tree is null since there is no good way to set this.
       DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures)
     }
     val uid = if (parent != null) parent.uid else Identifiable.randomUID("gbtc")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 67600ebd7b38e..852a67e066322 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -170,7 +170,7 @@ private[ml] object RandomForestClassificationModel {
     require(oldModel.algo == OldAlgo.Classification, "Cannot convert RandomForestModel" +
       s" with algo=${oldModel.algo} (old API) to RandomForestClassificationModel (new API).")
     val newTrees = oldModel.trees.map { tree =>
-      // parent, fittingParamMap for each tree is null since there are no good ways to set these.
+      // parent for each tree is null since there is no good way to set this.
       DecisionTreeClassificationModel.fromOld(tree, null, categoricalFeatures)
     }
     val uid = if (parent != null) parent.uid else Identifiable.randomUID("rfc")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index 69f4f5414c8c6..b7e374bb6cb49 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -198,7 +198,7 @@ private[ml] object GBTRegressionModel {
     require(oldModel.algo == OldAlgo.Regression, "Cannot convert GradientBoostedTreesModel" +
       s" with algo=${oldModel.algo} (old API) to GBTRegressionModel (new API).")
     val newTrees = oldModel.trees.map { tree =>
-      // parent, fittingParamMap for each tree is null since there are no good ways to set these.
+      // parent for each tree is null since there is no good way to set this.
       DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures)
     }
     val uid = if (parent != null) parent.uid else Identifiable.randomUID("gbtr")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index ae767a17329d2..49a1f7ce8c995 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -152,7 +152,7 @@ private[ml] object RandomForestRegressionModel {
     require(oldModel.algo == OldAlgo.Regression, "Cannot convert RandomForestModel" +
       s" with algo=${oldModel.algo} (old API) to RandomForestRegressionModel (new API).")
     val newTrees = oldModel.trees.map { tree =>
-      // parent, fittingParamMap for each tree is null since there are no good ways to set these.
+      // parent for each tree is null since there is no good way to set this.
       DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures)
     }
     new RandomForestRegressionModel(parent.uid, newTrees)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
index 40554f6ef94a8..ae40b0b8ff854 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
@@ -265,7 +265,7 @@ private[ml] object DecisionTreeClassifierSuite extends SparkFunSuite {
     val oldTree = OldDecisionTree.train(data, oldStrategy)
     val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses)
     val newTree = dt.fit(newData)
-    // Use parent, fittingParamMap from newTree since these are not checked anyways.
+    // Use parent from newTree since this is not checked anyways.
     val oldTreeAsNew = DecisionTreeClassificationModel.fromOld(
       oldTree, newTree.parent.asInstanceOf[DecisionTreeClassifier], categoricalFeatures)
     TreeTests.checkEqual(oldTreeAsNew, newTree)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index 09327051621e0..1302da3c373ff 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -127,7 +127,7 @@ private object GBTClassifierSuite {
     val oldModel = oldGBT.run(data)
     val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 2)
     val newModel = gbt.fit(newData)
-    // Use parent, fittingParamMap from newTree since these are not checked anyways.
+    // Use parent from newTree since this is not checked anyways.
     val oldModelAsNew = GBTClassificationModel.fromOld(
       oldModel, newModel.parent.asInstanceOf[GBTClassifier], categoricalFeatures)
     TreeTests.checkEqual(oldModelAsNew, newModel)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
index f699d0c374d2f..eee9355a67be3 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
@@ -157,7 +157,7 @@ private object RandomForestClassifierSuite {
       data, oldStrategy, rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt)
     val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses)
     val newModel = rf.fit(newData)
-    // Use parent, fittingParamMap from newTree since these are not checked anyways.
+    // Use parent from newTree since this is not checked anyways.
     val oldModelAsNew = RandomForestClassificationModel.fromOld(
       oldModel, newModel.parent.asInstanceOf[RandomForestClassifier], categoricalFeatures)
     TreeTests.checkEqual(oldModelAsNew, newModel)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
index 1182b89a8e3aa..33aa9d0d62343 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
@@ -82,7 +82,7 @@ private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite {
     val oldTree = OldDecisionTree.train(data, oldStrategy)
     val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
     val newTree = dt.fit(newData)
-    // Use parent, fittingParamMap from newTree since these are not checked anyways.
+    // Use parent from newTree since this is not checked anyways.
     val oldTreeAsNew = DecisionTreeRegressionModel.fromOld(
       oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures)
     TreeTests.checkEqual(oldTreeAsNew, newTree)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index f8a1469fee313..98fb3d3f5f22c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -128,7 +128,7 @@ private object GBTRegressorSuite {
     val oldModel = oldGBT.run(data)
     val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
     val newModel = gbt.fit(newData)
-    // Use parent, fittingParamMap from newTree since these are not checked anyways.
+    // Use parent from newTree since this is not checked anyways.
     val oldModelAsNew = GBTRegressionModel.fromOld(
       oldModel, newModel.parent.asInstanceOf[GBTRegressor], categoricalFeatures)
     TreeTests.checkEqual(oldModelAsNew, newModel)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
index 78911560945a2..b24ecaa57c89b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
@@ -113,7 +113,7 @@ private object RandomForestRegressorSuite extends SparkFunSuite {
       data, oldStrategy, rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt)
     val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
     val newModel = rf.fit(newData)
-    // Use parent, fittingParamMap from newTree since these are not checked anyways.
+    // Use parent from newTree since this is not checked anyways.
     val oldModelAsNew = RandomForestRegressionModel.fromOld(
       oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures)
     TreeTests.checkEqual(oldModelAsNew, newModel)

From 686a45f0b9c50ede2a80854ed6a155ee8a9a4f5c Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Tue, 2 Jun 2015 13:32:13 -0700
Subject: [PATCH 319/525] [SPARK-8014] [SQL] Avoid premature metadata discovery
 when writing a HadoopFsRelation with a save mode other than Append

The current code references the schema of the DataFrame to be written before checking save mode. This triggers expensive metadata discovery prematurely. For save mode other than `Append`, this metadata discovery is useless since we either ignore the result (for `Ignore` and `ErrorIfExists`) or delete existing files (for `Overwrite`) later.

This PR fixes this issue by deferring metadata discovery after save mode checking.

Author: Cheng Lian <lian@databricks.com>

Closes #6583 from liancheng/spark-8014 and squashes the following commits:

1aafabd [Cheng Lian] Updates comments
088abaa [Cheng Lian] Avoids schema merging and partition discovery when data schema and partition schema are defined
8fbd93f [Cheng Lian] Fixes SPARK-8014
---
 .../apache/spark/sql/parquet/newParquet.scala |  2 +-
 .../apache/spark/sql/sources/commands.scala   | 20 +++++--
 .../org/apache/spark/sql/sources/ddl.scala    | 16 ++---
 .../apache/spark/sql/sources/interfaces.scala |  2 +-
 .../sql/sources/hadoopFsRelationSuites.scala  | 59 ++++++++++++++-----
 5 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index e439a18ac43aa..824ae36968c32 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -190,7 +190,7 @@ private[sql] class ParquetRelation2(
     }
   }
 
-  override def dataSchema: StructType = metadataCache.dataSchema
+  override def dataSchema: StructType = maybeDataSchema.getOrElse(metadataCache.dataSchema)
 
   override private[sql] def refresh(): Unit = {
     super.refresh()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index 3132067d562f6..71f016b1f14de 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -30,9 +30,10 @@ import org.apache.spark._
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext, SaveMode}
@@ -94,10 +95,19 @@ private[sql] case class InsertIntoHadoopFsRelation(
 
       // We create a DataFrame by applying the schema of relation to the data to make sure.
       // We are writing data based on the expected schema,
-      val df = sqlContext.createDataFrame(
-        DataFrame(sqlContext, query).queryExecution.toRdd,
-        relation.schema,
-        needsConversion = false)
+      val df = {
+        // For partitioned relation r, r.schema's column ordering can be different from the column
+        // ordering of data.logicalPlan (partition columns are all moved after data column). We
+        // need a Project to adjust the ordering, so that inside InsertIntoHadoopFsRelation, we can
+        // safely apply the schema of r.schema to the data.
+        val project = Project(
+          relation.schema.map(field => new UnresolvedAttribute(Seq(field.name))), query)
+
+        sqlContext.createDataFrame(
+          DataFrame(sqlContext, project).queryExecution.toRdd,
+          relation.schema,
+          needsConversion = false)
+      }
 
       val partitionColumns = relation.partitionColumns.fieldNames
       if (partitionColumns.isEmpty) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index 22587f5a1c6f1..20afd60cb7767 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -25,7 +25,7 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
-import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation}
+import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Row}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.RunnableCommand
@@ -322,19 +322,13 @@ private[sql] object ResolvedDataSource {
           Some(partitionColumnsSchema(data.schema, partitionColumns)),
           caseInsensitiveOptions)
 
-        // For partitioned relation r, r.schema's column ordering is different with the column
-        // ordering of data.logicalPlan. We need a Project to adjust the ordering.
-        // So, inside InsertIntoHadoopFsRelation, we can safely apply the schema of r.schema to
-        // the data.
-        val project =
-          Project(
-            r.schema.map(field => new UnresolvedAttribute(Seq(field.name))),
-            data.logicalPlan)
-
+        // For partitioned relation r, r.schema's column ordering can be different from the column
+        // ordering of data.logicalPlan (partition columns are all moved after data column).  This
+        // will be adjusted within InsertIntoHadoopFsRelation.
         sqlContext.executePlan(
           InsertIntoHadoopFsRelation(
             r,
-            project,
+            data.logicalPlan,
             mode)).toRdd
         r
       case _ =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index c4ffa8de52640..f5bd2d2941ca0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -503,7 +503,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
    */
   override lazy val schema: StructType = {
     val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet
-    StructType(dataSchema ++ partitionSpec.partitionColumns.filterNot { column =>
+    StructType(dataSchema ++ partitionColumns.filterNot { column =>
       dataSchemaColumnNames.contains(column.name.toLowerCase)
     })
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index af36fa6f1faae..74095426741e3 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark.sql.sources
 
+import java.io.File
+
+import com.google.common.io.Files
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.{SparkException, SparkFunSuite}
@@ -453,6 +456,20 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       }
     }
   }
+
+  test("SPARK-7616: adjust column name order accordingly when saving partitioned table") {
+    val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c")
+
+    df.write
+      .format(dataSourceName)
+      .mode(SaveMode.Overwrite)
+      .partitionBy("c", "a")
+      .saveAsTable("t")
+
+    withTable("t") {
+      checkAnswer(table("t"), df.select('b, 'c, 'a).collect())
+    }
+  }
 }
 
 class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
@@ -534,20 +551,6 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
     }
   }
 
-  test("SPARK-7616: adjust column name order accordingly when saving partitioned table") {
-    val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c")
-
-    df.write
-      .format("parquet")
-      .mode(SaveMode.Overwrite)
-      .partitionBy("c", "a")
-      .saveAsTable("t")
-
-    withTable("t") {
-      checkAnswer(table("t"), df.select('b, 'c, 'a).collect())
-    }
-  }
-
   test("SPARK-7868: _temporary directories should be ignored") {
     withTempPath { dir =>
       val df = Seq("a", "b", "c").zipWithIndex.toDF()
@@ -563,4 +566,32 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
       checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df.collect())
     }
   }
+
+  test("SPARK-8014: Avoid scanning output directory when SaveMode isn't SaveMode.Append") {
+    withTempDir { dir =>
+      val path = dir.getCanonicalPath
+      val df = Seq(1 -> "a").toDF()
+
+      // Creates an arbitrary file.  If this directory gets scanned, ParquetRelation2 will throw
+      // since it's not a valid Parquet file.
+      val emptyFile = new File(path, "empty")
+      Files.createParentDirs(emptyFile)
+      Files.touch(emptyFile)
+
+      // This shouldn't throw anything.
+      df.write.format("parquet").mode(SaveMode.Ignore).save(path)
+
+      // This should only complain that the destination directory already exists, rather than file
+      // "empty" is not a Parquet file.
+      assert {
+        intercept[RuntimeException] {
+          df.write.format("parquet").mode(SaveMode.ErrorIfExists).save(path)
+        }.getMessage.contains("already exists")
+      }
+
+      // This shouldn't throw anything.
+      df.write.format("parquet").mode(SaveMode.Overwrite).save(path)
+      checkAnswer(read.format("parquet").load(path), df)
+    }
+  }
 }

From 605ddbb27c8482fc0107b21c19d4e4ae19348f35 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Tue, 2 Jun 2015 13:38:06 -0700
Subject: [PATCH 320/525] [SPARK-8038] [SQL] [PYSPARK] fix Column.when() and
 otherwise()

Thanks ogirardot, closes #6580

cc rxin JoshRosen

Author: Davies Liu <davies@databricks.com>

Closes #6590 from davies/when and squashes the following commits:

c0f2069 [Davies Liu] fix Column.when() and otherwise()
---
 python/pyspark/sql/column.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 8dc5039f587f0..1ecec5b126505 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -315,6 +315,14 @@ def between(self, lowerBound, upperBound):
         """
         A boolean expression that is evaluated to true if the value of this
         expression is between the given columns.
+
+        >>> df.select(df.name, df.age.between(2, 4)).show()
+        +-----+--------------------------+
+        | name|((age >= 2) && (age <= 4))|
+        +-----+--------------------------+
+        |Alice|                      true|
+        |  Bob|                     false|
+        +-----+--------------------------+
         """
         return (self >= lowerBound) & (self <= upperBound)
 
@@ -328,12 +336,20 @@ def when(self, condition, value):
 
         :param condition: a boolean :class:`Column` expression.
         :param value: a literal value, or a :class:`Column` expression.
+
+        >>> from pyspark.sql import functions as F
+        >>> df.select(df.name, F.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show()
+        +-----+--------------------------------------------------------+
+        | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0|
+        +-----+--------------------------------------------------------+
+        |Alice|                                                      -1|
+        |  Bob|                                                       1|
+        +-----+--------------------------------------------------------+
         """
-        sc = SparkContext._active_spark_context
         if not isinstance(condition, Column):
             raise TypeError("condition should be a Column")
         v = value._jc if isinstance(value, Column) else value
-        jc = sc._jvm.functions.when(condition._jc, v)
+        jc = self._jc.when(condition._jc, v)
         return Column(jc)
 
     @since(1.4)
@@ -345,9 +361,18 @@ def otherwise(self, value):
         See :func:`pyspark.sql.functions.when` for example usage.
 
         :param value: a literal value, or a :class:`Column` expression.
+
+        >>> from pyspark.sql import functions as F
+        >>> df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show()
+        +-----+---------------------------------+
+        | name|CASE WHEN (age > 3) THEN 1 ELSE 0|
+        +-----+---------------------------------+
+        |Alice|                                0|
+        |  Bob|                                1|
+        +-----+---------------------------------+
         """
         v = value._jc if isinstance(value, Column) else value
-        jc = self._jc.otherwise(value)
+        jc = self._jc.otherwise(v)
         return Column(jc)
 
     @since(1.4)

From 89f21f66b5549524d1a6e4fb576a4f80d9fef903 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Tue, 2 Jun 2015 16:51:17 -0700
Subject: [PATCH 321/525] [SPARK-8049] [MLLIB] drop tmp col from OneVsRest
 output

The temporary column should be dropped after we get the prediction column. harsha2010

Author: Xiangrui Meng <meng@databricks.com>

Closes #6592 from mengxr/SPARK-8049 and squashes the following commits:

1d89107 [Xiangrui Meng] use SparkFunSuite
6ee70de [Xiangrui Meng] drop tmp col from OneVsRest output
---
 .../org/apache/spark/ml/classification/OneVsRest.scala   | 1 +
 .../apache/spark/ml/classification/OneVsRestSuite.scala  | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index 7b726da388075..825f9ed1b54b2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -131,6 +131,7 @@ final class OneVsRestModel private[ml] (
     // output label and label metadata as prediction
     val labelUdf = callUDF(label, DoubleType, col(accColName))
     aggregatedDataset.withColumn($(predictionCol), labelUdf.as($(predictionCol), labelMetadata))
+      .drop(accColName)
   }
 }
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index f439f3261f06f..1d04ccb509057 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -93,6 +93,15 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext {
     val datasetWithLabelMetadata = dataset.select(labelWithMetadata, features)
     ova.fit(datasetWithLabelMetadata)
   }
+
+  test("SPARK-8049: OneVsRest shouldn't output temp columns") {
+    val logReg = new LogisticRegression()
+      .setMaxIter(1)
+    val ovr = new OneVsRest()
+      .setClassifier(logReg)
+    val output = ovr.fit(dataset).transform(dataset)
+    assert(output.schema.fieldNames.toSet === Set("label", "features", "prediction"))
+  }
 }
 
 private class MockLogisticRegression(uid: String) extends LogisticRegression(uid) {

From 5cd6a63d9692d153751747e0293dc030d73a6194 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Tue, 2 Jun 2015 17:07:13 -0700
Subject: [PATCH 322/525] [SQL] [TEST] [MINOR] Follow-up of PR #6493, use Guava
 API to ensure Java 6 friendliness

This is a follow-up of PR #6493, which has been reverted in branch-1.4 because it uses Java 7 specific APIs and breaks Java 6 build. This PR replaces those APIs with equivalent Guava ones to ensure Java 6 friendliness.

cc andrewor14 pwendell, this should also be back ported to branch-1.4.

Author: Cheng Lian <lian@databricks.com>

Closes #6547 from liancheng/override-log4j and squashes the following commits:

c900cfd [Cheng Lian] Addresses Shixiong's comment
72da795 [Cheng Lian] Uses Guava API to ensure Java 6 friendliness
---
 .../sql/hive/thriftserver/HiveThriftServer2Suites.scala  | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
index da511ebd05ad2..a93a3dee43511 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
@@ -19,8 +19,6 @@ package org.apache.spark.sql.hive.thriftserver
 
 import java.io.File
 import java.net.URL
-import java.nio.charset.StandardCharsets
-import java.nio.file.{Files, Paths}
 import java.sql.{Date, DriverManager, Statement}
 
 import scala.collection.mutable.ArrayBuffer
@@ -29,6 +27,8 @@ import scala.concurrent.{Await, Promise}
 import scala.sys.process.{Process, ProcessLogger}
 import scala.util.{Random, Try}
 
+import com.google.common.base.Charsets.UTF_8
+import com.google.common.io.Files
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hive.jdbc.HiveDriver
 import org.apache.hive.service.auth.PlainSaslHelper
@@ -441,13 +441,14 @@ abstract class HiveThriftServer2Test extends SparkFunSuite with BeforeAndAfterAl
       val tempLog4jConf = Utils.createTempDir().getCanonicalPath
 
       Files.write(
-        Paths.get(s"$tempLog4jConf/log4j.properties"),
         """log4j.rootCategory=INFO, console
           |log4j.appender.console=org.apache.log4j.ConsoleAppender
           |log4j.appender.console.target=System.err
           |log4j.appender.console.layout=org.apache.log4j.PatternLayout
           |log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
-        """.stripMargin.getBytes(StandardCharsets.UTF_8))
+        """.stripMargin,
+        new File(s"$tempLog4jConf/log4j.properties"),
+        UTF_8)
 
       tempLog4jConf + File.pathSeparator + sys.props("java.class.path")
     }

From c3f4c3257194ba34ccd298d13ea1edcfc75f7552 Mon Sep 17 00:00:00 2001
From: Ram Sriharsha <rsriharsha@hw11853.local>
Date: Tue, 2 Jun 2015 18:53:04 -0700
Subject: [PATCH 323/525] [SPARK-7387] [ML] [DOC] CrossValidator example code
 in Python

Author: Ram Sriharsha <rsriharsha@hw11853.local>

Closes #6358 from harsha2010/SPARK-7387 and squashes the following commits:

63efda2 [Ram Sriharsha] more examples for classifier to distinguish mapreduce from spark properly
aeb6bb6 [Ram Sriharsha] Python Style Fix
54a500c [Ram Sriharsha] Merge branch 'master' into SPARK-7387
615e91c [Ram Sriharsha] cleanup
204c4e3 [Ram Sriharsha] Merge branch 'master' into SPARK-7387
7246d35 [Ram Sriharsha] [SPARK-7387][ml][doc] CrossValidator example code in Python
---
 .../src/main/python/ml/cross_validator.py     | 96 +++++++++++++++++++
 .../main/python/ml/simple_params_example.py   |  4 +-
 2 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 examples/src/main/python/ml/cross_validator.py

diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py
new file mode 100644
index 0000000000000..f0ca97c724940
--- /dev/null
+++ b/examples/src/main/python/ml/cross_validator.py
@@ -0,0 +1,96 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.ml import Pipeline
+from pyspark.ml.classification import LogisticRegression
+from pyspark.ml.evaluation import BinaryClassificationEvaluator
+from pyspark.ml.feature import HashingTF, Tokenizer
+from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
+from pyspark.sql import Row, SQLContext
+
+"""
+A simple example demonstrating model selection using CrossValidator.
+This example also demonstrates how Pipelines are Estimators.
+Run with:
+
+  bin/spark-submit examples/src/main/python/ml/cross_validator.py
+"""
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="CrossValidatorExample")
+    sqlContext = SQLContext(sc)
+
+    # Prepare training documents, which are labeled.
+    LabeledDocument = Row("id", "text", "label")
+    training = sc.parallelize([(0, "a b c d e spark", 1.0),
+                               (1, "b d", 0.0),
+                               (2, "spark f g h", 1.0),
+                               (3, "hadoop mapreduce", 0.0),
+                               (4, "b spark who", 1.0),
+                               (5, "g d a y", 0.0),
+                               (6, "spark fly", 1.0),
+                               (7, "was mapreduce", 0.0),
+                               (8, "e spark program", 1.0),
+                               (9, "a e c l", 0.0),
+                               (10, "spark compile", 1.0),
+                               (11, "hadoop software", 0.0)
+                               ]) \
+        .map(lambda x: LabeledDocument(*x)).toDF()
+
+    # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
+    tokenizer = Tokenizer(inputCol="text", outputCol="words")
+    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
+    lr = LogisticRegression(maxIter=10)
+    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
+
+    # We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
+    # This will allow us to jointly choose parameters for all Pipeline stages.
+    # A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
+    # We use a ParamGridBuilder to construct a grid of parameters to search over.
+    # With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
+    # this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
+    paramGrid = ParamGridBuilder() \
+        .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
+        .addGrid(lr.regParam, [0.1, 0.01]) \
+        .build()
+
+    crossval = CrossValidator(estimator=pipeline,
+                              estimatorParamMaps=paramGrid,
+                              evaluator=BinaryClassificationEvaluator(),
+                              numFolds=2)  # use 3+ folds in practice
+
+    # Run cross-validation, and choose the best set of parameters.
+    cvModel = crossval.fit(training)
+
+    # Prepare test documents, which are unlabeled.
+    Document = Row("id", "text")
+    test = sc.parallelize([(4L, "spark i j k"),
+                           (5L, "l m n"),
+                           (6L, "mapreduce spark"),
+                           (7L, "apache hadoop")]) \
+        .map(lambda x: Document(*x)).toDF()
+
+    # Make predictions on test documents. cvModel uses the best model found (lrModel).
+    prediction = cvModel.transform(test)
+    selected = prediction.select("id", "text", "probability", "prediction")
+    for row in selected.collect():
+        print(row)
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py
index 3933d59b52cd1..a9f29dab2d602 100644
--- a/examples/src/main/python/ml/simple_params_example.py
+++ b/examples/src/main/python/ml/simple_params_example.py
@@ -41,8 +41,8 @@
 
     # prepare training data.
     # We create an RDD of LabeledPoints and convert them into a DataFrame.
-    # Spark DataFrames can automatically infer the schema from named tuples
-    # and LabeledPoint implements __reduce__ to behave like a named tuple.
+    # A LabeledPoint is an Object with two fields named label and features
+    # and Spark SQL identifies these fields and creates the schema appropriately.
     training = sc.parallelize([
         LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
         LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),

From a86b3e9b9b75f5af4fdbba22e87769058f023204 Mon Sep 17 00:00:00 2001
From: DB Tsai <dbt@netflix.com>
Date: Tue, 2 Jun 2015 19:12:08 -0700
Subject: [PATCH 324/525] [SPARK-7547] [ML] Scala Example code for ElasticNet

This is scala example code for both linear and logistic regression. Python and Java versions are to be added.

Author: DB Tsai <dbt@netflix.com>

Closes #6576 from dbtsai/elasticNetExample and squashes the following commits:

e7ca406 [DB Tsai] fix test
6bb6d77 [DB Tsai] fix suite and remove duplicated setMaxIter
136e0dd [DB Tsai] address feedback
1ec29d4 [DB Tsai] fix style
9462f5f [DB Tsai] add example
---
 .../examples/ml/LinearRegressionExample.scala | 142 ++++++++++++++++
 .../ml/LogisticRegressionExample.scala        | 159 ++++++++++++++++++
 .../classification/LogisticRegression.scala   |   8 +-
 .../ml/param/shared/SharedParamsCodeGen.scala |   2 +-
 .../spark/ml/param/shared/sharedParams.scala  |   4 +-
 .../ml/regression/LinearRegression.scala      |   2 +-
 .../apache/spark/ml/param/ParamsSuite.scala   |   6 +-
 7 files changed, 314 insertions(+), 9 deletions(-)
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
new file mode 100644
index 0000000000000..b54466fd48bc5
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import scala.collection.mutable
+import scala.language.reflectiveCalls
+
+import scopt.OptionParser
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.examples.mllib.AbstractParams
+import org.apache.spark.ml.{Pipeline, PipelineStage}
+import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
+import org.apache.spark.sql.DataFrame
+
+/**
+ * An example runner for linear regression with elastic-net (mixing L1/L2) regularization.
+ * Run with
+ * {{{
+ * bin/run-example ml.LinearRegressionExample [options]
+ * }}}
+ * A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt` which can be
+ * trained by
+ * {{{
+ * bin/run-example ml.LinearRegressionExample --regParam 0.15 --elasticNetParam 1.0 \
+ *   data/mllib/sample_linear_regression_data.txt
+ * }}}
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ */
+object LinearRegressionExample {
+
+  case class Params(
+      input: String = null,
+      testInput: String = "",
+      dataFormat: String = "libsvm",
+      regParam: Double = 0.0,
+      elasticNetParam: Double = 0.0,
+      maxIter: Int = 100,
+      tol: Double = 1E-6,
+      fracTest: Double = 0.2) extends AbstractParams[Params]
+
+  def main(args: Array[String]) {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("LinearRegressionExample") {
+      head("LinearRegressionExample: an example Linear Regression with Elastic-Net app.")
+      opt[Double]("regParam")
+        .text(s"regularization parameter, default: ${defaultParams.regParam}")
+        .action((x, c) => c.copy(regParam = x))
+      opt[Double]("elasticNetParam")
+        .text(s"ElasticNet mixing parameter. For alpha = 0, the penalty is an L2 penalty. " +
+        s"For alpha = 1, it is an L1 penalty. For 0 < alpha < 1, the penalty is a combination of " +
+        s"L1 and L2, default: ${defaultParams.elasticNetParam}")
+        .action((x, c) => c.copy(elasticNetParam = x))
+      opt[Int]("maxIter")
+        .text(s"maximum number of iterations, default: ${defaultParams.maxIter}")
+        .action((x, c) => c.copy(maxIter = x))
+      opt[Double]("tol")
+        .text(s"the convergence tolerance of iterations, Smaller value will lead " +
+        s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
+        .action((x, c) => c.copy(tol = x))
+      opt[Double]("fracTest")
+        .text(s"fraction of data to hold out for testing.  If given option testInput, " +
+        s"this option is ignored. default: ${defaultParams.fracTest}")
+        .action((x, c) => c.copy(fracTest = x))
+      opt[String]("testInput")
+        .text(s"input path to test dataset.  If given, option fracTest is ignored." +
+        s" default: ${defaultParams.testInput}")
+        .action((x, c) => c.copy(testInput = x))
+      opt[String]("dataFormat")
+        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
+        .action((x, c) => c.copy(dataFormat = x))
+      arg[String]("<input>")
+        .text("input path to labeled examples")
+        .required()
+        .action((x, c) => c.copy(input = x))
+      checkConfig { params =>
+        if (params.fracTest < 0 || params.fracTest >= 1) {
+          failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1).")
+        } else {
+          success
+        }
+      }
+    }
+
+    parser.parse(args, defaultParams).map { params =>
+      run(params)
+    }.getOrElse {
+      sys.exit(1)
+    }
+  }
+
+  def run(params: Params) {
+    val conf = new SparkConf().setAppName(s"LinearRegressionExample with $params")
+    val sc = new SparkContext(conf)
+
+    println(s"LinearRegressionExample with parameters:\n$params")
+
+    // Load training and test data and cache it.
+    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+      params.dataFormat, params.testInput, "regression", params.fracTest)
+
+    val lir = new LinearRegression()
+      .setFeaturesCol("features")
+      .setLabelCol("label")
+      .setRegParam(params.regParam)
+      .setElasticNetParam(params.elasticNetParam)
+      .setMaxIter(params.maxIter)
+      .setTol(params.tol)
+
+    // Train the model
+    val startTime = System.nanoTime()
+    val lirModel = lir.fit(training)
+    val elapsedTime = (System.nanoTime() - startTime) / 1e9
+    println(s"Training time: $elapsedTime seconds")
+
+    // Print the weights and intercept for linear regression.
+    println(s"Weights: ${lirModel.weights} Intercept: ${lirModel.intercept}")
+
+    println("Training data results:")
+    DecisionTreeExample.evaluateRegressionModel(lirModel, training, "label")
+    println("Test data results:")
+    DecisionTreeExample.evaluateRegressionModel(lirModel, test, "label")
+
+    sc.stop()
+  }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
new file mode 100644
index 0000000000000..b12f833ce94c8
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import scala.collection.mutable
+import scala.language.reflectiveCalls
+
+import scopt.OptionParser
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.examples.mllib.AbstractParams
+import org.apache.spark.ml.{Pipeline, PipelineStage}
+import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
+import org.apache.spark.ml.feature.StringIndexer
+import org.apache.spark.sql.DataFrame
+
+/**
+ * An example runner for logistic regression with elastic-net (mixing L1/L2) regularization.
+ * Run with
+ * {{{
+ * bin/run-example ml.LogisticRegressionExample [options]
+ * }}}
+ * A synthetic dataset can be found at `data/mllib/sample_libsvm_data.txt` which can be
+ * trained by
+ * {{{
+ * bin/run-example ml.LogisticRegressionExample --regParam 0.3 --elasticNetParam 0.8 \
+ *   data/mllib/sample_libsvm_data.txt
+ * }}}
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ */
+object LogisticRegressionExample {
+
+  case class Params(
+      input: String = null,
+      testInput: String = "",
+      dataFormat: String = "libsvm",
+      regParam: Double = 0.0,
+      elasticNetParam: Double = 0.0,
+      maxIter: Int = 100,
+      fitIntercept: Boolean = true,
+      tol: Double = 1E-6,
+      fracTest: Double = 0.2) extends AbstractParams[Params]
+
+  def main(args: Array[String]) {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("LogisticRegressionExample") {
+      head("LogisticRegressionExample: an example Logistic Regression with Elastic-Net app.")
+      opt[Double]("regParam")
+        .text(s"regularization parameter, default: ${defaultParams.regParam}")
+        .action((x, c) => c.copy(regParam = x))
+      opt[Double]("elasticNetParam")
+        .text(s"ElasticNet mixing parameter. For alpha = 0, the penalty is an L2 penalty. " +
+        s"For alpha = 1, it is an L1 penalty. For 0 < alpha < 1, the penalty is a combination of " +
+        s"L1 and L2, default: ${defaultParams.elasticNetParam}")
+        .action((x, c) => c.copy(elasticNetParam = x))
+      opt[Int]("maxIter")
+        .text(s"maximum number of iterations, default: ${defaultParams.maxIter}")
+        .action((x, c) => c.copy(maxIter = x))
+      opt[Boolean]("fitIntercept")
+        .text(s"whether to fit an intercept term, default: ${defaultParams.fitIntercept}")
+        .action((x, c) => c.copy(fitIntercept = x))
+      opt[Double]("tol")
+        .text(s"the convergence tolerance of iterations, Smaller value will lead " +
+        s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
+        .action((x, c) => c.copy(tol = x))
+      opt[Double]("fracTest")
+        .text(s"fraction of data to hold out for testing.  If given option testInput, " +
+        s"this option is ignored. default: ${defaultParams.fracTest}")
+        .action((x, c) => c.copy(fracTest = x))
+      opt[String]("testInput")
+        .text(s"input path to test dataset.  If given, option fracTest is ignored." +
+        s" default: ${defaultParams.testInput}")
+        .action((x, c) => c.copy(testInput = x))
+      opt[String]("dataFormat")
+        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
+        .action((x, c) => c.copy(dataFormat = x))
+      arg[String]("<input>")
+        .text("input path to labeled examples")
+        .required()
+        .action((x, c) => c.copy(input = x))
+      checkConfig { params =>
+        if (params.fracTest < 0 || params.fracTest >= 1) {
+          failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1).")
+        } else {
+          success
+        }
+      }
+    }
+
+    parser.parse(args, defaultParams).map { params =>
+      run(params)
+    }.getOrElse {
+      sys.exit(1)
+    }
+  }
+
+  def run(params: Params) {
+    val conf = new SparkConf().setAppName(s"LogisticRegressionExample with $params")
+    val sc = new SparkContext(conf)
+
+    println(s"LogisticRegressionExample with parameters:\n$params")
+
+    // Load training and test data and cache it.
+    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+      params.dataFormat, params.testInput, "classification", params.fracTest)
+
+    // Set up Pipeline
+    val stages = new mutable.ArrayBuffer[PipelineStage]()
+
+    val labelIndexer = new StringIndexer()
+      .setInputCol("labelString")
+      .setOutputCol("indexedLabel")
+    stages += labelIndexer
+
+    val lor = new LogisticRegression()
+      .setFeaturesCol("features")
+      .setLabelCol("indexedLabel")
+      .setRegParam(params.regParam)
+      .setElasticNetParam(params.elasticNetParam)
+      .setMaxIter(params.maxIter)
+      .setTol(params.tol)
+
+    stages += lor
+    val pipeline = new Pipeline().setStages(stages.toArray)
+
+    // Fit the Pipeline
+    val startTime = System.nanoTime()
+    val pipelineModel = pipeline.fit(training)
+    val elapsedTime = (System.nanoTime() - startTime) / 1e9
+    println(s"Training time: $elapsedTime seconds")
+
+    val lirModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
+    // Print the weights and intercept for logistic regression.
+    println(s"Weights: ${lirModel.weights} Intercept: ${lirModel.intercept}")
+
+    println("Training data results:")
+    DecisionTreeExample.evaluateClassificationModel(pipelineModel, training, "indexedLabel")
+    println("Test data results:")
+    DecisionTreeExample.evaluateClassificationModel(pipelineModel, test, "indexedLabel")
+
+    sc.stop()
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index d13109d9da4c0..f136bcee9cf2b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -74,7 +74,7 @@ class LogisticRegression(override val uid: String)
   setDefault(elasticNetParam -> 0.0)
 
   /**
-   * Set the maximal number of iterations.
+   * Set the maximum number of iterations.
    * Default is 100.
    * @group setParam
    */
@@ -90,7 +90,11 @@ class LogisticRegression(override val uid: String)
   def setTol(value: Double): this.type = set(tol, value)
   setDefault(tol -> 1E-6)
 
-  /** @group setParam */
+  /**
+   * Whether to fit an intercept term.
+   * Default is true.
+   * @group setParam
+   * */
   def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
   setDefault(fitIntercept -> true)
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index 1ffb5eddc36bd..8ffbcf0d8bc71 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -33,7 +33,7 @@ private[shared] object SharedParamsCodeGen {
     val params = Seq(
       ParamDesc[Double]("regParam", "regularization parameter (>= 0)",
         isValid = "ParamValidators.gtEq(0)"),
-      ParamDesc[Int]("maxIter", "max number of iterations (>= 0)",
+      ParamDesc[Int]("maxIter", "maximum number of iterations (>= 0)",
         isValid = "ParamValidators.gtEq(0)"),
       ParamDesc[String]("featuresCol", "features column name", Some("\"features\"")),
       ParamDesc[String]("labelCol", "label column name", Some("\"label\"")),
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index ed08417bd4df8..a0c8ccdac9ad9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -45,10 +45,10 @@ private[ml] trait HasRegParam extends Params {
 private[ml] trait HasMaxIter extends Params {
 
   /**
-   * Param for max number of iterations (>= 0).
+   * Param for maximum number of iterations (>= 0).
    * @group param
    */
-  final val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations (>= 0)", ParamValidators.gtEq(0))
+  final val maxIter: IntParam = new IntParam(this, "maxIter", "maximum number of iterations (>= 0)", ParamValidators.gtEq(0))
 
   /** @group getParam */
   final def getMaxIter: Int = $(maxIter)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index fe2a71a331694..70cd8e9e87fae 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -83,7 +83,7 @@ class LinearRegression(override val uid: String)
   setDefault(elasticNetParam -> 0.0)
 
   /**
-   * Set the maximal number of iterations.
+   * Set the maximum number of iterations.
    * Default is 100.
    * @group setParam
    */
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index f80e7749098a5..96094d7a099aa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -27,7 +27,7 @@ class ParamsSuite extends SparkFunSuite {
     import solver.{maxIter, inputCol}
 
     assert(maxIter.name === "maxIter")
-    assert(maxIter.doc === "max number of iterations (>= 0)")
+    assert(maxIter.doc === "maximum number of iterations (>= 0)")
     assert(maxIter.parent === uid)
     assert(maxIter.toString === s"${uid}__maxIter")
     assert(!maxIter.isValid(-1))
@@ -36,7 +36,7 @@ class ParamsSuite extends SparkFunSuite {
 
     solver.setMaxIter(5)
     assert(solver.explainParam(maxIter) ===
-      "maxIter: max number of iterations (>= 0) (default: 10, current: 5)")
+      "maxIter: maximum number of iterations (>= 0) (default: 10, current: 5)")
 
     assert(inputCol.toString === s"${uid}__inputCol")
 
@@ -120,7 +120,7 @@ class ParamsSuite extends SparkFunSuite {
     intercept[NoSuchElementException](solver.getInputCol)
 
     assert(solver.explainParam(maxIter) ===
-      "maxIter: max number of iterations (>= 0) (default: 10, current: 100)")
+      "maxIter: maximum number of iterations (>= 0) (default: 10, current: 100)")
     assert(solver.explainParams() ===
       Seq(inputCol, maxIter).map(solver.explainParam).mkString("\n"))
 

From cafd5056e12a15f0ebf8015d52dfab999c4443b8 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 2 Jun 2015 22:11:03 -0700
Subject: [PATCH 325/525] [SPARK-7691] [SQL] Refactor CatalystTypeConverter to
 use type-specific row accessors

This patch significantly refactors CatalystTypeConverters to both clean up the code and enable these conversions to work with future Project Tungsten features.

At a high level, I've reorganized the code so that all functions dealing with the same type are grouped together into type-specific subclasses of `CatalystTypeConveter`.  In addition, I've added new methods that allow the Catalyst Row -> Scala Row conversions to access the Catalyst row's fields through type-specific `getTYPE()` methods rather than the generic `get()` / `Row.apply` methods.  This refactoring is a blocker to being able to unit test new operators that I'm developing as part of Project Tungsten, since those operators may output `UnsafeRow` instances which don't support the generic `get()`.

The stricter type usage of types here has uncovered some bugs in other parts of Spark SQL:

- #6217: DescribeCommand is assigned wrong output attributes in SparkStrategies
- #6218: DataFrame.describe() should cast all aggregates to String
- #6400: Use output schema, not relation schema, for data source input conversion

Spark SQL current has undefined behavior for what happens when you try to create a DataFrame from user-specified rows whose values don't match the declared schema.  According to the `createDataFrame()` Scaladoc:

>  It is important to make sure that the structure of every [[Row]] of the provided RDD matches the provided schema. Otherwise, there will be runtime exception.

Given this, it sounds like it's technically not a break of our API contract to fail-fast when the data types don't match. However, there appear to be many cases where we don't fail even though the types don't match. For example, `JavaHashingTFSuite.hasingTF` passes a column of integers values for a "label" column which is supposed to contain floats.  This column isn't actually read or modified as part of query processing, so its actual concrete type doesn't seem to matter. In other cases, there could be situations where we have generic numeric aggregates that tolerate being called with different numeric types than the schema specified, but this can be okay due to numeric conversions.

In the long run, we will probably want to come up with precise semantics for implicit type conversions / widening when converting Java / Scala rows to Catalyst rows.  Until then, though, I think that failing fast with a ClassCastException is a reasonable behavior; this is the approach taken in this patch.  Note that certain optimizations in the inbound conversion functions for primitive types mean that we'll probably preserve the old undefined behavior in a majority of cases.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6222 from JoshRosen/catalyst-converters-refactoring and squashes the following commits:

740341b [Josh Rosen] Optimize method dispatch for primitive type conversions
befc613 [Josh Rosen] Add tests to document Option-handling behavior.
5989593 [Josh Rosen] Use new SparkFunSuite base in CatalystTypeConvertersSuite
6edf7f8 [Josh Rosen] Re-add convertToScala(), since a Hive test still needs it
3f7b2d8 [Josh Rosen] Initialize converters lazily so that the attributes are resolved first
6ad0ebb [Josh Rosen] Fix JavaHashingTFSuite ClassCastException
677ff27 [Josh Rosen] Fix null handling bug; add tests.
8033d4c [Josh Rosen] Fix serialization error in UserDefinedGenerator.
85bba9d [Josh Rosen] Fix wrong input data in InMemoryColumnarQuerySuite
9c0e4e1 [Josh Rosen] Remove last use of convertToScala().
ae3278d [Josh Rosen] Throw ClassCastException errors during inbound conversions.
7ca7fcb [Josh Rosen] Comments and cleanup
1e87a45 [Josh Rosen] WIP refactoring of CatalystTypeConverters
---
 .../spark/ml/feature/JavaHashingTFSuite.java  |   6 +-
 .../sql/catalyst/CatalystTypeConverters.scala | 558 ++++++++++--------
 .../sql/catalyst/expressions/generators.scala |  19 +-
 .../CatalystTypeConvertersSuite.scala         |  62 ++
 .../columnar/InMemoryColumnarQuerySuite.scala |   2 +-
 5 files changed, 382 insertions(+), 265 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala

diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java
index da2218056307e..599e9cfd23ad4 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java
@@ -55,9 +55,9 @@ public void tearDown() {
   @Test
   public void hashingTF() {
     JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
-      RowFactory.create(0, "Hi I heard about Spark"),
-      RowFactory.create(0, "I wish Java could use case classes"),
-      RowFactory.create(1, "Logistic regression models are neat")
+      RowFactory.create(0.0, "Hi I heard about Spark"),
+      RowFactory.create(0.0, "I wish Java could use case classes"),
+      RowFactory.create(1.0, "Logistic regression models are neat")
     ));
     StructType schema = new StructType(new StructField[]{
       new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 1c0ddb5093d17..2e7b4c236d8f8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -18,7 +18,10 @@
 package org.apache.spark.sql.catalyst
 
 import java.lang.{Iterable => JavaIterable}
+import java.math.{BigDecimal => JavaBigDecimal}
+import java.sql.Date
 import java.util.{Map => JavaMap}
+import javax.annotation.Nullable
 
 import scala.collection.mutable.HashMap
 
@@ -34,197 +37,338 @@ object CatalystTypeConverters {
   // Since the map values can be mutable, we explicitly import scala.collection.Map at here.
   import scala.collection.Map
 
+  private def isPrimitive(dataType: DataType): Boolean = {
+    dataType match {
+      case BooleanType => true
+      case ByteType => true
+      case ShortType => true
+      case IntegerType => true
+      case LongType => true
+      case FloatType => true
+      case DoubleType => true
+      case _ => false
+    }
+  }
+
+  private def getConverterForType(dataType: DataType): CatalystTypeConverter[Any, Any, Any] = {
+    val converter = dataType match {
+      case udt: UserDefinedType[_] => UDTConverter(udt)
+      case arrayType: ArrayType => ArrayConverter(arrayType.elementType)
+      case mapType: MapType => MapConverter(mapType.keyType, mapType.valueType)
+      case structType: StructType => StructConverter(structType)
+      case StringType => StringConverter
+      case DateType => DateConverter
+      case dt: DecimalType => BigDecimalConverter
+      case BooleanType => BooleanConverter
+      case ByteType => ByteConverter
+      case ShortType => ShortConverter
+      case IntegerType => IntConverter
+      case LongType => LongConverter
+      case FloatType => FloatConverter
+      case DoubleType => DoubleConverter
+      case _ => IdentityConverter
+    }
+    converter.asInstanceOf[CatalystTypeConverter[Any, Any, Any]]
+  }
+
   /**
-   * Converts Scala objects to catalyst rows / types. This method is slow, and for batch
-   * conversion you should be using converter produced by createToCatalystConverter.
-   * Note: This is always called after schemaFor has been called.
-   *       This ordering is important for UDT registration.
+   * Converts a Scala type to its Catalyst equivalent (and vice versa).
+   *
+   * @tparam ScalaInputType The type of Scala values that can be converted to Catalyst.
+   * @tparam ScalaOutputType The type of Scala values returned when converting Catalyst to Scala.
+   * @tparam CatalystType The internal Catalyst type used to represent values of this Scala type.
    */
-  def convertToCatalyst(a: Any, dataType: DataType): Any = (a, dataType) match {
-    // Check UDT first since UDTs can override other types
-    case (obj, udt: UserDefinedType[_]) =>
-      udt.serialize(obj)
-
-    case (o: Option[_], _) =>
-      o.map(convertToCatalyst(_, dataType)).orNull
-
-    case (s: Seq[_], arrayType: ArrayType) =>
-      s.map(convertToCatalyst(_, arrayType.elementType))
-
-    case (jit: JavaIterable[_], arrayType: ArrayType) => {
-      val iter = jit.iterator
-      var listOfItems: List[Any] = List()
-      while (iter.hasNext) {
-        val item = iter.next()
-        listOfItems :+= convertToCatalyst(item, arrayType.elementType)
+  private abstract class CatalystTypeConverter[ScalaInputType, ScalaOutputType, CatalystType]
+    extends Serializable {
+
+    /**
+     * Converts a Scala type to its Catalyst equivalent while automatically handling nulls
+     * and Options.
+     */
+    final def toCatalyst(@Nullable maybeScalaValue: Any): CatalystType = {
+      if (maybeScalaValue == null) {
+        null.asInstanceOf[CatalystType]
+      } else if (maybeScalaValue.isInstanceOf[Option[ScalaInputType]]) {
+        val opt = maybeScalaValue.asInstanceOf[Option[ScalaInputType]]
+        if (opt.isDefined) {
+          toCatalystImpl(opt.get)
+        } else {
+          null.asInstanceOf[CatalystType]
+        }
+      } else {
+        toCatalystImpl(maybeScalaValue.asInstanceOf[ScalaInputType])
       }
-      listOfItems
     }
 
-    case (s: Array[_], arrayType: ArrayType) =>
-      s.toSeq.map(convertToCatalyst(_, arrayType.elementType))
+    /**
+     * Given a Catalyst row, convert the value at column `column` to its Scala equivalent.
+     */
+    final def toScala(row: Row, column: Int): ScalaOutputType = {
+      if (row.isNullAt(column)) null.asInstanceOf[ScalaOutputType] else toScalaImpl(row, column)
+    }
+
+    /**
+     * Convert a Catalyst value to its Scala equivalent.
+     */
+    def toScala(@Nullable catalystValue: CatalystType): ScalaOutputType
+
+    /**
+     * Converts a Scala value to its Catalyst equivalent.
+     * @param scalaValue the Scala value, guaranteed not to be null.
+     * @return the Catalyst value.
+     */
+    protected def toCatalystImpl(scalaValue: ScalaInputType): CatalystType
+
+    /**
+     * Given a Catalyst row, convert the value at column `column` to its Scala equivalent.
+     * This method will only be called on non-null columns.
+     */
+    protected def toScalaImpl(row: Row, column: Int): ScalaOutputType
+  }
 
-    case (m: Map[_, _], mapType: MapType) =>
-      m.map { case (k, v) =>
-        convertToCatalyst(k, mapType.keyType) -> convertToCatalyst(v, mapType.valueType)
-      }
+  private object IdentityConverter extends CatalystTypeConverter[Any, Any, Any] {
+    override def toCatalystImpl(scalaValue: Any): Any = scalaValue
+    override def toScala(catalystValue: Any): Any = catalystValue
+    override def toScalaImpl(row: Row, column: Int): Any = row(column)
+  }
 
-    case (jmap: JavaMap[_, _], mapType: MapType) =>
-      val iter = jmap.entrySet.iterator
-      var listOfEntries: List[(Any, Any)] = List()
-      while (iter.hasNext) {
-        val entry = iter.next()
-        listOfEntries :+= (convertToCatalyst(entry.getKey, mapType.keyType),
-          convertToCatalyst(entry.getValue, mapType.valueType))
+  private case class UDTConverter(
+      udt: UserDefinedType[_]) extends CatalystTypeConverter[Any, Any, Any] {
+    override def toCatalystImpl(scalaValue: Any): Any = udt.serialize(scalaValue)
+    override def toScala(catalystValue: Any): Any = udt.deserialize(catalystValue)
+    override def toScalaImpl(row: Row, column: Int): Any = toScala(row(column))
+  }
+
+  /** Converter for arrays, sequences, and Java iterables. */
+  private case class ArrayConverter(
+      elementType: DataType) extends CatalystTypeConverter[Any, Seq[Any], Seq[Any]] {
+
+    private[this] val elementConverter = getConverterForType(elementType)
+
+    override def toCatalystImpl(scalaValue: Any): Seq[Any] = {
+      scalaValue match {
+        case a: Array[_] => a.toSeq.map(elementConverter.toCatalyst)
+        case s: Seq[_] => s.map(elementConverter.toCatalyst)
+        case i: JavaIterable[_] =>
+          val iter = i.iterator
+          var convertedIterable: List[Any] = List()
+          while (iter.hasNext) {
+            val item = iter.next()
+            convertedIterable :+= elementConverter.toCatalyst(item)
+          }
+          convertedIterable
       }
-      listOfEntries.toMap
-
-    case (p: Product, structType: StructType) =>
-      val ar = new Array[Any](structType.size)
-      val iter = p.productIterator
-      var idx = 0
-      while (idx < structType.size) {
-        ar(idx) = convertToCatalyst(iter.next(), structType.fields(idx).dataType)
-        idx += 1
+    }
+
+    override def toScala(catalystValue: Seq[Any]): Seq[Any] = {
+      if (catalystValue == null) {
+        null
+      } else {
+        catalystValue.asInstanceOf[Seq[_]].map(elementConverter.toScala)
       }
-      new GenericRowWithSchema(ar, structType)
+    }
 
-    case (d: String, _) =>
-      UTF8String(d)
+    override def toScalaImpl(row: Row, column: Int): Seq[Any] =
+      toScala(row(column).asInstanceOf[Seq[Any]])
+  }
+
+  private case class MapConverter(
+      keyType: DataType,
+      valueType: DataType)
+    extends CatalystTypeConverter[Any, Map[Any, Any], Map[Any, Any]] {
 
-    case (d: BigDecimal, _) =>
-      Decimal(d)
+    private[this] val keyConverter = getConverterForType(keyType)
+    private[this] val valueConverter = getConverterForType(valueType)
 
-    case (d: java.math.BigDecimal, _) =>
-      Decimal(d)
+    override def toCatalystImpl(scalaValue: Any): Map[Any, Any] = scalaValue match {
+      case m: Map[_, _] =>
+        m.map { case (k, v) =>
+          keyConverter.toCatalyst(k) -> valueConverter.toCatalyst(v)
+        }
 
-    case (d: java.sql.Date, _) =>
-      DateUtils.fromJavaDate(d)
+      case jmap: JavaMap[_, _] =>
+        val iter = jmap.entrySet.iterator
+        val convertedMap: HashMap[Any, Any] = HashMap()
+        while (iter.hasNext) {
+          val entry = iter.next()
+          val key = keyConverter.toCatalyst(entry.getKey)
+          convertedMap(key) = valueConverter.toCatalyst(entry.getValue)
+        }
+        convertedMap
+    }
 
-    case (r: Row, structType: StructType) =>
-      val converters = structType.fields.map {
-        f => (item: Any) => convertToCatalyst(item, f.dataType)
+    override def toScala(catalystValue: Map[Any, Any]): Map[Any, Any] = {
+      if (catalystValue == null) {
+        null
+      } else {
+        catalystValue.map { case (k, v) =>
+          keyConverter.toScala(k) -> valueConverter.toScala(v)
+        }
       }
-      convertRowWithConverters(r, structType, converters)
+    }
 
-    case (other, _) =>
-      other
+    override def toScalaImpl(row: Row, column: Int): Map[Any, Any] =
+      toScala(row(column).asInstanceOf[Map[Any, Any]])
   }
 
-  /**
-   * Creates a converter function that will convert Scala objects to the specified catalyst type.
-   * Typical use case would be converting a collection of rows that have the same schema. You will
-   * call this function once to get a converter, and apply it to every row.
-   */
-  private[sql] def createToCatalystConverter(dataType: DataType): Any => Any = {
-    def extractOption(item: Any): Any = item match {
-      case opt: Option[_] => opt.orNull
-      case other => other
-    }
+  private case class StructConverter(
+      structType: StructType) extends CatalystTypeConverter[Any, Row, Row] {
 
-    dataType match {
-      // Check UDT first since UDTs can override other types
-      case udt: UserDefinedType[_] =>
-        (item) => extractOption(item) match {
-          case null => null
-          case other => udt.serialize(other)
-        }
+    private[this] val converters = structType.fields.map { f => getConverterForType(f.dataType) }
 
-      case arrayType: ArrayType =>
-        val elementConverter = createToCatalystConverter(arrayType.elementType)
-        (item: Any) => {
-          extractOption(item) match {
-            case a: Array[_] => a.toSeq.map(elementConverter)
-            case s: Seq[_] => s.map(elementConverter)
-            case i: JavaIterable[_] => {
-              val iter = i.iterator
-              var convertedIterable: List[Any] = List()
-              while (iter.hasNext) {
-                val item = iter.next()
-                convertedIterable :+= elementConverter(item)
-              }
-              convertedIterable
-            }
-            case null => null
-          }
+    override def toCatalystImpl(scalaValue: Any): Row = scalaValue match {
+      case row: Row =>
+        val ar = new Array[Any](row.size)
+        var idx = 0
+        while (idx < row.size) {
+          ar(idx) = converters(idx).toCatalyst(row(idx))
+          idx += 1
         }
-
-      case mapType: MapType =>
-        val keyConverter = createToCatalystConverter(mapType.keyType)
-        val valueConverter = createToCatalystConverter(mapType.valueType)
-        (item: Any) => {
-          extractOption(item) match {
-            case m: Map[_, _] =>
-              m.map { case (k, v) =>
-                keyConverter(k) -> valueConverter(v)
-              }
-
-            case jmap: JavaMap[_, _] =>
-              val iter = jmap.entrySet.iterator
-              val convertedMap: HashMap[Any, Any] = HashMap()
-              while (iter.hasNext) {
-                val entry = iter.next()
-                convertedMap(keyConverter(entry.getKey)) = valueConverter(entry.getValue)
-              }
-              convertedMap
-
-            case null => null
-          }
+        new GenericRowWithSchema(ar, structType)
+
+      case p: Product =>
+        val ar = new Array[Any](structType.size)
+        val iter = p.productIterator
+        var idx = 0
+        while (idx < structType.size) {
+          ar(idx) = converters(idx).toCatalyst(iter.next())
+          idx += 1
         }
+        new GenericRowWithSchema(ar, structType)
+    }
 
-      case structType: StructType =>
-        val converters = structType.fields.map(f => createToCatalystConverter(f.dataType))
-        (item: Any) => {
-          extractOption(item) match {
-            case r: Row =>
-              convertRowWithConverters(r, structType, converters)
-
-            case p: Product =>
-              val ar = new Array[Any](structType.size)
-              val iter = p.productIterator
-              var idx = 0
-              while (idx < structType.size) {
-                ar(idx) = converters(idx)(iter.next())
-                idx += 1
-              }
-              new GenericRowWithSchema(ar, structType)
-
-            case null =>
-              null
-          }
+    override def toScala(row: Row): Row = {
+      if (row == null) {
+        null
+      } else {
+        val ar = new Array[Any](row.size)
+        var idx = 0
+        while (idx < row.size) {
+          ar(idx) = converters(idx).toScala(row, idx)
+          idx += 1
         }
-
-      case dateType: DateType => (item: Any) => extractOption(item) match {
-        case d: java.sql.Date => DateUtils.fromJavaDate(d)
-        case other => other
+        new GenericRowWithSchema(ar, structType)
       }
+    }
 
-      case dataType: StringType => (item: Any) => extractOption(item) match {
-        case s: String => UTF8String(s)
-        case other => other
-      }
+    override def toScalaImpl(row: Row, column: Int): Row = toScala(row(column).asInstanceOf[Row])
+  }
+
+  private object StringConverter extends CatalystTypeConverter[Any, String, Any] {
+    override def toCatalystImpl(scalaValue: Any): UTF8String = scalaValue match {
+      case str: String => UTF8String(str)
+      case utf8: UTF8String => utf8
+    }
+    override def toScala(catalystValue: Any): String = catalystValue match {
+      case null => null
+      case str: String => str
+      case utf8: UTF8String => utf8.toString()
+    }
+    override def toScalaImpl(row: Row, column: Int): String = row(column).toString
+  }
+
+  private object DateConverter extends CatalystTypeConverter[Date, Date, Any] {
+    override def toCatalystImpl(scalaValue: Date): Int = DateUtils.fromJavaDate(scalaValue)
+    override def toScala(catalystValue: Any): Date =
+      if (catalystValue == null) null else DateUtils.toJavaDate(catalystValue.asInstanceOf[Int])
+    override def toScalaImpl(row: Row, column: Int): Date = toScala(row.getInt(column))
+  }
+
+  private object BigDecimalConverter extends CatalystTypeConverter[Any, JavaBigDecimal, Decimal] {
+    override def toCatalystImpl(scalaValue: Any): Decimal = scalaValue match {
+      case d: BigDecimal => Decimal(d)
+      case d: JavaBigDecimal => Decimal(d)
+      case d: Decimal => d
+    }
+    override def toScala(catalystValue: Decimal): JavaBigDecimal = catalystValue.toJavaBigDecimal
+    override def toScalaImpl(row: Row, column: Int): JavaBigDecimal = row.get(column) match {
+      case d: JavaBigDecimal => d
+      case d: Decimal => d.toJavaBigDecimal
+    }
+  }
+
+  private abstract class PrimitiveConverter[T] extends CatalystTypeConverter[T, Any, Any] {
+    final override def toScala(catalystValue: Any): Any = catalystValue
+    final override def toCatalystImpl(scalaValue: T): Any = scalaValue
+  }
+
+  private object BooleanConverter extends PrimitiveConverter[Boolean] {
+    override def toScalaImpl(row: Row, column: Int): Boolean = row.getBoolean(column)
+  }
+
+  private object ByteConverter extends PrimitiveConverter[Byte] {
+    override def toScalaImpl(row: Row, column: Int): Byte = row.getByte(column)
+  }
+
+  private object ShortConverter extends PrimitiveConverter[Short] {
+    override def toScalaImpl(row: Row, column: Int): Short = row.getShort(column)
+  }
+
+  private object IntConverter extends PrimitiveConverter[Int] {
+    override def toScalaImpl(row: Row, column: Int): Int = row.getInt(column)
+  }
+
+  private object LongConverter extends PrimitiveConverter[Long] {
+    override def toScalaImpl(row: Row, column: Int): Long = row.getLong(column)
+  }
+
+  private object FloatConverter extends PrimitiveConverter[Float] {
+    override def toScalaImpl(row: Row, column: Int): Float = row.getFloat(column)
+  }
 
-      case _ =>
-        (item: Any) => extractOption(item) match {
-          case d: BigDecimal => Decimal(d)
-          case d: java.math.BigDecimal => Decimal(d)
-          case other => other
+  private object DoubleConverter extends PrimitiveConverter[Double] {
+    override def toScalaImpl(row: Row, column: Int): Double = row.getDouble(column)
+  }
+
+  /**
+   * Converts Scala objects to catalyst rows / types. This method is slow, and for batch
+   * conversion you should be using converter produced by createToCatalystConverter.
+   * Note: This is always called after schemaFor has been called.
+   *       This ordering is important for UDT registration.
+   */
+  def convertToCatalyst(scalaValue: Any, dataType: DataType): Any = {
+    getConverterForType(dataType).toCatalyst(scalaValue)
+  }
+
+  /**
+   * Creates a converter function that will convert Scala objects to the specified Catalyst type.
+   * Typical use case would be converting a collection of rows that have the same schema. You will
+   * call this function once to get a converter, and apply it to every row.
+   */
+  private[sql] def createToCatalystConverter(dataType: DataType): Any => Any = {
+    if (isPrimitive(dataType)) {
+      // Although the `else` branch here is capable of handling inbound conversion of primitives,
+      // we add some special-case handling for those types here. The motivation for this relates to
+      // Java method invocation costs: if we have rows that consist entirely of primitive columns,
+      // then returning the same conversion function for all of the columns means that the call site
+      // will be monomorphic instead of polymorphic. In microbenchmarks, this actually resulted in
+      // a measurable performance impact. Note that this optimization will be unnecessary if we
+      // use code generation to construct Scala Row -> Catalyst Row converters.
+      def convert(maybeScalaValue: Any): Any = {
+        if (maybeScalaValue.isInstanceOf[Option[Any]]) {
+          maybeScalaValue.asInstanceOf[Option[Any]].orNull
+        } else {
+          maybeScalaValue
         }
+      }
+      convert
+    } else {
+      getConverterForType(dataType).toCatalyst
     }
   }
 
   /**
-   *  Converts Scala objects to catalyst rows / types.
+   *  Converts Scala objects to Catalyst rows / types.
    *
    *  Note: This should be called before do evaluation on Row
    *        (It does not support UDT)
    *  This is used to create an RDD or test results with correct types for Catalyst.
    */
   def convertToCatalyst(a: Any): Any = a match {
-    case s: String => UTF8String(s)
-    case d: java.sql.Date => DateUtils.fromJavaDate(d)
-    case d: BigDecimal => Decimal(d)
-    case d: java.math.BigDecimal => Decimal(d)
+    case s: String => StringConverter.toCatalyst(s)
+    case d: Date => DateConverter.toCatalyst(d)
+    case d: BigDecimal => BigDecimalConverter.toCatalyst(d)
+    case d: JavaBigDecimal => BigDecimalConverter.toCatalyst(d)
     case seq: Seq[Any] => seq.map(convertToCatalyst)
     case r: Row => Row(r.toSeq.map(convertToCatalyst): _*)
     case arr: Array[Any] => arr.toSeq.map(convertToCatalyst).toArray
@@ -238,33 +382,8 @@ object CatalystTypeConverters {
    * This method is slow, and for batch conversion you should be using converter
    * produced by createToScalaConverter.
    */
-  def convertToScala(a: Any, dataType: DataType): Any = (a, dataType) match {
-    // Check UDT first since UDTs can override other types
-    case (d, udt: UserDefinedType[_]) =>
-      udt.deserialize(d)
-
-    case (s: Seq[_], arrayType: ArrayType) =>
-      s.map(convertToScala(_, arrayType.elementType))
-
-    case (m: Map[_, _], mapType: MapType) =>
-      m.map { case (k, v) =>
-        convertToScala(k, mapType.keyType) -> convertToScala(v, mapType.valueType)
-      }
-
-    case (r: Row, s: StructType) =>
-      convertRowToScala(r, s)
-
-    case (d: Decimal, _: DecimalType) =>
-      d.toJavaBigDecimal
-
-    case (i: Int, DateType) =>
-      DateUtils.toJavaDate(i)
-
-    case (s: UTF8String, StringType) =>
-      s.toString()
-
-    case (other, _) =>
-      other
+  def convertToScala(catalystValue: Any, dataType: DataType): Any = {
+    getConverterForType(dataType).toScala(catalystValue)
   }
 
   /**
@@ -272,82 +391,7 @@ object CatalystTypeConverters {
    * Typical use case would be converting a collection of rows that have the same schema. You will
    * call this function once to get a converter, and apply it to every row.
    */
-  private[sql] def createToScalaConverter(dataType: DataType): Any => Any = dataType match {
-    // Check UDT first since UDTs can override other types
-    case udt: UserDefinedType[_] =>
-      (item: Any) => if (item == null) null else udt.deserialize(item)
-
-    case arrayType: ArrayType =>
-      val elementConverter = createToScalaConverter(arrayType.elementType)
-      (item: Any) => if (item == null) null else item.asInstanceOf[Seq[_]].map(elementConverter)
-
-    case mapType: MapType =>
-      val keyConverter = createToScalaConverter(mapType.keyType)
-      val valueConverter = createToScalaConverter(mapType.valueType)
-      (item: Any) => if (item == null) {
-        null
-      } else {
-        item.asInstanceOf[Map[_, _]].map { case (k, v) =>
-          keyConverter(k) -> valueConverter(v)
-        }
-      }
-
-    case s: StructType =>
-      val converters = s.fields.map(f => createToScalaConverter(f.dataType))
-      (item: Any) => {
-        if (item == null) {
-          null
-        } else {
-          convertRowWithConverters(item.asInstanceOf[Row], s, converters)
-        }
-      }
-
-    case _: DecimalType =>
-      (item: Any) => item match {
-        case d: Decimal => d.toJavaBigDecimal
-        case other => other
-      }
-
-    case DateType =>
-      (item: Any) => item match {
-        case i: Int => DateUtils.toJavaDate(i)
-        case other => other
-      }
-
-    case StringType =>
-      (item: Any) => item match {
-        case s: UTF8String => s.toString()
-        case other => other
-      }
-
-    case other =>
-      (item: Any) => item
-  }
-
-  def convertRowToScala(r: Row, schema: StructType): Row = {
-    val ar = new Array[Any](r.size)
-    var idx = 0
-    while (idx < r.size) {
-      ar(idx) = convertToScala(r(idx), schema.fields(idx).dataType)
-      idx += 1
-    }
-    new GenericRowWithSchema(ar, schema)
-  }
-
-  /**
-   * Converts a row by applying the provided set of converter functions. It is used for both
-   * toScala and toCatalyst conversions.
-   */
-  private[sql] def convertRowWithConverters(
-      row: Row,
-      schema: StructType,
-      converters: Array[Any => Any]): Row = {
-    val ar = new Array[Any](row.size)
-    var idx = 0
-    while (idx < row.size) {
-      ar(idx) = converters(idx)(row(idx))
-      idx += 1
-    }
-    new GenericRowWithSchema(ar, schema)
+  private[sql] def createToScalaConverter(dataType: DataType): Any => Any = {
+    getConverterForType(dataType).toScala
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index 634138010fd21..b6191eafba71b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -71,12 +71,23 @@ case class UserDefinedGenerator(
     children: Seq[Expression])
   extends Generator {
 
+  @transient private[this] var inputRow: InterpretedProjection = _
+  @transient private[this] var convertToScala: (Row) => Row = _
+
+  private def initializeConverters(): Unit = {
+    inputRow = new InterpretedProjection(children)
+    convertToScala = {
+      val inputSchema = StructType(children.map(e => StructField(e.simpleString, e.dataType, true)))
+      CatalystTypeConverters.createToScalaConverter(inputSchema)
+    }.asInstanceOf[(Row => Row)]
+  }
+
   override def eval(input: Row): TraversableOnce[Row] = {
-    // TODO(davies): improve this
+    if (inputRow == null) {
+      initializeConverters()
+    }
     // Convert the objects into Scala Type before calling function, we need schema to support UDT
-    val inputSchema = StructType(children.map(e => StructField(e.simpleString, e.dataType, true)))
-    val inputRow = new InterpretedProjection(children)
-    function(CatalystTypeConverters.convertToScala(inputRow(input), inputSchema).asInstanceOf[Row])
+    function(convertToScala(inputRow(input)))
   }
 
   override def toString: String = s"UserDefinedGenerator(${children.mkString(",")})"
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala
new file mode 100644
index 0000000000000..df0f04563edcf
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types._
+
+class CatalystTypeConvertersSuite extends SparkFunSuite {
+
+  private val simpleTypes: Seq[DataType] = Seq(
+    StringType,
+    DateType,
+    BooleanType,
+    ByteType,
+    ShortType,
+    IntegerType,
+    LongType,
+    FloatType,
+    DoubleType)
+
+  test("null handling in rows") {
+    val schema = StructType(simpleTypes.map(t => StructField(t.getClass.getName, t)))
+    val convertToCatalyst = CatalystTypeConverters.createToCatalystConverter(schema)
+    val convertToScala = CatalystTypeConverters.createToScalaConverter(schema)
+
+    val scalaRow = Row.fromSeq(Seq.fill(simpleTypes.length)(null))
+    assert(convertToScala(convertToCatalyst(scalaRow)) === scalaRow)
+  }
+
+  test("null handling for individual values") {
+    for (dataType <- simpleTypes) {
+      assert(CatalystTypeConverters.createToScalaConverter(dataType)(null) === null)
+    }
+  }
+
+  test("option handling in convertToCatalyst") {
+    // convertToCatalyst doesn't handle unboxing from Options. This is inconsistent with
+    // createToCatalystConverter but it may not actually matter as this is only called internally
+    // in a handful of places where we don't expect to receive Options.
+    assert(CatalystTypeConverters.convertToCatalyst(Some(123)) === Some(123))
+  }
+
+  test("option handling in createToCatalystConverter") {
+    assert(CatalystTypeConverters.createToCatalystConverter(IntegerType)(Some(123)) === 123)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
index 56591d9dba29e..055453e688e73 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
@@ -173,7 +173,7 @@ class InMemoryColumnarQuerySuite extends QueryTest {
           new Timestamp(i),
           (1 to i).toSeq,
           (0 to i).map(j => s"map_key_$j" -> (Long.MaxValue - j)).toMap,
-          Row((i - 0.25).toFloat, (1 to i).toSeq))
+          Row((i - 0.25).toFloat, Seq(true, false, null)))
       }
     createDataFrame(rdd, schema).registerTempTable("InMemoryCache_different_data_types")
     // Cache the table.

From 07c16cb5ba9cb0bfe34e8c0efbf06540a22d4e4e Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Tue, 2 Jun 2015 22:56:56 -0700
Subject: [PATCH 326/525] [SPARK-8053] [MLLIB] renamed scalingVector to
 scalingVec

I searched the Spark codebase for all occurrences of "scalingVector"

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #6596 from jkbradley/scalingVec-rename and squashes the following commits:

d3812f8 [Joseph K. Bradley] renamed scalingVector to scalingVec
---
 .../spark/ml/feature/ElementwiseProduct.scala      |  2 +-
 .../spark/mllib/feature/ElementwiseProduct.scala   | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
index 3ae1833390152..1e758cb775de7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -41,7 +41,7 @@ class ElementwiseProduct(override val uid: String)
     * the vector to multiply with input vectors
     * @group param
     */
-  val scalingVec: Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")
+  val scalingVec: Param[Vector] = new Param(this, "scalingVec", "vector for hadamard product")
 
   /** @group setParam */
   def setScalingVec(value: Vector): this.type = set(scalingVec, value)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
index b0985baf9b278..d67fe6c3ee4f8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
@@ -25,10 +25,10 @@ import org.apache.spark.mllib.linalg._
  * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a
  * provided "weight" vector. In other words, it scales each column of the dataset by a scalar
  * multiplier.
- * @param scalingVector The values used to scale the reference vector's individual components.
+ * @param scalingVec The values used to scale the reference vector's individual components.
  */
 @Experimental
-class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
+class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer {
 
   /**
    * Does the hadamard product transformation.
@@ -37,15 +37,15 @@ class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
    * @return transformed vector.
    */
   override def transform(vector: Vector): Vector = {
-    require(vector.size == scalingVector.size,
-      s"vector sizes do not match: Expected ${scalingVector.size} but found ${vector.size}")
+    require(vector.size == scalingVec.size,
+      s"vector sizes do not match: Expected ${scalingVec.size} but found ${vector.size}")
     vector match {
       case dv: DenseVector =>
         val values: Array[Double] = dv.values.clone()
-        val dim = scalingVector.size
+        val dim = scalingVec.size
         var i = 0
         while (i < dim) {
-          values(i) *= scalingVector(i)
+          values(i) *= scalingVec(i)
           i += 1
         }
         Vectors.dense(values)
@@ -54,7 +54,7 @@ class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
         val dim = values.length
         var i = 0
         while (i < dim) {
-          values(i) *= scalingVector(indices(i))
+          values(i) *= scalingVec(indices(i))
           i += 1
         }
         Vectors.sparse(size, indices, values)

From ccaa823290cbe859cd224ac0f7071dfd0218b669 Mon Sep 17 00:00:00 2001
From: WangTaoTheTonic <wangtao111@huawei.com>
Date: Tue, 2 Jun 2015 22:59:48 -0700
Subject: [PATCH 327/525] [MINOR] make the launcher project name consistent
 with others

I found this by chance while building spark and think it is better to keep its name consistent with other sub-projects (Spark Project *).

I am not gonna file JIRA as it is a pretty small issue.

Author: WangTaoTheTonic <wangtao111@huawei.com>

Closes #6603 from WangTaoTheTonic/projName and squashes the following commits:

994b3ba [WangTaoTheTonic] make the project name consistent
---
 launcher/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher/pom.xml b/launcher/pom.xml
index ebfa7685eaa18..cc177d23dff77 100644
--- a/launcher/pom.xml
+++ b/launcher/pom.xml
@@ -29,7 +29,7 @@
   <groupId>org.apache.spark</groupId>
   <artifactId>spark-launcher_2.10</artifactId>
   <packaging>jar</packaging>
-  <name>Spark Launcher Project</name>
+  <name>Spark Project Launcher</name>
   <url>http://spark.apache.org/</url>
   <properties>
     <sbt.project.name>launcher</sbt.project.name>

From 43adbd56114ba80039a23909b0a30d393eaacc62 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Tue, 2 Jun 2015 23:15:38 -0700
Subject: [PATCH 328/525] [SPARK-8043] [MLLIB] [DOC] update NaiveBayes and SVM
 examples in doc

jira: https://issues.apache.org/jira/browse/SPARK-8043

I found some issues during testing the save/load examples in markdown Documents, as a part of 1.4 QA plan

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #6584 from hhbyyh/naiveDocExample and squashes the following commits:

a01a206 [Yuhao Yang] fix for Gaussian mixture
2fb8b96 [Yuhao Yang] update NaiveBayes and SVM examples in doc
---
 docs/mllib-clustering.md     |  6 +++---
 docs/mllib-linear-methods.md | 24 ++++++++++--------------
 docs/mllib-naive-bayes.md    |  2 +-
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index dac22f736e8cb..1b088969ddc25 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -249,11 +249,11 @@ public class GaussianMixtureExample {
     GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());
 
     // Save and load GaussianMixtureModel
-    gmm.save(sc, "myGMMModel")
-    GaussianMixtureModel sameModel = GaussianMixtureModel.load(sc, "myGMMModel")
+    gmm.save(sc.sc(), "myGMMModel");
+    GaussianMixtureModel sameModel = GaussianMixtureModel.load(sc.sc(), "myGMMModel");
     // Output the parameters of the mixture model
     for(int j=0; j<gmm.k(); j++) {
-        System.out.println("weight=%f\nmu=%s\nsigma=\n%s\n",
+        System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
             gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
     }
   }
diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md
index 8029edca16002..3dc8cc902fa72 100644
--- a/docs/mllib-linear-methods.md
+++ b/docs/mllib-linear-methods.md
@@ -163,11 +163,8 @@ object, and make predictions with the resulting model to compute the training
 error.
 
 {% highlight scala %}
-import org.apache.spark.SparkContext
 import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
 import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLUtils
 
 // Load training data in LIBSVM format.
@@ -231,15 +228,13 @@ calling `.rdd()` on your `JavaRDD` object. A self-contained application example
 that is equivalent to the provided example in Scala is given bellow:
 
 {% highlight java %}
-import java.util.Random;
-
 import scala.Tuple2;
 
 import org.apache.spark.api.java.*;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.mllib.classification.*;
 import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
-import org.apache.spark.mllib.linalg.Vector;
+
 import org.apache.spark.mllib.regression.LabeledPoint;
 import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.SparkConf;
@@ -282,8 +277,8 @@ public class SVMClassifier {
     System.out.println("Area under ROC = " + auROC);
 
     // Save and load model
-    model.save(sc.sc(), "myModelPath");
-    SVMModel sameModel = SVMModel.load(sc.sc(), "myModelPath");
+    model.save(sc, "myModelPath");
+    SVMModel sameModel = SVMModel.load(sc, "myModelPath");
   }
 }
 {% endhighlight %}
@@ -315,15 +310,12 @@ a dependency.
 </div>
 
 <div data-lang="python" markdown="1">
-The following example shows how to load a sample dataset, build Logistic Regression model,
+The following example shows how to load a sample dataset, build SVM model,
 and make predictions with the resulting model to compute the training error.
 
-Note that the Python API does not yet support model save/load but will in the future.
-
 {% highlight python %}
-from pyspark.mllib.classification import LogisticRegressionWithSGD
+from pyspark.mllib.classification import SVMWithSGD, SVMModel
 from pyspark.mllib.regression import LabeledPoint
-from numpy import array
 
 # Load and parse the data
 def parsePoint(line):
@@ -334,12 +326,16 @@ data = sc.textFile("data/mllib/sample_svm_data.txt")
 parsedData = data.map(parsePoint)
 
 # Build the model
-model = LogisticRegressionWithSGD.train(parsedData)
+model = SVMWithSGD.train(parsedData, iterations=100)
 
 # Evaluating the model on training data
 labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
 trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
 print("Training Error = " + str(trainErr))
+
+# Save and load model
+model.save(sc, "myModelPath")
+sameModel = SVMModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 </div>
diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index acdcc371487f8..bf6d124fd5d8d 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -53,7 +53,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
 val training = splits(0)
 val test = splits(1)
 
-val model = NaiveBayes.train(training, lambda = 1.0, model = "multinomial")
+val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")
 
 val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
 val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()

From 452eb82dd722e5dfd00ee47bb8b6353933b0016e Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 2 Jun 2015 23:24:47 -0700
Subject: [PATCH 329/525] [SPARK-8032] [PYSPARK] Make version checking for
 NumPy in MLlib more robust

The current checking does version `1.x' is less than `1.4' this will fail if x has greater than 1 digit, since x > 4, however `1.x` < `1.4`

It fails in my system since I have version `1.10` :P

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6579 from MechCoder/np_ver and squashes the following commits:

15430f8 [MechCoder] fix syntax error
893fb7e [MechCoder] remove equal to
e35f0d4 [MechCoder] minor
e89376c [MechCoder] Better checking
22703dd [MechCoder] [SPARK-8032] Make version checking for NumPy in MLlib more robust
---
 python/pyspark/mllib/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py
index b11aed2c3afda..acba3a717d21a 100644
--- a/python/pyspark/mllib/__init__.py
+++ b/python/pyspark/mllib/__init__.py
@@ -23,7 +23,9 @@
 # MLlib currently needs NumPy 1.4+, so complain if lower
 
 import numpy
-if numpy.version.version < '1.4':
+
+ver = [int(x) for x in numpy.version.version.split('.')[:2]]
+if ver < [1, 4]:
     raise Exception("MLlib requires NumPy 1.4+")
 
 __all__ = ['classification', 'clustering', 'feature', 'fpm', 'linalg', 'random',

From ce320cb2dbf28825f80795ce569735888f98d6e8 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 3 Jun 2015 00:23:34 -0700
Subject: [PATCH 330/525] [SPARK-8060] Improve DataFrame Python test coverage
 and documentation.

Author: Reynold Xin <rxin@databricks.com>

Closes #6601 from rxin/python-read-write-test-and-doc and squashes the following commits:

baa8ad5 [Reynold Xin] Code review feedback.
f081d47 [Reynold Xin] More documentation updates.
c9902fa [Reynold Xin] [SPARK-8060] Improve DataFrame Python reader/writer interface doc and testing.
---
 .rat-excludes                                 |   1 +
 python/pyspark/sql/__init__.py                |  13 +-
 python/pyspark/sql/context.py                 |  89 +++----
 python/pyspark/sql/dataframe.py               |  82 +++----
 python/pyspark/sql/readwriter.py              | 217 ++++++++----------
 python/pyspark/sql/tests.py                   |   2 +
 .../sql/parquet_partitioned/_SUCCESS          |   0
 .../sql/parquet_partitioned/_common_metadata  | Bin 0 -> 210 bytes
 .../sql/parquet_partitioned/_metadata         | Bin 0 -> 743 bytes
 .../day=1/.part-r-00008.gz.parquet.crc        | Bin 0 -> 12 bytes
 .../month=9/day=1/part-r-00008.gz.parquet     | Bin 0 -> 322 bytes
 .../day=25/.part-r-00002.gz.parquet.crc       | Bin 0 -> 12 bytes
 .../day=25/.part-r-00004.gz.parquet.crc       | Bin 0 -> 12 bytes
 .../month=10/day=25/part-r-00002.gz.parquet   | Bin 0 -> 343 bytes
 .../month=10/day=25/part-r-00004.gz.parquet   | Bin 0 -> 343 bytes
 .../day=26/.part-r-00005.gz.parquet.crc       | Bin 0 -> 12 bytes
 .../month=10/day=26/part-r-00005.gz.parquet   | Bin 0 -> 333 bytes
 .../day=1/.part-r-00007.gz.parquet.crc        | Bin 0 -> 12 bytes
 .../month=9/day=1/part-r-00007.gz.parquet     | Bin 0 -> 343 bytes
 python/test_support/sql/people.json           |   3 +
 20 files changed, 180 insertions(+), 227 deletions(-)
 create mode 100644 python/test_support/sql/parquet_partitioned/_SUCCESS
 create mode 100644 python/test_support/sql/parquet_partitioned/_common_metadata
 create mode 100644 python/test_support/sql/parquet_partitioned/_metadata
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/.part-r-00008.gz.parquet.crc
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/part-r-00008.gz.parquet
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00002.gz.parquet.crc
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00004.gz.parquet.crc
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00002.gz.parquet
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00004.gz.parquet
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/.part-r-00005.gz.parquet.crc
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/part-r-00005.gz.parquet
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/.part-r-00007.gz.parquet.crc
 create mode 100644 python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/part-r-00007.gz.parquet
 create mode 100644 python/test_support/sql/people.json

diff --git a/.rat-excludes b/.rat-excludes
index c0f81b57fe09d..8f2722cbd001f 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -82,3 +82,4 @@ local-1426633911242/*
 local-1430917381534/*
 DESCRIPTION
 NAMESPACE
+test_support/*
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 726d288d97b2e..ad9c891ba1c04 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -45,11 +45,20 @@
 
 
 def since(version):
+    """
+    A decorator that annotates a function to append the version of Spark the function was added.
+    """
+    import re
+    indent_p = re.compile(r'\n( +)')
+
     def deco(f):
-        f.__doc__ = f.__doc__.rstrip() + "\n\n.. versionadded:: %s" % version
+        indents = indent_p.findall(f.__doc__)
+        indent = ' ' * (min(len(m) for m in indents) if indents else 0)
+        f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
         return f
     return deco
 
+
 from pyspark.sql.types import Row
 from pyspark.sql.context import SQLContext, HiveContext
 from pyspark.sql.column import Column
@@ -58,7 +67,9 @@ def deco(f):
 from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter
 from pyspark.sql.window import Window, WindowSpec
 
+
 __all__ = [
     'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
     'DataFrameNaFunctions', 'DataFrameStatFunctions', 'Window', 'WindowSpec',
+    'DataFrameReader', 'DataFrameWriter'
 ]
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 22f6257dfe02d..9fdf43c3e6eb5 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -124,7 +124,10 @@ def getConf(self, key, defaultValue):
     @property
     @since("1.3.1")
     def udf(self):
-        """Returns a :class:`UDFRegistration` for UDF registration."""
+        """Returns a :class:`UDFRegistration` for UDF registration.
+
+        :return: :class:`UDFRegistration`
+        """
         return UDFRegistration(self)
 
     @since(1.4)
@@ -138,7 +141,7 @@ def range(self, start, end, step=1, numPartitions=None):
         :param end: the end value (exclusive)
         :param step: the incremental step (default: 1)
         :param numPartitions: the number of partitions of the DataFrame
-        :return: A new DataFrame
+        :return: :class:`DataFrame`
 
         >>> sqlContext.range(1, 7, 2).collect()
         [Row(id=1), Row(id=3), Row(id=5)]
@@ -195,8 +198,8 @@ def _inferSchema(self, rdd, samplingRatio=None):
             raise ValueError("The first row in RDD is empty, "
                              "can not infer schema")
         if type(first) is dict:
-            warnings.warn("Using RDD of dict to inferSchema is deprecated,"
-                          "please use pyspark.sql.Row instead")
+            warnings.warn("Using RDD of dict to inferSchema is deprecated. "
+                          "Use pyspark.sql.Row instead")
 
         if samplingRatio is None:
             schema = _infer_schema(first)
@@ -219,7 +222,7 @@ def inferSchema(self, rdd, samplingRatio=None):
         """
         .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.
         """
-        warnings.warn("inferSchema is deprecated, please use createDataFrame instead")
+        warnings.warn("inferSchema is deprecated, please use createDataFrame instead.")
 
         if isinstance(rdd, DataFrame):
             raise TypeError("Cannot apply schema to DataFrame")
@@ -262,6 +265,7 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
             :class:`list`, or :class:`pandas.DataFrame`.
         :param schema: a :class:`StructType` or list of column names. default None.
         :param samplingRatio: the sample ratio of rows used for inferring
+        :return: :class:`DataFrame`
 
         >>> l = [('Alice', 1)]
         >>> sqlContext.createDataFrame(l).collect()
@@ -359,18 +363,15 @@ def registerDataFrameAsTable(self, df, tableName):
         else:
             raise ValueError("Can only register DataFrame as table")
 
-    @since(1.0)
     def parquetFile(self, *paths):
         """Loads a Parquet file, returning the result as a :class:`DataFrame`.
 
-        >>> import tempfile, shutil
-        >>> parquetFile = tempfile.mkdtemp()
-        >>> shutil.rmtree(parquetFile)
-        >>> df.saveAsParquetFile(parquetFile)
-        >>> df2 = sqlContext.parquetFile(parquetFile)
-        >>> sorted(df.collect()) == sorted(df2.collect())
-        True
+        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.parquet` instead.
+
+        >>> sqlContext.parquetFile('python/test_support/sql/parquet_partitioned').dtypes
+        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
         """
+        warnings.warn("parquetFile is deprecated. Use read.parquet() instead.")
         gateway = self._sc._gateway
         jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths))
         for i in range(0, len(paths)):
@@ -378,39 +379,15 @@ def parquetFile(self, *paths):
         jdf = self._ssql_ctx.parquetFile(jpaths)
         return DataFrame(jdf, self)
 
-    @since(1.0)
     def jsonFile(self, path, schema=None, samplingRatio=1.0):
         """Loads a text file storing one JSON object per line as a :class:`DataFrame`.
 
-        If the schema is provided, applies the given schema to this JSON dataset.
-        Otherwise, it samples the dataset with ratio ``samplingRatio`` to determine the schema.
-
-        >>> import tempfile, shutil
-        >>> jsonFile = tempfile.mkdtemp()
-        >>> shutil.rmtree(jsonFile)
-        >>> with open(jsonFile, 'w') as f:
-        ...     f.writelines(jsonStrings)
-        >>> df1 = sqlContext.jsonFile(jsonFile)
-        >>> df1.printSchema()
-        root
-         |-- field1: long (nullable = true)
-         |-- field2: string (nullable = true)
-         |-- field3: struct (nullable = true)
-         |    |-- field4: long (nullable = true)
+        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.json` instead.
 
-        >>> from pyspark.sql.types import *
-        >>> schema = StructType([
-        ...     StructField("field2", StringType()),
-        ...     StructField("field3",
-        ...         StructType([StructField("field5", ArrayType(IntegerType()))]))])
-        >>> df2 = sqlContext.jsonFile(jsonFile, schema)
-        >>> df2.printSchema()
-        root
-         |-- field2: string (nullable = true)
-         |-- field3: struct (nullable = true)
-         |    |-- field5: array (nullable = true)
-         |    |    |-- element: integer (containsNull = true)
+        >>> sqlContext.jsonFile('python/test_support/sql/people.json').dtypes
+        [('age', 'bigint'), ('name', 'string')]
         """
+        warnings.warn("jsonFile is deprecated. Use read.json() instead.")
         if schema is None:
             df = self._ssql_ctx.jsonFile(path, samplingRatio)
         else:
@@ -462,21 +439,16 @@ def func(iterator):
             df = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype)
         return DataFrame(df, self)
 
-    @since(1.3)
     def load(self, path=None, source=None, schema=None, **options):
         """Returns the dataset in a data source as a :class:`DataFrame`.
 
-        The data source is specified by the ``source`` and a set of ``options``.
-        If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
-
-        Optionally, a schema can be provided as the schema of the returned DataFrame.
+        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.load` instead.
         """
+        warnings.warn("load is deprecated. Use read.load() instead.")
         return self.read.load(path, source, schema, **options)
 
     @since(1.3)
-    def createExternalTable(self, tableName, path=None, source=None,
-                            schema=None, **options):
+    def createExternalTable(self, tableName, path=None, source=None, schema=None, **options):
         """Creates an external table based on the dataset in a data source.
 
         It returns the DataFrame associated with the external table.
@@ -487,6 +459,8 @@ def createExternalTable(self, tableName, path=None, source=None,
 
         Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
         created external table.
+
+        :return: :class:`DataFrame`
         """
         if path is not None:
             options["path"] = path
@@ -508,6 +482,8 @@ def createExternalTable(self, tableName, path=None, source=None,
     def sql(self, sqlQuery):
         """Returns a :class:`DataFrame` representing the result of the given query.
 
+        :return: :class:`DataFrame`
+
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlContext.sql("SELECT field1 AS f1, field2 as f2 from table1")
         >>> df2.collect()
@@ -519,6 +495,8 @@ def sql(self, sqlQuery):
     def table(self, tableName):
         """Returns the specified table as a :class:`DataFrame`.
 
+        :return: :class:`DataFrame`
+
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlContext.table("table1")
         >>> sorted(df.collect()) == sorted(df2.collect())
@@ -536,6 +514,9 @@ def tables(self, dbName=None):
         The returned DataFrame has two columns: ``tableName`` and ``isTemporary``
         (a column with :class:`BooleanType` indicating if a table is a temporary one or not).
 
+        :param dbName: string, name of the database to use.
+        :return: :class:`DataFrame`
+
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlContext.tables()
         >>> df2.filter("tableName = 'table1'").first()
@@ -550,7 +531,8 @@ def tables(self, dbName=None):
     def tableNames(self, dbName=None):
         """Returns a list of names of tables in the database ``dbName``.
 
-        If ``dbName`` is not specified, the current database will be used.
+        :param dbName: string, name of the database to use. Default to the current database.
+        :return: list of table names, in string
 
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> "table1" in sqlContext.tableNames()
@@ -585,8 +567,7 @@ def read(self):
         Returns a :class:`DataFrameReader` that can be used to read data
         in as a :class:`DataFrame`.
 
-        >>> sqlContext.read
-        <pyspark.sql.readwriter.DataFrameReader object at ...>
+        :return: :class:`DataFrameReader`
         """
         return DataFrameReader(self)
 
@@ -644,10 +625,14 @@ def register(self, name, f, returnType=StringType()):
 
 
 def _test():
+    import os
     import doctest
     from pyspark.context import SparkContext
     from pyspark.sql import Row, SQLContext
     import pyspark.sql.context
+
+    os.chdir(os.environ["SPARK_HOME"])
+
     globs = pyspark.sql.context.__dict__.copy()
     sc = SparkContext('local[4]', 'PythonTest')
     globs['sc'] = sc
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index a82b6b87c413e..7673153abe0e2 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -44,7 +44,7 @@ class DataFrame(object):
     A :class:`DataFrame` is equivalent to a relational table in Spark SQL,
     and can be created using various functions in :class:`SQLContext`::
 
-        people = sqlContext.parquetFile("...")
+        people = sqlContext.read.parquet("...")
 
     Once created, it can be manipulated using the various domain-specific-language
     (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.
@@ -56,8 +56,8 @@ class DataFrame(object):
     A more concrete example::
 
         # To create DataFrame using SQLContext
-        people = sqlContext.parquetFile("...")
-        department = sqlContext.parquetFile("...")
+        people = sqlContext.read.parquet("...")
+        department = sqlContext.read.parquet("...")
 
         people.filter(people.age > 30).join(department, people.deptId == department.id)) \
           .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})
@@ -120,21 +120,12 @@ def toJSON(self, use_unicode=True):
         rdd = self._jdf.toJSON()
         return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode))
 
-    @since(1.3)
     def saveAsParquetFile(self, path):
         """Saves the contents as a Parquet file, preserving the schema.
 
-        Files that are written out using this method can be read back in as
-        a :class:`DataFrame` using :func:`SQLContext.parquetFile`.
-
-        >>> import tempfile, shutil
-        >>> parquetFile = tempfile.mkdtemp()
-        >>> shutil.rmtree(parquetFile)
-        >>> df.saveAsParquetFile(parquetFile)
-        >>> df2 = sqlContext.parquetFile(parquetFile)
-        >>> sorted(df2.collect()) == sorted(df.collect())
-        True
+        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.parquet` instead.
         """
+        warnings.warn("saveAsParquetFile is deprecated. Use write.parquet() instead.")
         self._jdf.saveAsParquetFile(path)
 
     @since(1.3)
@@ -151,69 +142,45 @@ def registerTempTable(self, name):
         """
         self._jdf.registerTempTable(name)
 
-    @since(1.3)
     def registerAsTable(self, name):
-        """DEPRECATED: use :func:`registerTempTable` instead"""
-        warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning)
+        """
+        .. note:: Deprecated in 1.4, use :func:`registerTempTable` instead.
+        """
+        warnings.warn("Use registerTempTable instead of registerAsTable.")
         self.registerTempTable(name)
 
-    @since(1.3)
     def insertInto(self, tableName, overwrite=False):
         """Inserts the contents of this :class:`DataFrame` into the specified table.
 
-        Optionally overwriting any existing data.
+        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.insertInto` instead.
         """
+        warnings.warn("insertInto is deprecated. Use write.insertInto() instead.")
         self.write.insertInto(tableName, overwrite)
 
-    @since(1.3)
     def saveAsTable(self, tableName, source=None, mode="error", **options):
         """Saves the contents of this :class:`DataFrame` to a data source as a table.
 
-        The data source is specified by the ``source`` and a set of ``options``.
-        If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
-
-        Additionally, mode is used to specify the behavior of the saveAsTable operation when
-        table already exists in the data source. There are four modes:
-
-        * `append`: Append contents of this :class:`DataFrame` to existing data.
-        * `overwrite`: Overwrite existing data.
-        * `error`: Throw an exception if data already exists.
-        * `ignore`: Silently ignore this operation if data already exists.
+        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.saveAsTable` instead.
         """
+        warnings.warn("insertInto is deprecated. Use write.saveAsTable() instead.")
         self.write.saveAsTable(tableName, source, mode, **options)
 
     @since(1.3)
     def save(self, path=None, source=None, mode="error", **options):
         """Saves the contents of the :class:`DataFrame` to a data source.
 
-        The data source is specified by the ``source`` and a set of ``options``.
-        If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
-
-        Additionally, mode is used to specify the behavior of the save operation when
-        data already exists in the data source. There are four modes:
-
-        * `append`: Append contents of this :class:`DataFrame` to existing data.
-        * `overwrite`: Overwrite existing data.
-        * `error`: Throw an exception if data already exists.
-        * `ignore`: Silently ignore this operation if data already exists.
+        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.save` instead.
         """
+        warnings.warn("insertInto is deprecated. Use write.save() instead.")
         return self.write.save(path, source, mode, **options)
 
     @property
     @since(1.4)
     def write(self):
         """
-        Interface for saving the content of the :class:`DataFrame` out
-        into external storage.
-
-        :return :class:`DataFrameWriter`
+        Interface for saving the content of the :class:`DataFrame` out into external storage.
 
-        .. note:: Experimental
-
-        >>> df.write
-        <pyspark.sql.readwriter.DataFrameWriter object at ...>
+        :return: :class:`DataFrameWriter`
         """
         return DataFrameWriter(self)
 
@@ -636,6 +603,9 @@ def describe(self, *cols):
         This include count, mean, stddev, min, and max. If no columns are
         given, this function computes statistics for all numerical columns.
 
+        .. note:: This function is meant for exploratory data analysis, as we make no \
+        guarantee about the backward compatibility of the schema of the resulting DataFrame.
+
         >>> df.describe().show()
         +-------+---+
         |summary|age|
@@ -653,9 +623,11 @@ def describe(self, *cols):
     @ignore_unicode_prefix
     @since(1.3)
     def head(self, n=None):
-        """
-        Returns the first ``n`` rows as a list of :class:`Row`,
-        or the first :class:`Row` if ``n`` is ``None.``
+        """Returns the first ``n`` rows.
+
+        :param n: int, default 1. Number of rows to return.
+        :return: If n is greater than 1, return a list of :class:`Row`.
+            If n is 1, return a single Row.
 
         >>> df.head()
         Row(age=2, name=u'Alice')
@@ -1170,8 +1142,8 @@ def freqItems(self, cols, support=None):
         "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".
         :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.
 
-        This function is meant for exploratory data analysis, as we make no guarantee about the
-        backward compatibility of the schema of the resulting DataFrame.
+        .. note::  This function is meant for exploratory data analysis, as we make no \
+        guarantee about the backward compatibility of the schema of the resulting DataFrame.
 
         :param cols: Names of the columns to calculate frequent items for as a list or tuple of
             strings.
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index d17d87419fe3d..f036644acc961 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -45,18 +45,24 @@ def _df(self, jdf):
 
     @since(1.4)
     def format(self, source):
-        """
-        Specifies the input data source format.
+        """Specifies the input data source format.
+
+        :param source: string, name of the data source, e.g. 'json', 'parquet'.
+
+        >>> df = sqlContext.read.format('json').load('python/test_support/sql/people.json')
+        >>> df.dtypes
+        [('age', 'bigint'), ('name', 'string')]
+
         """
         self._jreader = self._jreader.format(source)
         return self
 
     @since(1.4)
     def schema(self, schema):
-        """
-        Specifies the input schema. Some data sources (e.g. JSON) can
-        infer the input schema automatically from data. By specifying
-        the schema here, the underlying data source can skip the schema
+        """Specifies the input schema.
+
+        Some data sources (e.g. JSON) can infer the input schema automatically from data.
+        By specifying the schema here, the underlying data source can skip the schema
         inference step, and thus speed up data loading.
 
         :param schema: a StructType object
@@ -69,8 +75,7 @@ def schema(self, schema):
 
     @since(1.4)
     def options(self, **options):
-        """
-        Adds input options for the underlying data source.
+        """Adds input options for the underlying data source.
         """
         for k in options:
             self._jreader = self._jreader.option(k, options[k])
@@ -84,6 +89,10 @@ def load(self, path=None, format=None, schema=None, **options):
         :param format: optional string for format of the data source. Default to 'parquet'.
         :param schema: optional :class:`StructType` for the input schema.
         :param options: all other string options
+
+        >>> df = sqlContext.read.load('python/test_support/sql/parquet_partitioned')
+        >>> df.dtypes
+        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
         """
         if format is not None:
             self.format(format)
@@ -107,31 +116,10 @@ def json(self, path, schema=None):
         :param path: string, path to the JSON dataset.
         :param schema: an optional :class:`StructType` for the input schema.
 
-        >>> import tempfile, shutil
-        >>> jsonFile = tempfile.mkdtemp()
-        >>> shutil.rmtree(jsonFile)
-        >>> with open(jsonFile, 'w') as f:
-        ...     f.writelines(jsonStrings)
-        >>> df1 = sqlContext.read.json(jsonFile)
-        >>> df1.printSchema()
-        root
-         |-- field1: long (nullable = true)
-         |-- field2: string (nullable = true)
-         |-- field3: struct (nullable = true)
-         |    |-- field4: long (nullable = true)
-
-        >>> from pyspark.sql.types import *
-        >>> schema = StructType([
-        ...     StructField("field2", StringType()),
-        ...     StructField("field3",
-        ...         StructType([StructField("field5", ArrayType(IntegerType()))]))])
-        >>> df2 = sqlContext.read.json(jsonFile, schema)
-        >>> df2.printSchema()
-        root
-         |-- field2: string (nullable = true)
-         |-- field3: struct (nullable = true)
-         |    |-- field5: array (nullable = true)
-         |    |    |-- element: integer (containsNull = true)
+        >>> df = sqlContext.read.json('python/test_support/sql/people.json')
+        >>> df.dtypes
+        [('age', 'bigint'), ('name', 'string')]
+
         """
         if schema is not None:
             self.schema(schema)
@@ -141,10 +129,12 @@ def json(self, path, schema=None):
     def table(self, tableName):
         """Returns the specified table as a :class:`DataFrame`.
 
-        >>> sqlContext.registerDataFrameAsTable(df, "table1")
-        >>> df2 = sqlContext.read.table("table1")
-        >>> sorted(df.collect()) == sorted(df2.collect())
-        True
+        :param tableName: string, name of the table.
+
+        >>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
+        >>> df.registerTempTable('tmpTable')
+        >>> sqlContext.read.table('tmpTable').dtypes
+        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
         """
         return self._df(self._jreader.table(tableName))
 
@@ -152,13 +142,9 @@ def table(self, tableName):
     def parquet(self, *path):
         """Loads a Parquet file, returning the result as a :class:`DataFrame`.
 
-        >>> import tempfile, shutil
-        >>> parquetFile = tempfile.mkdtemp()
-        >>> shutil.rmtree(parquetFile)
-        >>> df.saveAsParquetFile(parquetFile)
-        >>> df2 = sqlContext.read.parquet(parquetFile)
-        >>> sorted(df.collect()) == sorted(df2.collect())
-        True
+        >>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
+        >>> df.dtypes
+        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
         """
         return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))
 
@@ -221,30 +207,34 @@ def __init__(self, df):
 
     @since(1.4)
     def mode(self, saveMode):
-        """
-        Specifies the behavior when data or table already exists. Options include:
+        """Specifies the behavior when data or table already exists.
+
+        Options include:
 
         * `append`: Append contents of this :class:`DataFrame` to existing data.
         * `overwrite`: Overwrite existing data.
         * `error`: Throw an exception if data already exists.
         * `ignore`: Silently ignore this operation if data already exists.
+
+        >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self._jwrite = self._jwrite.mode(saveMode)
         return self
 
     @since(1.4)
     def format(self, source):
-        """
-        Specifies the underlying output data source. Built-in options include
-        "parquet", "json", etc.
+        """Specifies the underlying output data source.
+
+        :param source: string, name of the data source, e.g. 'json', 'parquet'.
+
+        >>> df.write.format('json').save(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self._jwrite = self._jwrite.format(source)
         return self
 
     @since(1.4)
     def options(self, **options):
-        """
-        Adds output options for the underlying data source.
+        """Adds output options for the underlying data source.
         """
         for k in options:
             self._jwrite = self._jwrite.option(k, options[k])
@@ -252,12 +242,14 @@ def options(self, **options):
 
     @since(1.4)
     def partitionBy(self, *cols):
-        """
-        Partitions the output by the given columns on the file system.
+        """Partitions the output by the given columns on the file system.
+
         If specified, the output is laid out on the file system similar
         to Hive's partitioning scheme.
 
         :param cols: name of columns
+
+        >>> df.write.partitionBy('year', 'month').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
             cols = cols[0]
@@ -266,25 +258,23 @@ def partitionBy(self, *cols):
 
     @since(1.4)
     def save(self, path=None, format=None, mode="error", **options):
-        """
-        Saves the contents of the :class:`DataFrame` to a data source.
+        """Saves the contents of the :class:`DataFrame` to a data source.
 
         The data source is specified by the ``format`` and a set of ``options``.
         If ``format`` is not specified, the default data source configured by
         ``spark.sql.sources.default`` will be used.
 
-        Additionally, mode is used to specify the behavior of the save operation when
-        data already exists in the data source. There are four modes:
-
-        * `append`: Append contents of this :class:`DataFrame` to existing data.
-        * `overwrite`: Overwrite existing data.
-        * `error`: Throw an exception if data already exists.
-        * `ignore`: Silently ignore this operation if data already exists.
-
         :param path: the path in a Hadoop supported file system
         :param format: the format used to save
-        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        :param mode: specifies the behavior of the save operation when data already exists.
+
+            * ``append``: Append contents of this :class:`DataFrame` to existing data.
+            * ``overwrite``: Overwrite existing data.
+            * ``ignore``: Silently ignore this operation if data already exists.
+            * ``error`` (default case): Throw an exception if data already exists.
         :param options: all other string options
+
+        >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self.mode(mode).options(**options)
         if format is not None:
@@ -296,8 +286,8 @@ def save(self, path=None, format=None, mode="error", **options):
 
     @since(1.4)
     def insertInto(self, tableName, overwrite=False):
-        """
-        Inserts the content of the :class:`DataFrame` to the specified table.
+        """Inserts the content of the :class:`DataFrame` to the specified table.
+
         It requires that the schema of the class:`DataFrame` is the same as the
         schema of the table.
 
@@ -307,8 +297,7 @@ def insertInto(self, tableName, overwrite=False):
 
     @since(1.4)
     def saveAsTable(self, name, format=None, mode="error", **options):
-        """
-        Saves the content of the :class:`DataFrame` as the specified table.
+        """Saves the content of the :class:`DataFrame` as the specified table.
 
         In the case the table already exists, behavior of this function depends on the
         save mode, specified by the `mode` function (default to throwing an exception).
@@ -328,67 +317,58 @@ def saveAsTable(self, name, format=None, mode="error", **options):
         self.mode(mode).options(**options)
         if format is not None:
             self.format(format)
-        return self._jwrite.saveAsTable(name)
+        self._jwrite.saveAsTable(name)
 
     @since(1.4)
     def json(self, path, mode="error"):
-        """
-        Saves the content of the :class:`DataFrame` in JSON format at the
-        specified path.
+        """Saves the content of the :class:`DataFrame` in JSON format at the specified path.
 
-        Additionally, mode is used to specify the behavior of the save operation when
-        data already exists in the data source. There are four modes:
+        :param path: the path in any Hadoop supported file system
+        :param mode: specifies the behavior of the save operation when data already exists.
 
-        * `append`: Append contents of this :class:`DataFrame` to existing data.
-        * `overwrite`: Overwrite existing data.
-        * `error`: Throw an exception if data already exists.
-        * `ignore`: Silently ignore this operation if data already exists.
+            * ``append``: Append contents of this :class:`DataFrame` to existing data.
+            * ``overwrite``: Overwrite existing data.
+            * ``ignore``: Silently ignore this operation if data already exists.
+            * ``error`` (default case): Throw an exception if data already exists.
 
-        :param path: the path in any Hadoop supported file system
-        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
         """
-        return self._jwrite.mode(mode).json(path)
+        self._jwrite.mode(mode).json(path)
 
     @since(1.4)
     def parquet(self, path, mode="error"):
-        """
-        Saves the content of the :class:`DataFrame` in Parquet format at the
-        specified path.
+        """Saves the content of the :class:`DataFrame` in Parquet format at the specified path.
 
-        Additionally, mode is used to specify the behavior of the save operation when
-        data already exists in the data source. There are four modes:
+        :param path: the path in any Hadoop supported file system
+        :param mode: specifies the behavior of the save operation when data already exists.
 
-        * `append`: Append contents of this :class:`DataFrame` to existing data.
-        * `overwrite`: Overwrite existing data.
-        * `error`: Throw an exception if data already exists.
-        * `ignore`: Silently ignore this operation if data already exists.
+            * ``append``: Append contents of this :class:`DataFrame` to existing data.
+            * ``overwrite``: Overwrite existing data.
+            * ``ignore``: Silently ignore this operation if data already exists.
+            * ``error`` (default case): Throw an exception if data already exists.
 
-        :param path: the path in any Hadoop supported file system
-        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        >>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data'))
         """
-        return self._jwrite.mode(mode).parquet(path)
+        self._jwrite.mode(mode).parquet(path)
 
     @since(1.4)
     def jdbc(self, url, table, mode="error", properties={}):
-        """
-        Saves the content of the :class:`DataFrame` to a external database table
-        via JDBC.
-
-        In the case the table already exists in the external database,
-        behavior of this function depends on the save mode, specified by the `mode`
-        function (default to throwing an exception). There are four modes:
+        """Saves the content of the :class:`DataFrame` to a external database table via JDBC.
 
-        * `append`: Append contents of this :class:`DataFrame` to existing data.
-        * `overwrite`: Overwrite existing data.
-        * `error`: Throw an exception if data already exists.
-        * `ignore`: Silently ignore this operation if data already exists.
+        .. note:: Don't create too many partitions in parallel on a large cluster;\
+        otherwise Spark might crash your external database systems.
 
-        :param url: a JDBC URL of the form `jdbc:subprotocol:subname`
+        :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
         :param table: Name of the table in the external database.
-        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
+        :param mode: specifies the behavior of the save operation when data already exists.
+
+            * ``append``: Append contents of this :class:`DataFrame` to existing data.
+            * ``overwrite``: Overwrite existing data.
+            * ``ignore``: Silently ignore this operation if data already exists.
+            * ``error`` (default case): Throw an exception if data already exists.
         :param properties: JDBC database connection arguments, a list of
-                                    arbitrary string tag/value. Normally at least a
-                                    "user" and "password" property should be included.
+                           arbitrary string tag/value. Normally at least a
+                           "user" and "password" property should be included.
         """
         jprop = JavaClass("java.util.Properties", self._sqlContext._sc._gateway._gateway_client)()
         for k in properties:
@@ -398,24 +378,23 @@ def jdbc(self, url, table, mode="error", properties={}):
 
 def _test():
     import doctest
+    import os
+    import tempfile
     from pyspark.context import SparkContext
     from pyspark.sql import Row, SQLContext
     import pyspark.sql.readwriter
+
+    os.chdir(os.environ["SPARK_HOME"])
+
     globs = pyspark.sql.readwriter.__dict__.copy()
     sc = SparkContext('local[4]', 'PythonTest')
+
+    globs['tempfile'] = tempfile
+    globs['os'] = os
     globs['sc'] = sc
     globs['sqlContext'] = SQLContext(sc)
-    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
-        .toDF(StructType([StructField('age', IntegerType()),
-                          StructField('name', StringType())]))
-    jsonStrings = [
-        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
-        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
-        '"field6":[{"field7": "row2"}]}',
-        '{"field1" : null, "field2": "row3", '
-        '"field3":{"field4":33, "field5": []}}'
-    ]
-    globs['jsonStrings'] = jsonStrings
+    globs['df'] = globs['sqlContext'].read.parquet('python/test_support/sql/parquet_partitioned')
+
     (failure_count, test_count) = doctest.testmod(
         pyspark.sql.readwriter, globs=globs,
         optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 76384d31f1bf4..6e498f0af0af5 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -753,8 +753,10 @@ def setUpClass(cls):
         try:
             cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
         except py4j.protocol.Py4JError:
+            cls.tearDownClass()
             raise unittest.SkipTest("Hive is not available")
         except TypeError:
+            cls.tearDownClass()
             raise unittest.SkipTest("Hive is not available")
         os.unlink(cls.tempdir.name)
         _scala_HiveContext =\
diff --git a/python/test_support/sql/parquet_partitioned/_SUCCESS b/python/test_support/sql/parquet_partitioned/_SUCCESS
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/python/test_support/sql/parquet_partitioned/_common_metadata b/python/test_support/sql/parquet_partitioned/_common_metadata
new file mode 100644
index 0000000000000000000000000000000000000000..7ef2320651dee5f2a4d588ecf8d0b281a2bc3018
GIT binary patch
literal 210
zcmYk1!3u&v5QYcw=+#i_AOju(TauuIw{9J!W6@#L&7^f#ch@4s*Xz03qM*|Z%=dr8
zpKo@l?}W+LRZ<$?0pE+Az!kJ%F~9^uFPsH)sVYKST3i^>Emc>dJ5KD<^~?|@@1$Xd
zmekN-KcIQE3^UY5^@YI%&o$$v#_TZQTWe3Bk^F(Rs4OUY&gqF;!bVwwKPhIzI37m`
brr(!~MnyNKbS*`ck~LYXVg*kC$ZeY!e^o%_

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/_metadata b/python/test_support/sql/parquet_partitioned/_metadata
new file mode 100644
index 0000000000000000000000000000000000000000..78a1ca7d38279147e00a826e627f7514d988bd8e
GIT binary patch
literal 743
zcmb7?&riZI6vyjCA%^RgE^;7EFfke3h76U+ftwc+!pZpKP`3&TWgA`5Oyuq#;D6^o
zW7!Xai3#`)eXs57`{ea~hy9VQD!Or7;$bLM1*p}A0!smz(FOq8iT<e9pqWs@do9au
zo3k(wlis!Ik)&sv5#gfAo0haIJbuS=KVMLxRcdNgcaA|t&q}E!P0{YOkF&}RJnS<+
zT{Iv~o+>~h>;mEB2-`{-EoU3j+6jrYuY)zEJn-EKp==XmwCF#y_WraHO@felu$%|`
z(K_3`IXh{d_L=r}G$4ZdFmoBn%lg_3s`$k}26efUv-!gz5!`pDu$%|Kx;hW}7?X&&
z6N+Ow_$iL(tWW^v;TxV&K|CS|yk8=bL=<&VEcn6|$UrYXWnPTB4@<Pxn!HM#v6bit
zW0@E%7$eV2X2_@1Kt)m1U9MJ7D*#V((KTnh{z`f5he6%O9c*#;0(>g~45h?>03XD`
A@&Et;

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/.part-r-00008.gz.parquet.crc b/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/.part-r-00008.gz.parquet.crc
new file mode 100644
index 0000000000000000000000000000000000000000..e93f42ed6f35018df1ae76c9746b26bd299c0339
GIT binary patch
literal 12
TcmYc;N@ieSU}9KgSR(=e5vKy4

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/part-r-00008.gz.parquet b/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/part-r-00008.gz.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..461c382937ecdf13db1003bd2bf8c9a79c510d3e
GIT binary patch
literal 322
zcmWG=3^EjD5S0?O(-CC?GT1~pWF(j!b27n%7y}T<cXNO!AV^M1NJvOaVVl%vIg?@i
z`I+jxKtWL^NgWTCqWt_4Q6>ga9#tj@mb}E=R8<BBF(yeFNf|W`8BrcdQ#J+;Nr;Rn
zix`twgjkIlhYgxdKv5<JNf`;v{GxQd#Dc`+j8whig2bY1z2d?gJs_j7G_^#pD8F1U
zH?<@&C9xz?BC1-cq_QAY$x5lXq^LBxL`g>}Ei*MIrC7--x>^b35TF#8(m_&~nU@Y!
zm{*#UlbDnPQ~}hQs-pxmRLQEkwl=nwK|&g8rEYGKLRo52ab|v=f}x(7o<UKvF~bj_
J-vfZ52LMLQQ}O@+

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00002.gz.parquet.crc b/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00002.gz.parquet.crc
new file mode 100644
index 0000000000000000000000000000000000000000..b63c4d6d1e1dc71969b22cdd24ccd4f0673b22c0
GIT binary patch
literal 12
TcmYc;N@ieSU}6Z*|K$w;5@-WP

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00004.gz.parquet.crc b/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00004.gz.parquet.crc
new file mode 100644
index 0000000000000000000000000000000000000000..5bc0ebd7135638686a8e064ac02fa673b06228d4
GIT binary patch
literal 12
TcmYc;N@ieSU}D(K7dIOK5$yu~

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00002.gz.parquet b/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00002.gz.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..62a63915beac2840b31ae7f1e8dec1c26b268b89
GIT binary patch
literal 343
zcmWG=3^EjD5LFU&(-CC?GT1~pWF**qGm|qCQ*+=9F$N%z@8$qeK#-i2kdTm;!aix=
z%#)IxD=isX?px~$0wqM5By~Jkit_VIM41>wc~qGsSn?8cQ&kxl#F!*yBxTe%WJGx+
zP1zVYBq1`QEMiPz1!7Ye)i`Y6w!s--Yk|^C43aVun)yZPdWi*z$r-77#RZ8)*?Pr=
zIeI`wVQFfKUQvFzUT$hhVoG93qC`}+Qb}b&s*;sbaY<2Wa*2|TQd(wePD-(oRdlry
z$VEUYFr|Z}Ff%V5s4%ZICnqr}2dDz5HC0CmW~h=?b!}~IErWzK)JomlB89TlqT<Z_
XJOx8NGd+W%WMc+KMh1oeV2}a;7wTKD

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00004.gz.parquet b/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00004.gz.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..67665a7b55da6413d2a534d6f46061e380f70b2e
GIT binary patch
literal 343
zcmYjN!AiqG6x=MK&7qebHe@kiS!kgU7mFwu^w5iVEIoKoglv;-4Q{gOW}}3VFYqS%
z5q^(f<IQc=;9O>4=Dm5m#e6wK01<Q*AZSDnIlk9l@;a9y>4Xam4qu!d8N{n7iq=X0
zb^h^Qf1OXk&E@BCXbc2#aBV9oHG%*Q#?Z5KmhmwFF2p|eCytK>__PNc{No_og>K=#
zSrg}?YwN_m*4PkW-<wLcp{!u>1E*!d)FUmof*P@{xTZ=z(~N7DFwMN%hUmKBBqXI)
zRjf%s)+rZBNy58^>@G6ao`QeDG~bwDUJ1cg!X(Tn56ItA5;kpn-vaO8xAG`cqbIJ)
UROX`@J)_4eJ^_{mz{0%r8w^idvH$=8

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/.part-r-00005.gz.parquet.crc b/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/.part-r-00005.gz.parquet.crc
new file mode 100644
index 0000000000000000000000000000000000000000..ae94a15d08c816a6fb60117395462aa640da5026
GIT binary patch
literal 12
TcmYc;N@ieSU}AV;?~?@p63+t^

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/part-r-00005.gz.parquet b/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/part-r-00005.gz.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..6cb8538aa890475f9128c8025e2e9b0f72b1b785
GIT binary patch
literal 333
zcmZut%SyvQ6di}O1}V6jFoVICffR%|SVR}Wjjr4X#ib%-noMhO^5|ruNXTAs=gv>?
zC;T(PTdm;2vpI)*&V6vFr<W6g$Syex2?o(8u1|Xx(yDAt9s&$s+!X<mSscgnwD<b)
z9K64wt!i_)4P}B^c<z|8v;-qE2rzXDnX3RH4>O<nW7j7GcGO2M`*DclVK4QbHpcGB
zKMTRai1~D{Gz%dsncON-SJK>xdZ`77WuvSx<%7tTm8rCnUbWmlR*FZwwx&re5BWS(
zI<0wh-SX8nV0}~gCzurr2o{aja;6~xtt#ZdLwVG8-A#w+&U)p3ZbtXY)LB`KCgNBe
NnB)+B!ULx8$S(}ES498-

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/.part-r-00007.gz.parquet.crc b/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/.part-r-00007.gz.parquet.crc
new file mode 100644
index 0000000000000000000000000000000000000000..58d9bb5fc5883f0b65b637a8deea2672b9fee8a2
GIT binary patch
literal 12
TcmYc;N@ieSU}C7hE>;2n5}^Yd

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/part-r-00007.gz.parquet b/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/part-r-00007.gz.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..9b00805481e7b311763d48ff965966b4d09daa99
GIT binary patch
literal 343
zcmYjN%SyvQ6ulWjn?*Mw6EYaE45TQ;!6Lc{g1Asz2A7JEX)>*$d37>TB;*6cow)In
z`~?5Ok8oNwcsA$2IrpB+4bQKq7%;_`K1Ny$u;n_#kSm$S%U;-^vHN1JNh6*`Q8Z76
zug0@?@&54%+h==UTiU>g_*bSZON9~Ok%t_!;JNSsY(!k*PAnIX$ngLy^5bCBMs{Vt
z858TYZ|lXTR@(@O>+F|u!Fa{vd%^08%O$H<8Pj6b2*qUi$a0~0!WDOJTB@EZK?7PV
z*~E(abe@VVscCTA()C5!+K~S*m=+5iESfCivrH%SsPO6EQW~^fch`Zl^ILh4%khJd
Vby^nVDLY|@GCl&s00{L<zX6fLTR{K-

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/people.json b/python/test_support/sql/people.json
new file mode 100644
index 0000000000000..50a859cbd7ee8
--- /dev/null
+++ b/python/test_support/sql/people.json
@@ -0,0 +1,3 @@
+{"name":"Michael"}
+{"name":"Andy", "age":30}
+{"name":"Justin", "age":19}

From d38cf217e0c6bfbf451c659675280b43a08bc70f Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Wed, 3 Jun 2015 00:47:52 -0700
Subject: [PATCH 331/525] [SPARK-7562][SPARK-6444][SQL] Improve error reporting
 for expression data type mismatch

It seems hard to find a common pattern of checking types in `Expression`. Sometimes we know what input types we need(like `And`, we know we need two booleans), sometimes we just have some rules(like `Add`, we need 2 numeric types which are equal). So I defined a general interface `checkInputDataTypes` in `Expression` which returns a `TypeCheckResult`. `TypeCheckResult` can tell whether this expression passes the type checking or what the type mismatch is.

This PR mainly works on apply input types checking for arithmetic and predicate expressions.

TODO: apply type checking interface to more expressions.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6405 from cloud-fan/6444 and squashes the following commits:

b5ff31b [Wenchen Fan] address comments
b917275 [Wenchen Fan] rebase
39929d9 [Wenchen Fan] add todo
0808fd2 [Wenchen Fan] make constrcutor of TypeCheckResult private
3bee157 [Wenchen Fan] and decimal type coercion rule for binary comparison
8883025 [Wenchen Fan] apply type check interface to CaseWhen
cffb67c [Wenchen Fan] to have resolved call the data type check function
6eaadff [Wenchen Fan] add equal type constraint to EqualTo
3affbd8 [Wenchen Fan] more fixes
654d46a [Wenchen Fan] improve tests
e0a3628 [Wenchen Fan] improve error message
1524ff6 [Wenchen Fan] fix style
69ca3fe [Wenchen Fan] add error message and tests
c71d02c [Wenchen Fan] fix hive tests
6491721 [Wenchen Fan] use value class TypeCheckResult
7ae76b9 [Wenchen Fan] address comments
cb77e4f [Wenchen Fan] Improve error reporting for expression data type mismatch
---
 .../org/apache/spark/SparkFunSuite.scala      |   4 +-
 .../sql/catalyst/analysis/CheckAnalysis.scala |  12 +-
 .../catalyst/analysis/HiveTypeCoercion.scala  | 132 ++++----
 .../catalyst/analysis/TypeCheckResult.scala   |  45 +++
 .../sql/catalyst/expressions/Expression.scala |  28 +-
 .../sql/catalyst/expressions/arithmetic.scala | 308 +++++++-----------
 .../expressions/mathfuncs/binary.scala        |  17 +-
 .../sql/catalyst/expressions/predicates.scala | 226 ++++++-------
 .../sql/catalyst/optimizer/Optimizer.scala    |   4 +
 .../spark/sql/catalyst/util/DateUtils.scala   |   2 +-
 .../spark/sql/catalyst/util/TypeUtils.scala   |  56 ++++
 .../org/apache/spark/sql/types/DataType.scala |   2 +-
 .../analysis/DecimalPrecisionSuite.scala      |   6 +-
 .../analysis/HiveTypeCoercionSuite.scala      |  15 +-
 .../ExpressionTypeCheckingSuite.scala         | 143 ++++++++
 .../apache/spark/sql/json/InferSchema.scala   |   2 +-
 .../org/apache/spark/sql/json/JsonRDD.scala   |   2 +-
 17 files changed, 583 insertions(+), 421 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCheckResult.scala
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionTypeCheckingSuite.scala

diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
index 8cb344332668f..9be9db01c7de9 100644
--- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
@@ -30,8 +30,8 @@ private[spark] abstract class SparkFunSuite extends FunSuite with Logging {
    * Log the suite name and the test name before and after each test.
    *
    * Subclasses should never override this method. If they wish to run
-   * custom code before and after each test, they should should mix in
-   * the {{org.scalatest.BeforeAndAfter}} trait instead.
+   * custom code before and after each test, they should mix in the
+   * {{org.scalatest.BeforeAndAfter}} trait instead.
    */
   final protected override def withFixture(test: NoArgTest): Outcome = {
     val testName = test.text
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 193dc6b6546b5..c0695ae369421 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -62,15 +62,17 @@ trait CheckAnalysis {
             val from = operator.inputSet.map(_.name).mkString(", ")
             a.failAnalysis(s"cannot resolve '${a.prettyString}' given input columns $from")
 
+          case e: Expression if e.checkInputDataTypes().isFailure =>
+            e.checkInputDataTypes() match {
+              case TypeCheckResult.TypeCheckFailure(message) =>
+                e.failAnalysis(
+                  s"cannot resolve '${e.prettyString}' due to data type mismatch: $message")
+            }
+
           case c: Cast if !c.resolved =>
             failAnalysis(
               s"invalid cast from ${c.child.dataType.simpleString} to ${c.dataType.simpleString}")
 
-          case b: BinaryExpression if !b.resolved =>
-            failAnalysis(
-              s"invalid expression ${b.prettyString} " +
-              s"between ${b.left.dataType.simpleString} and ${b.right.dataType.simpleString}")
-
           case WindowExpression(UnresolvedWindowFunction(name, _), _) =>
             failAnalysis(
               s"Could not resolve window function '$name'. " +
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index edcc918bfe921..b064600e94fac 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -41,7 +41,7 @@ object HiveTypeCoercion {
    * with primitive types, because in that case the precision and scale of the result depends on
    * the operation. Those rules are implemented in [[HiveTypeCoercion.DecimalPrecision]].
    */
-  val findTightestCommonType: (DataType, DataType) => Option[DataType] = {
+  val findTightestCommonTypeOfTwo: (DataType, DataType) => Option[DataType] = {
     case (t1, t2) if t1 == t2 => Some(t1)
     case (NullType, t1) => Some(t1)
     case (t1, NullType) => Some(t1)
@@ -57,6 +57,17 @@ object HiveTypeCoercion {
 
     case _ => None
   }
+
+  /**
+   * Find the tightest common type of a set of types by continuously applying
+   * `findTightestCommonTypeOfTwo` on these types.
+   */
+  private def findTightestCommonType(types: Seq[DataType]) = {
+    types.foldLeft[Option[DataType]](Some(NullType))((r, c) => r match {
+      case None => None
+      case Some(d) => findTightestCommonTypeOfTwo(d, c)
+    })
+  }
 }
 
 /**
@@ -180,7 +191,7 @@ trait HiveTypeCoercion {
 
           case (l, r) if l.dataType != r.dataType =>
             logDebug(s"Resolving mismatched union input ${l.dataType}, ${r.dataType}")
-            findTightestCommonType(l.dataType, r.dataType).map { widestType =>
+            findTightestCommonTypeOfTwo(l.dataType, r.dataType).map { widestType =>
               val newLeft =
                 if (l.dataType == widestType) l else Alias(Cast(l, widestType), l.name)()
               val newRight =
@@ -217,7 +228,7 @@ trait HiveTypeCoercion {
         case e if !e.childrenResolved => e
 
         case b: BinaryExpression if b.left.dataType != b.right.dataType =>
-          findTightestCommonType(b.left.dataType, b.right.dataType).map { widestType =>
+          findTightestCommonTypeOfTwo(b.left.dataType, b.right.dataType).map { widestType =>
             val newLeft =
               if (b.left.dataType == widestType) b.left else Cast(b.left, widestType)
             val newRight =
@@ -441,21 +452,18 @@ trait HiveTypeCoercion {
             DecimalType(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
           )
 
-        case LessThan(e1 @ DecimalType.Expression(p1, s1),
-                      e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 =>
-          LessThan(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited))
-
-        case LessThanOrEqual(e1 @ DecimalType.Expression(p1, s1),
-                             e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 =>
-          LessThanOrEqual(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited))
-
-        case GreaterThan(e1 @ DecimalType.Expression(p1, s1),
-                         e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 =>
-          GreaterThan(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited))
-
-        case GreaterThanOrEqual(e1 @ DecimalType.Expression(p1, s1),
-                                e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 =>
-          GreaterThanOrEqual(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited))
+        // When we compare 2 decimal types with different precisions, cast them to the smallest
+        // common precision.
+        case b @ BinaryComparison(e1 @ DecimalType.Expression(p1, s1),
+                                  e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 =>
+          val resultType = DecimalType(max(p1, p2), max(s1, s2))
+          b.makeCopy(Array(Cast(e1, resultType), Cast(e2, resultType)))
+        case b @ BinaryComparison(e1 @ DecimalType.Fixed(_, _), e2)
+          if e2.dataType == DecimalType.Unlimited =>
+          b.makeCopy(Array(Cast(e1, DecimalType.Unlimited), e2))
+        case b @ BinaryComparison(e1, e2 @ DecimalType.Fixed(_, _))
+          if e1.dataType == DecimalType.Unlimited =>
+          b.makeCopy(Array(e1, Cast(e2, DecimalType.Unlimited)))
 
         // Promote integers inside a binary expression with fixed-precision decimals to decimals,
         // and fixed-precision decimals in an expression with floats / doubles to doubles
@@ -570,7 +578,7 @@ trait HiveTypeCoercion {
 
       case a @ CreateArray(children) if !a.resolved =>
         val commonType = a.childTypes.reduce(
-          (a, b) => findTightestCommonType(a, b).getOrElse(StringType))
+          (a, b) => findTightestCommonTypeOfTwo(a, b).getOrElse(StringType))
         CreateArray(
           children.map(c => if (c.dataType == commonType) c else Cast(c, commonType)))
 
@@ -599,14 +607,9 @@ trait HiveTypeCoercion {
       // from the list. So we need to make sure the return type is deterministic and
       // compatible with every child column.
       case Coalesce(es) if es.map(_.dataType).distinct.size > 1 =>
-        val dt: Option[DataType] = Some(NullType)
         val types = es.map(_.dataType)
-        val rt = types.foldLeft(dt)((r, c) => r match {
-          case None => None
-          case Some(d) => findTightestCommonType(d, c)
-        })
-        rt match {
-          case Some(finaldt) => Coalesce(es.map(Cast(_, finaldt)))
+        findTightestCommonType(types) match {
+          case Some(finalDataType) => Coalesce(es.map(Cast(_, finalDataType)))
           case None =>
             sys.error(s"Could not determine return type of Coalesce for ${types.mkString(",")}")
         }
@@ -619,17 +622,13 @@ trait HiveTypeCoercion {
    */
   object Division extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
-      // Skip nodes who's children have not been resolved yet.
-      case e if !e.childrenResolved => e
+      // Skip nodes who has not been resolved yet,
+      // as this is an extra rule which should be applied at last.
+      case e if !e.resolved => e
 
       // Decimal and Double remain the same
-      case d: Divide if d.resolved && d.dataType == DoubleType => d
-      case d: Divide if d.resolved && d.dataType.isInstanceOf[DecimalType] => d
-
-      case Divide(l, r) if l.dataType.isInstanceOf[DecimalType] =>
-        Divide(l, Cast(r, DecimalType.Unlimited))
-      case Divide(l, r) if r.dataType.isInstanceOf[DecimalType] =>
-        Divide(Cast(l, DecimalType.Unlimited), r)
+      case d: Divide if d.dataType == DoubleType => d
+      case d: Divide if d.dataType.isInstanceOf[DecimalType] => d
 
       case Divide(l, r) => Divide(Cast(l, DoubleType), Cast(r, DoubleType))
     }
@@ -642,42 +641,33 @@ trait HiveTypeCoercion {
     import HiveTypeCoercion._
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
-      case cw: CaseWhenLike if cw.childrenResolved && !cw.valueTypesEqual =>
-        logDebug(s"Input values for null casting ${cw.valueTypes.mkString(",")}")
-        val commonType = cw.valueTypes.reduce { (v1, v2) =>
-          findTightestCommonType(v1, v2).getOrElse(sys.error(
-            s"Types in CASE WHEN must be the same or coercible to a common type: $v1 != $v2"))
-        }
-        val transformedBranches = cw.branches.sliding(2, 2).map {
-          case Seq(when, value) if value.dataType != commonType =>
-            Seq(when, Cast(value, commonType))
-          case Seq(elseVal) if elseVal.dataType != commonType =>
-            Seq(Cast(elseVal, commonType))
-          case s => s
-        }.reduce(_ ++ _)
-        cw match {
-          case _: CaseWhen =>
-            CaseWhen(transformedBranches)
-          case CaseKeyWhen(key, _) =>
-            CaseKeyWhen(key, transformedBranches)
-        }
-
-      case ckw: CaseKeyWhen if ckw.childrenResolved && !ckw.resolved =>
-        val commonType = (ckw.key +: ckw.whenList).map(_.dataType).reduce { (v1, v2) =>
-          findTightestCommonType(v1, v2).getOrElse(sys.error(
-            s"Types in CASE WHEN must be the same or coercible to a common type: $v1 != $v2"))
-        }
-        val transformedBranches = ckw.branches.sliding(2, 2).map {
-          case Seq(when, then) if when.dataType != commonType =>
-            Seq(Cast(when, commonType), then)
-          case s => s
-        }.reduce(_ ++ _)
-        val transformedKey = if (ckw.key.dataType != commonType) {
-          Cast(ckw.key, commonType)
-        } else {
-          ckw.key
-        }
-        CaseKeyWhen(transformedKey, transformedBranches)
+      case c: CaseWhenLike if c.childrenResolved && !c.valueTypesEqual =>
+        logDebug(s"Input values for null casting ${c.valueTypes.mkString(",")}")
+        val maybeCommonType = findTightestCommonType(c.valueTypes)
+        maybeCommonType.map { commonType =>
+          val castedBranches = c.branches.grouped(2).map {
+            case Seq(when, value) if value.dataType != commonType =>
+              Seq(when, Cast(value, commonType))
+            case Seq(elseVal) if elseVal.dataType != commonType =>
+              Seq(Cast(elseVal, commonType))
+            case other => other
+          }.reduce(_ ++ _)
+          c match {
+            case _: CaseWhen => CaseWhen(castedBranches)
+            case CaseKeyWhen(key, _) => CaseKeyWhen(key, castedBranches)
+          }
+        }.getOrElse(c)
+
+      case c: CaseKeyWhen if c.childrenResolved && !c.resolved =>
+        val maybeCommonType = findTightestCommonType((c.key +: c.whenList).map(_.dataType))
+        maybeCommonType.map { commonType =>
+          val castedBranches = c.branches.grouped(2).map {
+            case Seq(when, then) if when.dataType != commonType =>
+              Seq(Cast(when, commonType), then)
+            case other => other
+          }.reduce(_ ++ _)
+          CaseKeyWhen(Cast(c.key, commonType), castedBranches)
+        }.getOrElse(c)
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCheckResult.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCheckResult.scala
new file mode 100644
index 0000000000000..79c3528a522d3
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCheckResult.scala
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.analysis
+
+/**
+ * Represents the result of `Expression.checkInputDataTypes`.
+ * We will throw `AnalysisException` in `CheckAnalysis` if `isFailure` is true.
+ */
+trait TypeCheckResult {
+  def isFailure: Boolean = !isSuccess
+  def isSuccess: Boolean
+}
+
+object TypeCheckResult {
+
+  /**
+   * Represents the successful result of `Expression.checkInputDataTypes`.
+   */
+  object TypeCheckSuccess extends TypeCheckResult {
+    def isSuccess: Boolean = true
+  }
+
+  /**
+   * Represents the failing result of `Expression.checkInputDataTypes`,
+   * with a error message to show the reason of failure.
+   */
+  case class TypeCheckFailure(message: String) extends TypeCheckResult {
+    def isSuccess: Boolean = false
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index adc6505d69cdf..3cf851aec15ea 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types._
@@ -53,11 +53,12 @@ abstract class Expression extends TreeNode[Expression] {
 
   /**
    * Returns `true` if this expression and all its children have been resolved to a specific schema
-   * and `false` if it still contains any unresolved placeholders. Implementations of expressions
-   * should override this if the resolution of this type of expression involves more than just
-   * the resolution of its children.
+   * and input data types checking passed, and `false` if it still contains any unresolved
+   * placeholders or has data types mismatch.
+   * Implementations of expressions should override this if the resolution of this type of
+   * expression involves more than just the resolution of its children and type checking.
    */
-  lazy val resolved: Boolean = childrenResolved
+  lazy val resolved: Boolean = childrenResolved && checkInputDataTypes().isSuccess
 
   /**
    * Returns the [[DataType]] of the result of evaluating this expression.  It is
@@ -94,12 +95,21 @@ abstract class Expression extends TreeNode[Expression] {
       case (i1, i2) => i1 == i2
     }
   }
+
+  /**
+   * Checks the input data types, returns `TypeCheckResult.success` if it's valid,
+   * or returns a `TypeCheckResult` with an error message if invalid.
+   * Note: it's not valid to call this method until `childrenResolved == true`
+   * TODO: we should remove the default implementation and implement it for all
+   * expressions with proper error message.
+   */
+  def checkInputDataTypes(): TypeCheckResult = TypeCheckResult.TypeCheckSuccess
 }
 
 abstract class BinaryExpression extends Expression with trees.BinaryNode[Expression] {
   self: Product =>
 
-  def symbol: String
+  def symbol: String = sys.error(s"BinaryExpressions must override either toString or symbol")
 
   override def foldable: Boolean = left.foldable && right.foldable
 
@@ -133,7 +143,13 @@ case class GroupExpression(children: Seq[Expression]) extends Expression {
  * so that the proper type conversions can be performed in the analyzer.
  */
 trait ExpectsInputTypes {
+  self: Expression =>
 
   def expectedChildTypes: Seq[DataType]
 
+  override def checkInputDataTypes(): TypeCheckResult = {
+    // We will always do type casting for `ExpectsInputTypes` in `HiveTypeCoercion`,
+    // so type mismatch error won't be reported here, but for underling `Cast`s.
+    TypeCheckResult.TypeCheckSuccess
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index f2299d5db6e9f..2ac53f8f6613f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -17,72 +17,89 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.analysis.UnresolvedException
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
-case class UnaryMinus(child: Expression) extends UnaryExpression {
+abstract class UnaryArithmetic extends UnaryExpression {
+  self: Product =>
 
-  override def dataType: DataType = child.dataType
   override def foldable: Boolean = child.foldable
   override def nullable: Boolean = child.nullable
-  override def toString: String = s"-$child"
-
-  lazy val numeric = dataType match {
-    case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]]
-    case other => sys.error(s"Type $other does not support numeric operations")
-  }
+  override def dataType: DataType = child.dataType
 
   override def eval(input: Row): Any = {
     val evalE = child.eval(input)
     if (evalE == null) {
       null
     } else {
-      numeric.negate(evalE)
+      evalInternal(evalE)
     }
   }
+
+  protected def evalInternal(evalE: Any): Any =
+    sys.error(s"UnaryArithmetics must override either eval or evalInternal")
 }
 
-case class Sqrt(child: Expression) extends UnaryExpression {
+case class UnaryMinus(child: Expression) extends UnaryArithmetic {
+  override def toString: String = s"-$child"
+
+  override def checkInputDataTypes(): TypeCheckResult =
+    TypeUtils.checkForNumericExpr(child.dataType, "operator -")
 
+  private lazy val numeric = TypeUtils.getNumeric(dataType)
+
+  protected override def evalInternal(evalE: Any) = numeric.negate(evalE)
+}
+
+case class Sqrt(child: Expression) extends UnaryArithmetic {
   override def dataType: DataType = DoubleType
-  override def foldable: Boolean = child.foldable
   override def nullable: Boolean = true
   override def toString: String = s"SQRT($child)"
 
-  lazy val numeric = child.dataType match {
-    case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]]
-    case other => sys.error(s"Type $other does not support non-negative numeric operations")
-  }
+  override def checkInputDataTypes(): TypeCheckResult =
+    TypeUtils.checkForNumericExpr(child.dataType, "function sqrt")
 
-  override def eval(input: Row): Any = {
-    val evalE = child.eval(input)
-    if (evalE == null) {
-      null
-    } else {
-      val value = numeric.toDouble(evalE)
-      if (value < 0) null
-      else math.sqrt(value)
-    }
+  private lazy val numeric = TypeUtils.getNumeric(child.dataType)
+
+  protected override def evalInternal(evalE: Any) = {
+    val value = numeric.toDouble(evalE)
+    if (value < 0) null
+    else math.sqrt(value)
   }
 }
 
+/**
+ * A function that get the absolute value of the numeric value.
+ */
+case class Abs(child: Expression) extends UnaryArithmetic {
+  override def toString: String = s"Abs($child)"
+
+  override def checkInputDataTypes(): TypeCheckResult =
+    TypeUtils.checkForNumericExpr(child.dataType, "function abs")
+
+  private lazy val numeric = TypeUtils.getNumeric(dataType)
+
+  protected override def evalInternal(evalE: Any) = numeric.abs(evalE)
+}
+
 abstract class BinaryArithmetic extends BinaryExpression {
   self: Product =>
 
-  override lazy val resolved =
-    left.resolved && right.resolved &&
-    left.dataType == right.dataType &&
-    !DecimalType.isFixed(left.dataType)
-
-  override def dataType: DataType = {
-    if (!resolved) {
-      throw new UnresolvedException(this,
-        s"datatype. Can not resolve due to differing types ${left.dataType}, ${right.dataType}")
+  override def dataType: DataType = left.dataType
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (left.dataType != right.dataType) {
+      TypeCheckResult.TypeCheckFailure(
+        s"differing types in ${this.getClass.getSimpleName} " +
+        s"(${left.dataType} and ${right.dataType}).")
+    } else {
+      checkTypesInternal(dataType)
     }
-    left.dataType
   }
 
+  protected def checkTypesInternal(t: DataType): TypeCheckResult
+
   override def eval(input: Row): Any = {
     val evalE1 = left.eval(input)
     if(evalE1 == null) {
@@ -97,88 +114,65 @@ abstract class BinaryArithmetic extends BinaryExpression {
     }
   }
 
-  def evalInternal(evalE1: Any, evalE2: Any): Any =
-    sys.error(s"BinaryExpressions must either override eval or evalInternal")
+  protected def evalInternal(evalE1: Any, evalE2: Any): Any =
+    sys.error(s"BinaryArithmetics must override either eval or evalInternal")
 }
 
 case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "+"
 
-  lazy val numeric = dataType match {
-    case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]]
-    case other => sys.error(s"Type $other does not support numeric operations")
-  }
+  override lazy val resolved =
+    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if(evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        numeric.plus(evalE1, evalE2)
-      }
-    }
-  }
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
+
+  private lazy val numeric = TypeUtils.getNumeric(dataType)
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = numeric.plus(evalE1, evalE2)
 }
 
 case class Subtract(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "-"
 
-  lazy val numeric = dataType match {
-    case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]]
-    case other => sys.error(s"Type $other does not support numeric operations")
-  }
+  override lazy val resolved =
+    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if(evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        numeric.minus(evalE1, evalE2)
-      }
-    }
-  }
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
+
+  private lazy val numeric = TypeUtils.getNumeric(dataType)
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = numeric.minus(evalE1, evalE2)
 }
 
 case class Multiply(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "*"
 
-  lazy val numeric = dataType match {
-    case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]]
-    case other => sys.error(s"Type $other does not support numeric operations")
-  }
+  override lazy val resolved =
+    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if(evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        numeric.times(evalE1, evalE2)
-      }
-    }
-  }
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
+
+  private lazy val numeric = TypeUtils.getNumeric(dataType)
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = numeric.times(evalE1, evalE2)
 }
 
 case class Divide(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "/"
-
   override def nullable: Boolean = true
 
-  lazy val div: (Any, Any) => Any = dataType match {
+  override lazy val resolved =
+    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
+
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
+
+  private lazy val div: (Any, Any) => Any = dataType match {
     case ft: FractionalType => ft.fractional.asInstanceOf[Fractional[Any]].div
     case it: IntegralType => it.integral.asInstanceOf[Integral[Any]].quot
-    case other => sys.error(s"Type $other does not support numeric operations")
   }
 
   override def eval(input: Row): Any = {
@@ -198,13 +192,17 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
 
 case class Remainder(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "%"
-
   override def nullable: Boolean = true
 
-  lazy val integral = dataType match {
+  override lazy val resolved =
+    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
+
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
+
+  private lazy val integral = dataType match {
     case i: IntegralType => i.integral.asInstanceOf[Integral[Any]]
     case i: FractionalType => i.asIntegral.asInstanceOf[Integral[Any]]
-    case other => sys.error(s"Type $other does not support numeric operations")
   }
 
   override def eval(input: Row): Any = {
@@ -228,7 +226,10 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
 case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "&"
 
-  lazy val and: (Any, Any) => Any = dataType match {
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+
+  private lazy val and: (Any, Any) => Any = dataType match {
     case ByteType =>
       ((evalE1: Byte, evalE2: Byte) => (evalE1 & evalE2).toByte).asInstanceOf[(Any, Any) => Any]
     case ShortType =>
@@ -237,10 +238,9 @@ case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithme
       ((evalE1: Int, evalE2: Int) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any]
     case LongType =>
       ((evalE1: Long, evalE2: Long) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any]
-    case other => sys.error(s"Unsupported bitwise & operation on $other")
   }
 
-  override def evalInternal(evalE1: Any, evalE2: Any): Any = and(evalE1, evalE2)
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = and(evalE1, evalE2)
 }
 
 /**
@@ -249,7 +249,10 @@ case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithme
 case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "|"
 
-  lazy val or: (Any, Any) => Any = dataType match {
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+
+  private lazy val or: (Any, Any) => Any = dataType match {
     case ByteType =>
       ((evalE1: Byte, evalE2: Byte) => (evalE1 | evalE2).toByte).asInstanceOf[(Any, Any) => Any]
     case ShortType =>
@@ -258,10 +261,9 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
       ((evalE1: Int, evalE2: Int) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any]
     case LongType =>
       ((evalE1: Long, evalE2: Long) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any]
-    case other => sys.error(s"Unsupported bitwise | operation on $other")
   }
 
-  override def evalInternal(evalE1: Any, evalE2: Any): Any = or(evalE1, evalE2)
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = or(evalE1, evalE2)
 }
 
 /**
@@ -270,7 +272,10 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
 case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "^"
 
-  lazy val xor: (Any, Any) => Any = dataType match {
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+
+  private lazy val xor: (Any, Any) => Any = dataType match {
     case ByteType =>
       ((evalE1: Byte, evalE2: Byte) => (evalE1 ^ evalE2).toByte).asInstanceOf[(Any, Any) => Any]
     case ShortType =>
@@ -279,23 +284,21 @@ case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithme
       ((evalE1: Int, evalE2: Int) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any]
     case LongType =>
       ((evalE1: Long, evalE2: Long) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any]
-    case other => sys.error(s"Unsupported bitwise ^ operation on $other")
   }
 
-  override def evalInternal(evalE1: Any, evalE2: Any): Any = xor(evalE1, evalE2)
+  protected override def evalInternal(evalE1: Any, evalE2: Any): Any = xor(evalE1, evalE2)
 }
 
 /**
  * A function that calculates bitwise not(~) of a number.
  */
-case class BitwiseNot(child: Expression) extends UnaryExpression {
-
-  override def dataType: DataType = child.dataType
-  override def foldable: Boolean = child.foldable
-  override def nullable: Boolean = child.nullable
+case class BitwiseNot(child: Expression) extends UnaryArithmetic {
   override def toString: String = s"~$child"
 
-  lazy val not: (Any) => Any = dataType match {
+  override def checkInputDataTypes(): TypeCheckResult =
+    TypeUtils.checkForBitwiseExpr(child.dataType, "operator ~")
+
+  private lazy val not: (Any) => Any = dataType match {
     case ByteType =>
       ((evalE: Byte) => (~evalE).toByte).asInstanceOf[(Any) => Any]
     case ShortType =>
@@ -304,43 +307,18 @@ case class BitwiseNot(child: Expression) extends UnaryExpression {
       ((evalE: Int) => ~evalE).asInstanceOf[(Any) => Any]
     case LongType =>
       ((evalE: Long) => ~evalE).asInstanceOf[(Any) => Any]
-    case other => sys.error(s"Unsupported bitwise ~ operation on $other")
   }
 
-  override def eval(input: Row): Any = {
-    val evalE = child.eval(input)
-    if (evalE == null) {
-      null
-    } else {
-      not(evalE)
-    }
-  }
+  protected override def evalInternal(evalE: Any) = not(evalE)
 }
 
-case class MaxOf(left: Expression, right: Expression) extends Expression {
-
-  override def foldable: Boolean = left.foldable && right.foldable
-
+case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
   override def nullable: Boolean = left.nullable && right.nullable
 
-  override def children: Seq[Expression] = left :: right :: Nil
-
-  override lazy val resolved =
-    left.resolved && right.resolved &&
-    left.dataType == right.dataType
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForOrderingExpr(t, "function maxOf")
 
-  override def dataType: DataType = {
-    if (!resolved) {
-      throw new UnresolvedException(this,
-        s"datatype. Can not resolve due to differing types ${left.dataType}, ${right.dataType}")
-    }
-    left.dataType
-  }
-
-  lazy val ordering = left.dataType match {
-    case i: AtomicType => i.ordering.asInstanceOf[Ordering[Any]]
-    case other => sys.error(s"Type $other does not support ordered operations")
-  }
+  private lazy val ordering = TypeUtils.getOrdering(dataType)
 
   override def eval(input: Row): Any = {
     val evalE1 = left.eval(input)
@@ -361,30 +339,13 @@ case class MaxOf(left: Expression, right: Expression) extends Expression {
   override def toString: String = s"MaxOf($left, $right)"
 }
 
-case class MinOf(left: Expression, right: Expression) extends Expression {
-
-  override def foldable: Boolean = left.foldable && right.foldable
-
+case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
   override def nullable: Boolean = left.nullable && right.nullable
 
-  override def children: Seq[Expression] = left :: right :: Nil
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForOrderingExpr(t, "function minOf")
 
-  override lazy val resolved =
-    left.resolved && right.resolved &&
-    left.dataType == right.dataType
-
-  override def dataType: DataType = {
-    if (!resolved) {
-      throw new UnresolvedException(this,
-        s"datatype. Can not resolve due to differing types ${left.dataType}, ${right.dataType}")
-    }
-    left.dataType
-  }
-
-  lazy val ordering = left.dataType match {
-    case i: AtomicType => i.ordering.asInstanceOf[Ordering[Any]]
-    case other => sys.error(s"Type $other does not support ordered operations")
-  }
+  private lazy val ordering = TypeUtils.getOrdering(dataType)
 
   override def eval(input: Row): Any = {
     val evalE1 = left.eval(input)
@@ -404,28 +365,3 @@ case class MinOf(left: Expression, right: Expression) extends Expression {
 
   override def toString: String = s"MinOf($left, $right)"
 }
-
-/**
- * A function that get the absolute value of the numeric value.
- */
-case class Abs(child: Expression) extends UnaryExpression  {
-
-  override def dataType: DataType = child.dataType
-  override def foldable: Boolean = child.foldable
-  override def nullable: Boolean = child.nullable
-  override def toString: String = s"Abs($child)"
-
-  lazy val numeric = dataType match {
-    case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]]
-    case other => sys.error(s"Type $other does not support numeric operations")
-  }
-
-  override def eval(input: Row): Any = {
-    val evalE = child.eval(input)
-    if (evalE == null) {
-      null
-    } else {
-      numeric.abs(evalE)
-    }
-  }
-}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
index 01f62ba0442e9..db853a2b97fad 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
@@ -29,17 +29,10 @@ import org.apache.spark.sql.types._
 abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
   extends BinaryExpression with Serializable with ExpectsInputTypes { self: Product =>
 
-  override def symbol: String = null
   override def expectedChildTypes: Seq[DataType] = Seq(DoubleType, DoubleType)
 
-  override def nullable: Boolean = left.nullable || right.nullable
   override def toString: String = s"$name($left, $right)"
 
-  override lazy val resolved =
-    left.resolved && right.resolved &&
-      left.dataType == right.dataType &&
-      !DecimalType.isFixed(left.dataType)
-
   override def dataType: DataType = DoubleType
 
   override def eval(input: Row): Any = {
@@ -58,9 +51,8 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
   }
 }
 
-case class Atan2(
-    left: Expression,
-    right: Expression) extends BinaryMathExpression(math.atan2, "ATAN2") {
+case class Atan2(left: Expression, right: Expression)
+  extends BinaryMathExpression(math.atan2, "ATAN2") {
 
   override def eval(input: Row): Any = {
     val evalE1 = left.eval(input)
@@ -80,8 +72,7 @@ case class Atan2(
   }
 }
 
-case class Hypot(
-    left: Expression,
-    right: Expression) extends BinaryMathExpression(math.hypot, "HYPOT")
+case class Hypot(left: Expression, right: Expression)
+  extends BinaryMathExpression(math.hypot, "HYPOT")
 
 case class Pow(left: Expression, right: Expression) extends BinaryMathExpression(math.pow, "POWER")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 4f422d69c4382..807021d50e8e0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.analysis.UnresolvedException
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.types.{DataType, BinaryType, BooleanType, AtomicType}
+import org.apache.spark.sql.catalyst.util.TypeUtils
+import org.apache.spark.sql.types.{BinaryType, BooleanType, DataType}
 
 object InterpretedPredicate {
   def create(expression: Expression, inputSchema: Seq[Attribute]): (Row => Boolean) =
@@ -171,22 +171,51 @@ case class Or(left: Expression, right: Expression)
 
 abstract class BinaryComparison extends BinaryExpression with Predicate {
   self: Product =>
-}
 
-case class EqualTo(left: Expression, right: Expression) extends BinaryComparison {
-  override def symbol: String = "="
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (left.dataType != right.dataType) {
+      TypeCheckResult.TypeCheckFailure(
+        s"differing types in ${this.getClass.getSimpleName} " +
+        s"(${left.dataType} and ${right.dataType}).")
+    } else {
+      checkTypesInternal(dataType)
+    }
+  }
+
+  protected def checkTypesInternal(t: DataType): TypeCheckResult
 
   override def eval(input: Row): Any = {
-    val l = left.eval(input)
-    if (l == null) {
+    val evalE1 = left.eval(input)
+    if (evalE1 == null) {
       null
     } else {
-      val r = right.eval(input)
-      if (r == null) null
-      else if (left.dataType != BinaryType) l == r
-      else java.util.Arrays.equals(l.asInstanceOf[Array[Byte]], r.asInstanceOf[Array[Byte]])
+      val evalE2 = right.eval(input)
+      if (evalE2 == null) {
+        null
+      } else {
+        evalInternal(evalE1, evalE2)
+      }
     }
   }
+
+  protected def evalInternal(evalE1: Any, evalE2: Any): Any =
+    sys.error(s"BinaryComparisons must override either eval or evalInternal")
+}
+
+object BinaryComparison {
+  def unapply(b: BinaryComparison): Option[(Expression, Expression)] =
+    Some((b.left, b.right))
+}
+
+case class EqualTo(left: Expression, right: Expression) extends BinaryComparison {
+  override def symbol: String = "="
+
+  override protected def checkTypesInternal(t: DataType) = TypeCheckResult.TypeCheckSuccess
+
+  protected override def evalInternal(l: Any, r: Any) = {
+    if (left.dataType != BinaryType) l == r
+    else java.util.Arrays.equals(l.asInstanceOf[Array[Byte]], r.asInstanceOf[Array[Byte]])
+  }
 }
 
 case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComparison {
@@ -194,6 +223,8 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
 
   override def nullable: Boolean = false
 
+  override protected def checkTypesInternal(t: DataType) = TypeCheckResult.TypeCheckSuccess
+
   override def eval(input: Row): Any = {
     val l = left.eval(input)
     val r = right.eval(input)
@@ -210,117 +241,45 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
 case class LessThan(left: Expression, right: Expression) extends BinaryComparison {
   override def symbol: String = "<"
 
-  lazy val ordering: Ordering[Any] = {
-    if (left.dataType != right.dataType) {
-      throw new TreeNodeException(this,
-        s"Types do not match ${left.dataType} != ${right.dataType}")
-    }
-    left.dataType match {
-      case i: AtomicType => i.ordering.asInstanceOf[Ordering[Any]]
-      case other => sys.error(s"Type $other does not support ordered operations")
-    }
-  }
+  override protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForOrderingExpr(left.dataType, "operator " + symbol)
 
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        ordering.lt(evalE1, evalE2)
-      }
-    }
-  }
+  private lazy val ordering = TypeUtils.getOrdering(left.dataType)
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.lt(evalE1, evalE2)
 }
 
 case class LessThanOrEqual(left: Expression, right: Expression) extends BinaryComparison {
   override def symbol: String = "<="
 
-  lazy val ordering: Ordering[Any] = {
-    if (left.dataType != right.dataType) {
-      throw new TreeNodeException(this,
-        s"Types do not match ${left.dataType} != ${right.dataType}")
-    }
-    left.dataType match {
-      case i: AtomicType => i.ordering.asInstanceOf[Ordering[Any]]
-      case other => sys.error(s"Type $other does not support ordered operations")
-    }
-  }
+  override protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForOrderingExpr(left.dataType, "operator " + symbol)
 
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        ordering.lteq(evalE1, evalE2)
-      }
-    }
-  }
+  private lazy val ordering = TypeUtils.getOrdering(left.dataType)
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.lteq(evalE1, evalE2)
 }
 
 case class GreaterThan(left: Expression, right: Expression) extends BinaryComparison {
   override def symbol: String = ">"
 
-  lazy val ordering: Ordering[Any] = {
-    if (left.dataType != right.dataType) {
-      throw new TreeNodeException(this,
-        s"Types do not match ${left.dataType} != ${right.dataType}")
-    }
-    left.dataType match {
-      case i: AtomicType => i.ordering.asInstanceOf[Ordering[Any]]
-      case other => sys.error(s"Type $other does not support ordered operations")
-    }
-  }
+  override protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForOrderingExpr(left.dataType, "operator " + symbol)
 
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if(evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        ordering.gt(evalE1, evalE2)
-      }
-    }
-  }
+  private lazy val ordering = TypeUtils.getOrdering(left.dataType)
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.gt(evalE1, evalE2)
 }
 
 case class GreaterThanOrEqual(left: Expression, right: Expression) extends BinaryComparison {
   override def symbol: String = ">="
 
-  lazy val ordering: Ordering[Any] = {
-    if (left.dataType != right.dataType) {
-      throw new TreeNodeException(this,
-        s"Types do not match ${left.dataType} != ${right.dataType}")
-    }
-    left.dataType match {
-      case i: AtomicType => i.ordering.asInstanceOf[Ordering[Any]]
-      case other => sys.error(s"Type $other does not support ordered operations")
-    }
-  }
+  override protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForOrderingExpr(left.dataType, "operator " + symbol)
 
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        ordering.gteq(evalE1, evalE2)
-      }
-    }
-  }
+  private lazy val ordering = TypeUtils.getOrdering(left.dataType)
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.gteq(evalE1, evalE2)
 }
 
 case class If(predicate: Expression, trueValue: Expression, falseValue: Expression)
@@ -329,16 +288,20 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
   override def children: Seq[Expression] = predicate :: trueValue :: falseValue :: Nil
   override def nullable: Boolean = trueValue.nullable || falseValue.nullable
 
-  override lazy val resolved = childrenResolved && trueValue.dataType == falseValue.dataType
-  override def dataType: DataType = {
-    if (!resolved) {
-      throw new UnresolvedException(
-        this,
-        s"Can not resolve due to differing types ${trueValue.dataType}, ${falseValue.dataType}")
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (predicate.dataType != BooleanType) {
+      TypeCheckResult.TypeCheckFailure(
+        s"type of predicate expression in If should be boolean, not ${predicate.dataType}")
+    } else if (trueValue.dataType != falseValue.dataType) {
+      TypeCheckResult.TypeCheckFailure(
+        s"differing types in If (${trueValue.dataType} and ${falseValue.dataType}).")
+    } else {
+      TypeCheckResult.TypeCheckSuccess
     }
-    trueValue.dataType
   }
 
+  override def dataType: DataType = trueValue.dataType
+
   override def eval(input: Row): Any = {
     if (true == predicate.eval(input)) {
       trueValue.eval(input)
@@ -364,17 +327,23 @@ trait CaseWhenLike extends Expression {
     branches.sliding(2, 2).collect { case Seq(_, thenExpr) => thenExpr }.toSeq
   val elseValue = if (branches.length % 2 == 0) None else Option(branches.last)
 
-  // both then and else val should be considered.
+  // both then and else expressions should be considered.
   def valueTypes: Seq[DataType] = (thenList ++ elseValue).map(_.dataType)
   def valueTypesEqual: Boolean = valueTypes.distinct.size == 1
 
-  override def dataType: DataType = {
-    if (!resolved) {
-      throw new UnresolvedException(this, "cannot resolve due to differing types in some branches")
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (valueTypesEqual) {
+      checkTypesInternal()
+    } else {
+      TypeCheckResult.TypeCheckFailure(
+        "THEN and ELSE expressions should all be same type or coercible to a common type")
     }
-    valueTypes.head
   }
 
+  protected def checkTypesInternal(): TypeCheckResult
+
+  override def dataType: DataType = thenList.head.dataType
+
   override def nullable: Boolean = {
     // If no value is nullable and no elseValue is provided, the whole statement defaults to null.
     thenList.exists(_.nullable) || (elseValue.map(_.nullable).getOrElse(true))
@@ -395,10 +364,16 @@ case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike {
 
   override def children: Seq[Expression] = branches
 
-  override lazy val resolved: Boolean =
-    childrenResolved &&
-    whenList.forall(_.dataType == BooleanType) &&
-    valueTypesEqual
+  override protected def checkTypesInternal(): TypeCheckResult = {
+    if (whenList.forall(_.dataType == BooleanType)) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      val index = whenList.indexWhere(_.dataType != BooleanType)
+      TypeCheckResult.TypeCheckFailure(
+        s"WHEN expressions in CaseWhen should all be boolean type, " +
+        s"but the ${index + 1}th when expression's type is ${whenList(index)}")
+    }
+  }
 
   /** Written in imperative fashion for performance considerations. */
   override def eval(input: Row): Any = {
@@ -441,9 +416,14 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
 
   override def children: Seq[Expression] = key +: branches
 
-  override lazy val resolved: Boolean =
-    childrenResolved && valueTypesEqual &&
-    (key +: whenList).map(_.dataType).distinct.size == 1
+  override protected def checkTypesInternal(): TypeCheckResult = {
+    if ((key +: whenList).map(_.dataType).distinct.size > 1) {
+      TypeCheckResult.TypeCheckFailure(
+        "key and WHEN expressions should all be same type or coercible to a common type")
+    } else {
+      TypeCheckResult.TypeCheckSuccess
+    }
+  }
 
   /** Written in imperative fashion for performance considerations. */
   override def eval(input: Row): Any = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index b25fb48f55e2b..5c6379b8d44b0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -273,6 +273,10 @@ object NullPropagation extends Rule[LogicalPlan] {
       case e @ Substring(_, Literal(null, _), _) => Literal.create(null, e.dataType)
       case e @ Substring(_, _, Literal(null, _)) => Literal.create(null, e.dataType)
 
+      // MaxOf and MinOf can't do null propagation
+      case e: MaxOf => e
+      case e: MinOf => e
+
       // Put exceptional cases above if any
       case e: BinaryArithmetic => e.children match {
         case Literal(null, _) :: right :: Nil => Literal.create(null, e.dataType)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateUtils.scala
index 3f92be4a55d7d..ad649acf536f9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateUtils.scala
@@ -24,7 +24,7 @@ import java.util.{Calendar, TimeZone}
 import org.apache.spark.sql.catalyst.expressions.Cast
 
 /**
- * helper function to convert between Int value of days since 1970-01-01 and java.sql.Date
+ * Helper function to convert between Int value of days since 1970-01-01 and java.sql.Date
  */
 object DateUtils {
   private val MILLIS_PER_DAY = 86400000
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
new file mode 100644
index 0000000000000..0bb12d2039ffc
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.types._
+
+/**
+ * Helper function to check for valid data types
+ */
+object TypeUtils {
+  def checkForNumericExpr(t: DataType, caller: String): TypeCheckResult = {
+    if (t.isInstanceOf[NumericType] || t == NullType) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      TypeCheckResult.TypeCheckFailure(s"$caller accepts numeric types, not $t")
+    }
+  }
+
+  def checkForBitwiseExpr(t: DataType, caller: String): TypeCheckResult = {
+    if (t.isInstanceOf[IntegralType] || t == NullType) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      TypeCheckResult.TypeCheckFailure(s"$caller accepts integral types, not $t")
+    }
+  }
+
+  def checkForOrderingExpr(t: DataType, caller: String): TypeCheckResult = {
+    if (t.isInstanceOf[AtomicType] || t == NullType) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      TypeCheckResult.TypeCheckFailure(s"$caller accepts non-complex types, not $t")
+    }
+  }
+
+  def getNumeric(t: DataType): Numeric[Any] =
+    t.asInstanceOf[NumericType].numeric.asInstanceOf[Numeric[Any]]
+
+  def getOrdering(t: DataType): Ordering[Any] =
+    t.asInstanceOf[AtomicType].ordering.asInstanceOf[Ordering[Any]]
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 1ba3a2686639f..74677ddfcad65 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -107,7 +107,7 @@ protected[sql] abstract class AtomicType extends DataType {
 abstract class NumericType extends AtomicType {
   // Unfortunately we can't get this implicitly as that breaks Spark Serialization. In order for
   // implicitly[Numeric[JvmType]] to be valid, we have to change JvmType from a type variable to a
-  // type parameter and and add a numeric annotation (i.e., [JvmType : Numeric]). This gets
+  // type parameter and add a numeric annotation (i.e., [JvmType : Numeric]). This gets
   // desugared by the compiler into an argument to the objects constructor. This means there is no
   // longer an no argument constructor and thus the JVM cannot serialize the object anymore.
   private[sql] val numeric: Numeric[InternalType]
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
index 1b8d18ded2257..7bac97b7894f5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
@@ -92,8 +92,10 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("Comparison operations") {
-    checkComparison(LessThan(i, d1), DecimalType.Unlimited)
-    checkComparison(LessThanOrEqual(d1, d2), DecimalType.Unlimited)
+    checkComparison(EqualTo(i, d1), DecimalType(10, 1))
+    checkComparison(EqualNullSafe(d2, d1), DecimalType(5, 2))
+    checkComparison(LessThan(i, d1), DecimalType(10, 1))
+    checkComparison(LessThanOrEqual(d1, d2), DecimalType(5, 2))
     checkComparison(GreaterThan(d2, u), DecimalType.Unlimited)
     checkComparison(GreaterThanOrEqual(d1, f), DoubleType)
     checkComparison(GreaterThan(d2, d2), DecimalType(5, 2))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index a0798428db094..0df446636ea89 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -28,11 +28,11 @@ class HiveTypeCoercionSuite extends PlanTest {
 
   test("tightest common bound for types") {
     def widenTest(t1: DataType, t2: DataType, tightestCommon: Option[DataType]) {
-      var found = HiveTypeCoercion.findTightestCommonType(t1, t2)
+      var found = HiveTypeCoercion.findTightestCommonTypeOfTwo(t1, t2)
       assert(found == tightestCommon,
         s"Expected $tightestCommon as tightest common type for $t1 and $t2, found $found")
       // Test both directions to make sure the widening is symmetric.
-      found = HiveTypeCoercion.findTightestCommonType(t2, t1)
+      found = HiveTypeCoercion.findTightestCommonTypeOfTwo(t2, t1)
       assert(found == tightestCommon,
         s"Expected $tightestCommon as tightest common type for $t2 and $t1, found $found")
     }
@@ -140,13 +140,10 @@ class HiveTypeCoercionSuite extends PlanTest {
       CaseKeyWhen(Literal(1.toShort), Seq(Literal(1), Literal("a"))),
       CaseKeyWhen(Cast(Literal(1.toShort), IntegerType), Seq(Literal(1), Literal("a")))
     )
-    // Will remove exception expectation in PR#6405
-    intercept[RuntimeException] {
-      ruleTest(cwc,
-        CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a"))),
-        CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a")))
-      )
-    }
+    ruleTest(cwc,
+      CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a"))),
+      CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a")))
+    )
   }
 
   test("type coercion simplification for equal to") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionTypeCheckingSuite.scala
new file mode 100644
index 0000000000000..dcb3635c5ccae
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionTypeCheckingSuite.scala
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
+import org.apache.spark.sql.types.StringType
+
+class ExpressionTypeCheckingSuite extends SparkFunSuite {
+
+  val testRelation = LocalRelation(
+    'intField.int,
+    'stringField.string,
+    'booleanField.boolean,
+    'complexField.array(StringType))
+
+  def assertError(expr: Expression, errorMessage: String): Unit = {
+    val e = intercept[AnalysisException] {
+      assertSuccess(expr)
+    }
+    assert(e.getMessage.contains(
+      s"cannot resolve '${expr.prettyString}' due to data type mismatch:"))
+    assert(e.getMessage.contains(errorMessage))
+  }
+
+  def assertSuccess(expr: Expression): Unit = {
+    val analyzed = testRelation.select(expr.as("c")).analyze
+    SimpleAnalyzer.checkAnalysis(analyzed)
+  }
+
+  def assertErrorForDifferingTypes(expr: Expression): Unit = {
+    assertError(expr,
+      s"differing types in ${expr.getClass.getSimpleName} (IntegerType and BooleanType).")
+  }
+
+  test("check types for unary arithmetic") {
+    assertError(UnaryMinus('stringField), "operator - accepts numeric type")
+    assertSuccess(Sqrt('stringField)) // We will cast String to Double for sqrt
+    assertError(Sqrt('booleanField), "function sqrt accepts numeric type")
+    assertError(Abs('stringField), "function abs accepts numeric type")
+    assertError(BitwiseNot('stringField), "operator ~ accepts integral type")
+  }
+
+  test("check types for binary arithmetic") {
+    // We will cast String to Double for binary arithmetic
+    assertSuccess(Add('intField, 'stringField))
+    assertSuccess(Subtract('intField, 'stringField))
+    assertSuccess(Multiply('intField, 'stringField))
+    assertSuccess(Divide('intField, 'stringField))
+    assertSuccess(Remainder('intField, 'stringField))
+    // checkAnalysis(BitwiseAnd('intField, 'stringField))
+
+    assertErrorForDifferingTypes(Add('intField, 'booleanField))
+    assertErrorForDifferingTypes(Subtract('intField, 'booleanField))
+    assertErrorForDifferingTypes(Multiply('intField, 'booleanField))
+    assertErrorForDifferingTypes(Divide('intField, 'booleanField))
+    assertErrorForDifferingTypes(Remainder('intField, 'booleanField))
+    assertErrorForDifferingTypes(BitwiseAnd('intField, 'booleanField))
+    assertErrorForDifferingTypes(BitwiseOr('intField, 'booleanField))
+    assertErrorForDifferingTypes(BitwiseXor('intField, 'booleanField))
+    assertErrorForDifferingTypes(MaxOf('intField, 'booleanField))
+    assertErrorForDifferingTypes(MinOf('intField, 'booleanField))
+
+    assertError(Add('booleanField, 'booleanField), "operator + accepts numeric type")
+    assertError(Subtract('booleanField, 'booleanField), "operator - accepts numeric type")
+    assertError(Multiply('booleanField, 'booleanField), "operator * accepts numeric type")
+    assertError(Divide('booleanField, 'booleanField), "operator / accepts numeric type")
+    assertError(Remainder('booleanField, 'booleanField), "operator % accepts numeric type")
+
+    assertError(BitwiseAnd('booleanField, 'booleanField), "operator & accepts integral type")
+    assertError(BitwiseOr('booleanField, 'booleanField), "operator | accepts integral type")
+    assertError(BitwiseXor('booleanField, 'booleanField), "operator ^ accepts integral type")
+
+    assertError(MaxOf('complexField, 'complexField), "function maxOf accepts non-complex type")
+    assertError(MinOf('complexField, 'complexField), "function minOf accepts non-complex type")
+  }
+
+  test("check types for predicates") {
+    // We will cast String to Double for binary comparison
+    assertSuccess(EqualTo('intField, 'stringField))
+    assertSuccess(EqualNullSafe('intField, 'stringField))
+    assertSuccess(LessThan('intField, 'stringField))
+    assertSuccess(LessThanOrEqual('intField, 'stringField))
+    assertSuccess(GreaterThan('intField, 'stringField))
+    assertSuccess(GreaterThanOrEqual('intField, 'stringField))
+
+    // We will transform EqualTo with numeric and boolean types to CaseKeyWhen
+    assertSuccess(EqualTo('intField, 'booleanField))
+    assertSuccess(EqualNullSafe('intField, 'booleanField))
+
+    assertError(EqualTo('intField, 'complexField), "differing types")
+    assertError(EqualNullSafe('intField, 'complexField), "differing types")
+
+    assertErrorForDifferingTypes(LessThan('intField, 'booleanField))
+    assertErrorForDifferingTypes(LessThanOrEqual('intField, 'booleanField))
+    assertErrorForDifferingTypes(GreaterThan('intField, 'booleanField))
+    assertErrorForDifferingTypes(GreaterThanOrEqual('intField, 'booleanField))
+
+    assertError(
+      LessThan('complexField, 'complexField), "operator < accepts non-complex type")
+    assertError(
+      LessThanOrEqual('complexField, 'complexField), "operator <= accepts non-complex type")
+    assertError(
+      GreaterThan('complexField, 'complexField), "operator > accepts non-complex type")
+    assertError(
+      GreaterThanOrEqual('complexField, 'complexField), "operator >= accepts non-complex type")
+
+    assertError(
+      If('intField, 'stringField, 'stringField),
+      "type of predicate expression in If should be boolean")
+    assertErrorForDifferingTypes(If('booleanField, 'intField, 'booleanField))
+
+    assertError(
+      CaseWhen(Seq('booleanField, 'intField, 'booleanField, 'complexField)),
+      "THEN and ELSE expressions should all be same type or coercible to a common type")
+    assertError(
+      CaseKeyWhen('intField, Seq('intField, 'stringField, 'intField, 'complexField)),
+      "THEN and ELSE expressions should all be same type or coercible to a common type")
+    assertError(
+      CaseWhen(Seq('booleanField, 'intField, 'intField, 'intField)),
+      "WHEN expressions in CaseWhen should all be boolean type")
+
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
index 06aa19ef09bd2..565d10247f10e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
@@ -147,7 +147,7 @@ private[sql] object InferSchema {
    * Returns the most general data type for two given data types.
    */
   private[json] def compatibleType(t1: DataType, t2: DataType): DataType = {
-    HiveTypeCoercion.findTightestCommonType(t1, t2).getOrElse {
+    HiveTypeCoercion.findTightestCommonTypeOfTwo(t1, t2).getOrElse {
       // t1 or t2 is a StructType, ArrayType, or an unexpected type.
       (t1, t2) match {
         case (other: DataType, NullType) => other
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index 95eb1174b1dd6..7e1e21f5fbb99 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -155,7 +155,7 @@ private[sql] object JsonRDD extends Logging {
    * Returns the most general data type for two given data types.
    */
   private[json] def compatibleType(t1: DataType, t2: DataType): DataType = {
-    HiveTypeCoercion.findTightestCommonType(t1, t2) match {
+    HiveTypeCoercion.findTightestCommonTypeOfTwo(t1, t2) match {
       case Some(commonType) => commonType
       case None =>
         // t1 or t2 is a StructType, ArrayType, or an unexpected type.

From 28dbde3874ccdd44b73675938719b69336d23dac Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Wed, 3 Jun 2015 13:15:57 +0200
Subject: [PATCH 332/525] [SPARK-7983] [MLLIB] Add require for one-based
 indices in loadLibSVMFile

jira: https://issues.apache.org/jira/browse/SPARK-7983

Customers frequently use zero-based indices in their LIBSVM files. No warnings or errors from Spark will be reported during their computation afterwards, and usually it will lead to wired result for many algorithms (like GBDT).

add a quick check.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #6538 from hhbyyh/loadSVM and squashes the following commits:

79d9c11 [Yuhao Yang] optimization as respond to comments
4310710 [Yuhao Yang] merge conflict
96460f1 [Yuhao Yang] merge conflict
20a2811 [Yuhao Yang] use require
6e4f8ca [Yuhao Yang] add check for ascending order
9956365 [Yuhao Yang] add ut for 0-based loadlibsvm exception
5bd1f9a [Yuhao Yang] add require for one-based in loadLIBSVM
---
 .../org/apache/spark/mllib/util/MLUtils.scala | 12 +++++++
 .../spark/mllib/util/MLUtilsSuite.scala       | 35 +++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 541f3288b6c43..52d6468a72af7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -82,6 +82,18 @@ object MLUtils {
           val value = indexAndValue(1).toDouble
           (index, value)
         }.unzip
+
+        // check if indices are one-based and in ascending order
+        var previous = -1
+        var i = 0
+        val indicesLength = indices.length
+        while (i < indicesLength) {
+          val current = indices(i)
+          require(current > previous, "indices should be one-based and in ascending order" )
+          previous = current
+          i += 1
+        }
+
         (label, indices.toArray, values.toArray)
       }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
index 734b7babec7be..70219e9ad9d3e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -25,6 +25,7 @@ import breeze.linalg.{squaredDistance => breezeSquaredDistance}
 import com.google.common.base.Charsets
 import com.google.common.io.Files
 
+import org.apache.spark.SparkException
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -108,6 +109,40 @@ class MLUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {
     Utils.deleteRecursively(tempDir)
   }
 
+  test("loadLibSVMFile throws IllegalArgumentException when indices is zero-based") {
+    val lines =
+      """
+        |0
+        |0 0:4.0 4:5.0 6:6.0
+      """.stripMargin
+    val tempDir = Utils.createTempDir()
+    val file = new File(tempDir.getPath, "part-00000")
+    Files.write(lines, file, Charsets.US_ASCII)
+    val path = tempDir.toURI.toString
+
+    intercept[SparkException] {
+      loadLibSVMFile(sc, path).collect()
+    }
+    Utils.deleteRecursively(tempDir)
+  }
+
+  test("loadLibSVMFile throws IllegalArgumentException when indices is not in ascending order") {
+    val lines =
+      """
+        |0
+        |0 3:4.0 2:5.0 6:6.0
+      """.stripMargin
+    val tempDir = Utils.createTempDir()
+    val file = new File(tempDir.getPath, "part-00000")
+    Files.write(lines, file, Charsets.US_ASCII)
+    val path = tempDir.toURI.toString
+
+    intercept[SparkException] {
+      loadLibSVMFile(sc, path).collect()
+    }
+    Utils.deleteRecursively(tempDir)
+  }
+
   test("saveAsLibSVMFile") {
     val examples = sc.parallelize(Seq(
       LabeledPoint(1.1, Vectors.sparse(3, Seq((0, 1.23), (2, 4.56)))),

From f1646e1023bd03e27268a8aa2ea11b6cc284075f Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Wed, 3 Jun 2015 09:26:21 -0700
Subject: [PATCH 333/525] [SPARK-7973] [SQL] Increase the timeout of two
 CliSuite tests.

https://issues.apache.org/jira/browse/SPARK-7973

Author: Yin Huai <yhuai@databricks.com>

Closes #6525 from yhuai/SPARK-7973 and squashes the following commits:

763b821 [Yin Huai] Also change the timeout of "Single command with -e" to 2 minutes.
e598a08 [Yin Huai] Increase the timeout to 3 minutes.
---
 .../org/apache/spark/sql/hive/thriftserver/CliSuite.scala     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index 3732af7870b93..13b0c5951dddc 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -133,7 +133,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging {
   }
 
   test("Single command with -e") {
-    runCliWithin(1.minute, Seq("-e", "SHOW DATABASES;"))("" -> "OK")
+    runCliWithin(2.minute, Seq("-e", "SHOW DATABASES;"))("" -> "OK")
   }
 
   test("Single command with --database") {
@@ -165,7 +165,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging {
     val dataFilePath =
       Thread.currentThread().getContextClassLoader.getResource("data/files/small_kv.txt")
 
-    runCliWithin(1.minute, Seq("--jars", s"$jarFile"))(
+    runCliWithin(3.minute, Seq("--jars", s"$jarFile"))(
       """CREATE TABLE t1(key string, val string)
         |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
       """.stripMargin

From 2c4d550eda0e6f33d2d575825c3faef4c9217067 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Wed, 3 Jun 2015 10:11:27 -0700
Subject: [PATCH 334/525] [SPARK-7801] [BUILD] Updating versions to SPARK 1.5.0

Author: Patrick Wendell <patrick@databricks.com>

Closes #6328 from pwendell/spark-1.5-update and squashes the following commits:

2f42d02 [Patrick Wendell] A few more excludes
4bebcf0 [Patrick Wendell] Update to RC4
61aaf46 [Patrick Wendell] Using new release candidate
55f1610 [Patrick Wendell] Another exclude
04b4f04 [Patrick Wendell] More issues with transient 1.4 changes
36f549b [Patrick Wendell] [SPARK-7801] [BUILD] Updating versions to SPARK 1.5.0
---
 assembly/pom.xml                                   |  2 +-
 bagel/pom.xml                                      |  2 +-
 core/pom.xml                                       |  2 +-
 core/src/main/scala/org/apache/spark/package.scala |  2 +-
 docs/_config.yml                                   |  4 ++--
 examples/pom.xml                                   |  2 +-
 external/flume-sink/pom.xml                        |  2 +-
 external/flume/pom.xml                             |  2 +-
 external/kafka-assembly/pom.xml                    |  2 +-
 external/kafka/pom.xml                             |  2 +-
 external/mqtt/pom.xml                              |  2 +-
 external/twitter/pom.xml                           |  2 +-
 external/zeromq/pom.xml                            |  2 +-
 extras/java8-tests/pom.xml                         |  2 +-
 extras/kinesis-asl/pom.xml                         |  2 +-
 extras/spark-ganglia-lgpl/pom.xml                  |  2 +-
 graphx/pom.xml                                     |  2 +-
 launcher/pom.xml                                   |  2 +-
 mllib/pom.xml                                      |  2 +-
 network/common/pom.xml                             |  2 +-
 network/shuffle/pom.xml                            |  2 +-
 network/yarn/pom.xml                               |  2 +-
 pom.xml                                            | 14 +++++++++++++-
 project/MimaBuild.scala                            |  3 ++-
 project/MimaExcludes.scala                         | 14 ++++++++++++++
 repl/pom.xml                                       |  2 +-
 sql/catalyst/pom.xml                               |  2 +-
 sql/core/pom.xml                                   |  2 +-
 sql/hive-thriftserver/pom.xml                      |  2 +-
 sql/hive/pom.xml                                   |  2 +-
 streaming/pom.xml                                  |  2 +-
 tools/pom.xml                                      |  2 +-
 unsafe/pom.xml                                     |  2 +-
 yarn/pom.xml                                       |  2 +-
 34 files changed, 61 insertions(+), 34 deletions(-)

diff --git a/assembly/pom.xml b/assembly/pom.xml
index 626c8577e31fe..e9c6d26ccddc7 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/bagel/pom.xml b/bagel/pom.xml
index 132cd433d78a2..ed5c37e595a96 100644
--- a/bagel/pom.xml
+++ b/bagel/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/core/pom.xml b/core/pom.xml
index a02184222e9f0..e35694e9e98b4 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/core/src/main/scala/org/apache/spark/package.scala b/core/src/main/scala/org/apache/spark/package.scala
index 2ab41ba488ff6..8ae76c5f72f2e 100644
--- a/core/src/main/scala/org/apache/spark/package.scala
+++ b/core/src/main/scala/org/apache/spark/package.scala
@@ -43,5 +43,5 @@ package org.apache
 
 package object spark {
   // For package docs only
-  val SPARK_VERSION = "1.4.0-SNAPSHOT"
+  val SPARK_VERSION = "1.5.0-SNAPSHOT"
 }
diff --git a/docs/_config.yml b/docs/_config.yml
index b22b627f09007..c0e031a83ba9c 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -14,8 +14,8 @@ include:
 
 # These allow the documentation to be updated with newer releases
 # of Spark, Scala, and Mesos.
-SPARK_VERSION: 1.4.0-SNAPSHOT
-SPARK_VERSION_SHORT: 1.4.0
+SPARK_VERSION: 1.5.0-SNAPSHOT
+SPARK_VERSION_SHORT: 1.5.0
 SCALA_BINARY_VERSION: "2.10"
 SCALA_VERSION: "2.10.4"
 MESOS_VERSION: 0.21.0
diff --git a/examples/pom.xml b/examples/pom.xml
index e4efee7b5e647..e6884b09dca94 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index 71f2b6fe18bd1..7a7dccc3d0922 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/external/flume/pom.xml b/external/flume/pom.xml
index a345c03582ad6..14f7daaf417e0 100644
--- a/external/flume/pom.xml
+++ b/external/flume/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml
index 0b79f47647f6b..8059c443827ef 100644
--- a/external/kafka-assembly/pom.xml
+++ b/external/kafka-assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml
index 5734d55bf4784..ded863bd985e8 100644
--- a/external/kafka/pom.xml
+++ b/external/kafka/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml
index 7d102e10ab60f..0e41e5781784b 100644
--- a/external/mqtt/pom.xml
+++ b/external/mqtt/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml
index d28e3e1846d70..178ae8de13b57 100644
--- a/external/twitter/pom.xml
+++ b/external/twitter/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml
index 9998c11c85171..37bfd10d43663 100644
--- a/external/zeromq/pom.xml
+++ b/external/zeromq/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml
index 4351a8a12fe21..f138251748c9e 100644
--- a/extras/java8-tests/pom.xml
+++ b/extras/java8-tests/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml
index 25847a1b33d9c..4787991572b61 100644
--- a/extras/kinesis-asl/pom.xml
+++ b/extras/kinesis-asl/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml
index e14bbae4a9b6e..478d0019a25f0 100644
--- a/extras/spark-ganglia-lgpl/pom.xml
+++ b/extras/spark-ganglia-lgpl/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/graphx/pom.xml b/graphx/pom.xml
index 28b41228feb3d..853dea9a7795e 100644
--- a/graphx/pom.xml
+++ b/graphx/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/launcher/pom.xml b/launcher/pom.xml
index cc177d23dff77..48dd0d5f9106b 100644
--- a/launcher/pom.xml
+++ b/launcher/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/mllib/pom.xml b/mllib/pom.xml
index 65c647a91d192..b16058ddc203a 100644
--- a/mllib/pom.xml
+++ b/mllib/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/network/common/pom.xml b/network/common/pom.xml
index 0c3147761cfc5..a85e0a66f4a30 100644
--- a/network/common/pom.xml
+++ b/network/common/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml
index 7dc7c65825e34..4b5bfcb6f04bc 100644
--- a/network/shuffle/pom.xml
+++ b/network/shuffle/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/network/yarn/pom.xml b/network/yarn/pom.xml
index 1e2e9c80af6cc..a99f7c4392d3d 100644
--- a/network/yarn/pom.xml
+++ b/network/yarn/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/pom.xml b/pom.xml
index 711edf9efad2b..0b1aaad7566bc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -26,7 +26,7 @@
   </parent>
   <groupId>org.apache.spark</groupId>
   <artifactId>spark-parent_2.10</artifactId>
-  <version>1.4.0-SNAPSHOT</version>
+  <version>1.5.0-SNAPSHOT</version>
   <packaging>pom</packaging>
   <name>Spark Project Parent POM</name>
   <url>http://spark.apache.org/</url>
@@ -269,6 +269,18 @@
         <enabled>false</enabled>
       </snapshots>
     </repository>
+    <!-- TODO: This can be deleted after Spark 1.4 is posted -->
+    <repository>
+      <id>spark-1.4-staging</id>
+      <name>Spark 1.4 RC4 Staging Repository</name>
+      <url>https://repository.apache.org/content/repositories/orgapachespark-1112</url>
+      <releases>
+        <enabled>true</enabled>
+      </releases>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
   </repositories>
   <pluginRepositories>
     <pluginRepository>
diff --git a/project/MimaBuild.scala b/project/MimaBuild.scala
index dde92949fa175..5812b72f0aa78 100644
--- a/project/MimaBuild.scala
+++ b/project/MimaBuild.scala
@@ -91,7 +91,8 @@ object MimaBuild {
 
   def mimaSettings(sparkHome: File, projectRef: ProjectRef) = {
     val organization = "org.apache.spark"
-    val previousSparkVersion = "1.3.0"
+    // TODO: Change this once Spark 1.4.0 is released
+    val previousSparkVersion = "1.4.0-rc4"
     val fullId = "spark-" + projectRef.project + "_2.10"
     mimaDefaultSettings ++
     Seq(previousArtifact := Some(organization % fullId % previousSparkVersion),
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 8da72b3fa7cdb..34371c9659423 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -34,6 +34,20 @@ import com.typesafe.tools.mima.core.ProblemFilters._
 object MimaExcludes {
     def excludes(version: String) =
       version match {
+        case v if v.startsWith("1.5") =>
+          Seq(
+            MimaBuild.excludeSparkPackage("deploy"),
+            // These are needed if checking against the sbt build, since they are part of
+            // the maven-generated artifacts in 1.3.
+            excludePackage("org.spark-project.jetty"),
+            MimaBuild.excludeSparkPackage("unused"),
+            // JavaRDDLike is not meant to be extended by user programs
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.api.java.JavaRDDLike.partitioner"),
+            // Mima false positive (was a private[spark] class)
+            ProblemFilters.exclude[MissingClassProblem](
+              "org.apache.spark.util.collection.PairIterator")
+          )
         case v if v.startsWith("1.4") =>
           Seq(
             MimaBuild.excludeSparkPackage("deploy"),
diff --git a/repl/pom.xml b/repl/pom.xml
index 6e5cb7f77e1df..85f7bc8ac1024 100644
--- a/repl/pom.xml
+++ b/repl/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml
index d9e1cdb84bb27..bf0a7327a58a2 100644
--- a/sql/catalyst/pom.xml
+++ b/sql/catalyst/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 8210c552603ea..3192f81ffaecd 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml
index 20d3c7d4c5959..73e6ccdb1eaf8 100644
--- a/sql/hive-thriftserver/pom.xml
+++ b/sql/hive-thriftserver/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml
index 923ffabb9b99e..a17546d706248 100644
--- a/sql/hive/pom.xml
+++ b/sql/hive/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/streaming/pom.xml b/streaming/pom.xml
index 49d035a1e9696..697895e72fe5b 100644
--- a/streaming/pom.xml
+++ b/streaming/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/tools/pom.xml b/tools/pom.xml
index 1c6f3e83a1819..feffde4c857eb 100644
--- a/tools/pom.xml
+++ b/tools/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/unsafe/pom.xml b/unsafe/pom.xml
index 2fd17267ac427..62c6354f1e203 100644
--- a/unsafe/pom.xml
+++ b/unsafe/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/yarn/pom.xml b/yarn/pom.xml
index e207a46809684..644def7501dc8 100644
--- a/yarn/pom.xml
+++ b/yarn/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

From d053a31be93d789e3f26cf55d747ecf6ca386c29 Mon Sep 17 00:00:00 2001
From: animesh <animesh@apache.spark>
Date: Wed, 3 Jun 2015 11:28:18 -0700
Subject: [PATCH 335/525] [SPARK-7980] [SQL] Support SQLContext.range(end)

1. range() overloaded in SQLContext.scala
2. range() modified in python sql context.py
3. Tests added accordingly in DataFrameSuite.scala and python sql tests.py

Author: animesh <animesh@apache.spark>

Closes #6609 from animeshbaranawal/SPARK-7980 and squashes the following commits:

935899c [animesh] SPARK-7980:python+scala changes
---
 python/pyspark/sql/context.py                        | 12 ++++++++++--
 python/pyspark/sql/tests.py                          |  2 ++
 .../main/scala/org/apache/spark/sql/SQLContext.scala | 11 +++++++++++
 .../scala/org/apache/spark/sql/DataFrameSuite.scala  |  8 ++++++++
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 9fdf43c3e6eb5..1bebfc48376b4 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -131,7 +131,7 @@ def udf(self):
         return UDFRegistration(self)
 
     @since(1.4)
-    def range(self, start, end, step=1, numPartitions=None):
+    def range(self, start, end=None, step=1, numPartitions=None):
         """
         Create a :class:`DataFrame` with single LongType column named `id`,
         containing elements in a range from `start` to `end` (exclusive) with
@@ -145,10 +145,18 @@ def range(self, start, end, step=1, numPartitions=None):
 
         >>> sqlContext.range(1, 7, 2).collect()
         [Row(id=1), Row(id=3), Row(id=5)]
+
+        >>> sqlContext.range(3).collect()
+        [Row(id=0), Row(id=1), Row(id=2)]
         """
         if numPartitions is None:
             numPartitions = self._sc.defaultParallelism
-        jdf = self._ssql_ctx.range(int(start), int(end), int(step), int(numPartitions))
+
+        if end is None:
+            jdf = self._ssql_ctx.range(0, int(start), int(step), int(numPartitions))
+        else:
+            jdf = self._ssql_ctx.range(int(start), int(end), int(step), int(numPartitions))
+
         return DataFrame(jdf, self)
 
     @ignore_unicode_prefix
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 6e498f0af0af5..a6fce50c76c2b 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -131,6 +131,8 @@ def test_range(self):
         self.assertEqual(self.sqlCtx.range(1, 1).count(), 0)
         self.assertEqual(self.sqlCtx.range(1, 0, -1).count(), 1)
         self.assertEqual(self.sqlCtx.range(0, 1 << 40, 1 << 39).count(), 2)
+        self.assertEqual(self.sqlCtx.range(-2).count(), 0)
+        self.assertEqual(self.sqlCtx.range(3).count(), 3)
 
     def test_explode(self):
         from pyspark.sql.functions import explode
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 91e6385dec81b..f08fb4fafe650 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -717,6 +717,17 @@ class SQLContext(@transient val sparkContext: SparkContext)
       StructType(StructField("id", LongType, nullable = false) :: Nil))
   }
 
+  /**
+   * :: Experimental ::
+   * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements
+   * in an range from 0 to `end`(exclusive) with step value 1.
+   *
+   * @since 1.4.0
+   * @group dataframe
+   */
+  @Experimental
+  def range(end: Long): DataFrame = range(0, end)
+
   /**
    * :: Experimental ::
    * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index a4fd1058afce5..9aaec2b064d76 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -576,5 +576,13 @@ class DataFrameSuite extends QueryTest {
     val res9 = TestSQLContext.range(Long.MaxValue, Long.MinValue, Long.MinValue, 100).select("id")
     assert(res9.count == 2)
     assert(res9.agg(sum("id")).as("sumid").collect() === Seq(Row(Long.MaxValue - 1)))
+
+    // only end provided as argument
+    val res10 = TestSQLContext.range(10).select("id")
+    assert(res10.count == 10)
+    assert(res10.agg(sum("id")).as("sumid").collect() === Seq(Row(45)))
+
+    val res11 = TestSQLContext.range(-1).select("id")
+    assert(res11.count == 0)
   }
 }

From d2a86eb8f0fcc02304604da56c589ea58c77587a Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Wed, 3 Jun 2015 13:43:13 -0500
Subject: [PATCH 336/525] [SPARK-7161] [HISTORY SERVER] Provide REST api to
 download event logs fro...

...m History Server

This PR adds a new API that allows the user to download event logs for an application as a zip file. APIs have been added to download all logs for a given application or just for a specific attempt.

This also add an additional method to the ApplicationHistoryProvider to get the raw files, zipped.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #5792 from harishreedharan/eventlog-download and squashes the following commits:

221cc26 [Hari Shreedharan] Update docs with new API information.
a131be6 [Hari Shreedharan] Fix style issues.
5528bd8 [Hari Shreedharan] Merge branch 'master' into eventlog-download
6e8156e [Hari Shreedharan] Simplify tests, use Guava stream copy methods.
d8ddede [Hari Shreedharan] Remove unnecessary case in EventLogDownloadResource.
ffffb53 [Hari Shreedharan] Changed interface to use zip stream. Added more tests.
1100b40 [Hari Shreedharan] Ensure that `Path` does not appear in interfaces, by rafactoring interfaces.
5a5f3e2 [Hari Shreedharan] Fix test ordering issue.
0b66948 [Hari Shreedharan] Minor formatting/import fixes.
4fc518c [Hari Shreedharan] Fix rat failures.
a48b91f [Hari Shreedharan] Refactor to make attemptId optional in the API. Also added tests.
0fc1424 [Hari Shreedharan] File download now works for individual attempts and the entire application.
350d7e8 [Hari Shreedharan] Merge remote-tracking branch 'asf/master' into eventlog-download
fd6ab00 [Hari Shreedharan] Fix style issues
32b7662 [Hari Shreedharan] Use UIRoot directly in ApiRootResource. Also, use `Response` class to set headers.
7b362b2 [Hari Shreedharan] Almost working.
3d18ebc [Hari Shreedharan] [WIP] Try getting the event log download to work.
---
 .rat-excludes                                 |  2 +
 .../history/ApplicationHistoryProvider.scala  | 11 +++
 .../deploy/history/FsHistoryProvider.scala    | 63 ++++++++++++-
 .../spark/deploy/history/HistoryServer.scala  |  8 ++
 .../spark/status/api/v1/ApiRootResource.scala | 20 +++++
 .../api/v1/EventLogDownloadResource.scala     | 70 +++++++++++++++
 .../application_list_json_expectation.json    | 16 ++++
 .../completed_app_list_json_expectation.json  | 16 ++++
 .../minDate_app_list_json_expectation.json    | 34 +++++--
 .../spark-events/local-1430917381535_1        |  5 ++
 .../spark-events/local-1430917381535_2        |  5 ++
 .../history/FsHistoryProviderSuite.scala      | 40 ++++++++-
 .../deploy/history/HistoryServerSuite.scala   | 88 +++++++++++++++++--
 docs/monitoring.md                            |  8 ++
 14 files changed, 367 insertions(+), 19 deletions(-)
 create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/EventLogDownloadResource.scala
 create mode 100644 core/src/test/resources/spark-events/local-1430917381535_1
 create mode 100644 core/src/test/resources/spark-events/local-1430917381535_2

diff --git a/.rat-excludes b/.rat-excludes
index 8f2722cbd001f..994c7e86f8a91 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -80,6 +80,8 @@ local-1425081759269/*
 local-1426533911241/*
 local-1426633911242/*
 local-1430917381534/*
+local-1430917381535_1
+local-1430917381535_2
 DESCRIPTION
 NAMESPACE
 test_support/*
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala
index 298a8201960d1..5f5e0fe1c34d7 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark.deploy.history
 
+import java.util.zip.ZipOutputStream
+
+import org.apache.spark.SparkException
 import org.apache.spark.ui.SparkUI
 
 private[spark] case class ApplicationAttemptInfo(
@@ -62,4 +65,12 @@ private[history] abstract class ApplicationHistoryProvider {
    */
   def getConfig(): Map[String, String] = Map()
 
+  /**
+   * Writes out the event logs to the output stream provided. The logs will be compressed into a
+   * single zip file and written out.
+   * @throws SparkException if the logs for the app id cannot be found.
+   */
+  @throws(classOf[SparkException])
+  def writeEventLogs(appId: String, attemptId: Option[String], zipStream: ZipOutputStream): Unit
+
 }
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index 45c2be34c8680..52b149b273e4b 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -17,16 +17,18 @@
 
 package org.apache.spark.deploy.history
 
-import java.io.{BufferedInputStream, FileNotFoundException, IOException, InputStream}
+import java.io.{BufferedInputStream, FileNotFoundException, InputStream, IOException, OutputStream}
 import java.util.concurrent.{ExecutorService, Executors, TimeUnit}
+import java.util.zip.{ZipEntry, ZipOutputStream}
 
 import scala.collection.mutable
 
+import com.google.common.io.ByteStreams
 import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder}
-import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
 import org.apache.hadoop.fs.permission.AccessControlException
 
-import org.apache.spark.{Logging, SecurityManager, SparkConf}
+import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.scheduler._
@@ -59,7 +61,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
     .map { d => Utils.resolveURI(d).toString }
     .getOrElse(DEFAULT_LOG_DIR)
 
-  private val fs = Utils.getHadoopFileSystem(logDir, SparkHadoopUtil.get.newConfiguration(conf))
+  private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
+  private val fs = Utils.getHadoopFileSystem(logDir, hadoopConf)
 
   // Used by check event thread and clean log thread.
   // Scheduled thread pool size must be one, otherwise it will have concurrent issues about fs
@@ -219,6 +222,58 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
     }
   }
 
+  override def writeEventLogs(
+      appId: String,
+      attemptId: Option[String],
+      zipStream: ZipOutputStream): Unit = {
+
+    /**
+     * This method compresses the files passed in, and writes the compressed data out into the
+     * [[OutputStream]] passed in. Each file is written as a new [[ZipEntry]] with its name being
+     * the name of the file being compressed.
+     */
+    def zipFileToStream(file: Path, entryName: String, outputStream: ZipOutputStream): Unit = {
+      val fs = FileSystem.get(hadoopConf)
+      val inputStream = fs.open(file, 1 * 1024 * 1024) // 1MB Buffer
+      try {
+        outputStream.putNextEntry(new ZipEntry(entryName))
+        ByteStreams.copy(inputStream, outputStream)
+        outputStream.closeEntry()
+      } finally {
+        inputStream.close()
+      }
+    }
+
+    applications.get(appId) match {
+      case Some(appInfo) =>
+        try {
+          // If no attempt is specified, or there is no attemptId for attempts, return all attempts
+          appInfo.attempts.filter { attempt =>
+            attempt.attemptId.isEmpty || attemptId.isEmpty || attempt.attemptId.get == attemptId.get
+          }.foreach { attempt =>
+            val logPath = new Path(logDir, attempt.logPath)
+            // If this is a legacy directory, then add the directory to the zipStream and add
+            // each file to that directory.
+            if (isLegacyLogDirectory(fs.getFileStatus(logPath))) {
+              val files = fs.listFiles(logPath, false)
+              zipStream.putNextEntry(new ZipEntry(attempt.logPath + "/"))
+              zipStream.closeEntry()
+              while (files.hasNext) {
+                val file = files.next().getPath
+                zipFileToStream(file, attempt.logPath + Path.SEPARATOR + file.getName, zipStream)
+              }
+            } else {
+              zipFileToStream(new Path(logDir, attempt.logPath), attempt.logPath, zipStream)
+            }
+          }
+        } finally {
+          zipStream.close()
+        }
+      case None => throw new SparkException(s"Logs for $appId not found.")
+    }
+  }
+
+
   /**
    * Replay the log files in the list and merge the list of old applications with new ones
    */
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
index 5a0eb585a9049..10638afb74900 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.deploy.history
 
 import java.util.NoSuchElementException
+import java.util.zip.ZipOutputStream
 import javax.servlet.http.{HttpServlet, HttpServletRequest, HttpServletResponse}
 
 import com.google.common.cache._
@@ -173,6 +174,13 @@ class HistoryServer(
     getApplicationList().iterator.map(ApplicationsListResource.appHistoryInfoToPublicAppInfo)
   }
 
+  override def writeEventLogs(
+      appId: String,
+      attemptId: Option[String],
+      zipStream: ZipOutputStream): Unit = {
+    provider.writeEventLogs(appId, attemptId, zipStream)
+  }
+
   /**
    * Returns the provider configuration to show in the listing page.
    *
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
index f73c742732dec..9af90ee5ecd9d 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
@@ -16,6 +16,7 @@
  */
 package org.apache.spark.status.api.v1
 
+import java.util.zip.ZipOutputStream
 import javax.servlet.ServletContext
 import javax.ws.rs._
 import javax.ws.rs.core.{Context, Response}
@@ -164,6 +165,18 @@ private[v1] class ApiRootResource extends UIRootFromServletContext {
     }
   }
 
+  @Path("applications/{appId}/logs")
+  def getEventLogs(
+    @PathParam("appId") appId: String): EventLogDownloadResource = {
+    new EventLogDownloadResource(uiRoot, appId, None)
+  }
+
+  @Path("applications/{appId}/{attemptId}/logs")
+  def getEventLogs(
+    @PathParam("appId") appId: String,
+    @PathParam("attemptId") attemptId: String): EventLogDownloadResource = {
+    new EventLogDownloadResource(uiRoot, appId, Some(attemptId))
+  }
 }
 
 private[spark] object ApiRootResource {
@@ -193,6 +206,13 @@ private[spark] trait UIRoot {
   def getSparkUI(appKey: String): Option[SparkUI]
   def getApplicationInfoList: Iterator[ApplicationInfo]
 
+  def writeEventLogs(appId: String, attemptId: Option[String], zipStream: ZipOutputStream): Unit = {
+    Response.serverError()
+      .entity("Event logs are only available through the history server.")
+      .status(Response.Status.SERVICE_UNAVAILABLE)
+      .build()
+  }
+
   /**
    * Get the spark UI with the given appID, and apply a function
    * to it.  If there is no such app, throw an appropriate exception
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/EventLogDownloadResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/EventLogDownloadResource.scala
new file mode 100644
index 0000000000000..d416dba8324d8
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/EventLogDownloadResource.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.status.api.v1
+
+import java.io.OutputStream
+import java.util.zip.ZipOutputStream
+import javax.ws.rs.{GET, Produces}
+import javax.ws.rs.core.{MediaType, Response, StreamingOutput}
+
+import scala.util.control.NonFatal
+
+import org.apache.spark.{Logging, SparkConf}
+import org.apache.spark.deploy.SparkHadoopUtil
+
+@Produces(Array(MediaType.APPLICATION_OCTET_STREAM))
+private[v1] class EventLogDownloadResource(
+    val uIRoot: UIRoot,
+    val appId: String,
+    val attemptId: Option[String]) extends Logging {
+  val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf)
+
+  @GET
+  def getEventLogs(): Response = {
+    try {
+      val fileName = {
+        attemptId match {
+          case Some(id) => s"eventLogs-$appId-$id.zip"
+          case None => s"eventLogs-$appId.zip"
+        }
+      }
+
+      val stream = new StreamingOutput {
+        override def write(output: OutputStream) = {
+          val zipStream = new ZipOutputStream(output)
+          try {
+            uIRoot.writeEventLogs(appId, attemptId, zipStream)
+          } finally {
+            zipStream.close()
+          }
+
+        }
+      }
+
+      Response.ok(stream)
+        .header("Content-Disposition", s"attachment; filename=$fileName")
+        .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM)
+        .build()
+    } catch {
+      case NonFatal(e) =>
+        Response.serverError()
+          .entity(s"Event logs are not available for app: $appId.")
+          .status(Response.Status.SERVICE_UNAVAILABLE)
+          .build()
+    }
+  }
+}
diff --git a/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json
index ce4fe80b66aa5..d575bf2f284b9 100644
--- a/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json
+++ b/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json
@@ -7,6 +7,22 @@
     "sparkUser" : "irashid",
     "completed" : true
   } ]
+}, {
+  "id" : "local-1430917381535",
+  "name" : "Spark shell",
+  "attempts" : [ {
+    "attemptId" : "2",
+    "startTime" : "2015-05-06T13:03:00.893GMT",
+    "endTime" : "2015-05-06T13:03:00.950GMT",
+    "sparkUser" : "irashid",
+    "completed" : true
+  }, {
+    "attemptId" : "1",
+    "startTime" : "2015-05-06T13:03:00.880GMT",
+    "endTime" : "2015-05-06T13:03:00.890GMT",
+    "sparkUser" : "irashid",
+    "completed" : true
+  } ]
 }, {
   "id" : "local-1426533911241",
   "name" : "Spark shell",
diff --git a/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json
index ce4fe80b66aa5..d575bf2f284b9 100644
--- a/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json
+++ b/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json
@@ -7,6 +7,22 @@
     "sparkUser" : "irashid",
     "completed" : true
   } ]
+}, {
+  "id" : "local-1430917381535",
+  "name" : "Spark shell",
+  "attempts" : [ {
+    "attemptId" : "2",
+    "startTime" : "2015-05-06T13:03:00.893GMT",
+    "endTime" : "2015-05-06T13:03:00.950GMT",
+    "sparkUser" : "irashid",
+    "completed" : true
+  }, {
+    "attemptId" : "1",
+    "startTime" : "2015-05-06T13:03:00.880GMT",
+    "endTime" : "2015-05-06T13:03:00.890GMT",
+    "sparkUser" : "irashid",
+    "completed" : true
+  } ]
 }, {
   "id" : "local-1426533911241",
   "name" : "Spark shell",
diff --git a/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json
index dca86fe5f7e6a..15c2de8ef99ea 100644
--- a/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json
+++ b/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json
@@ -7,6 +7,22 @@
     "sparkUser" : "irashid",
     "completed" : true
   } ]
+},  {
+  "id" : "local-1430917381535",
+  "name" : "Spark shell",
+  "attempts" : [ {
+    "attemptId" : "2",
+    "startTime" : "2015-05-06T13:03:00.893GMT",
+    "endTime" : "2015-05-06T13:03:00.950GMT",
+    "sparkUser" : "irashid",
+    "completed" : true
+  }, {
+    "attemptId" : "1",
+    "startTime" : "2015-05-06T13:03:00.880GMT",
+    "endTime" : "2015-05-06T13:03:00.890GMT",
+    "sparkUser" : "irashid",
+    "completed" : true
+  } ]
 }, {
   "id" : "local-1426533911241",
   "name" : "Spark shell",
@@ -24,12 +40,14 @@
     "completed" : true
   } ]
 }, {
-  "id" : "local-1425081759269",
-  "name" : "Spark shell",
-  "attempts" : [ {
-    "startTime" : "2015-02-28T00:02:38.277GMT",
-    "endTime" : "2015-02-28T00:02:46.912GMT",
-    "sparkUser" : "irashid",
-    "completed" : true
-  } ]
+    "id": "local-1425081759269",
+    "name": "Spark shell",
+    "attempts": [
+      {
+        "startTime": "2015-02-28T00:02:38.277GMT",
+        "endTime": "2015-02-28T00:02:46.912GMT",
+        "sparkUser": "irashid",
+        "completed": true
+      }
+    ]
 } ]
\ No newline at end of file
diff --git a/core/src/test/resources/spark-events/local-1430917381535_1 b/core/src/test/resources/spark-events/local-1430917381535_1
new file mode 100644
index 0000000000000..d5a1303344825
--- /dev/null
+++ b/core/src/test/resources/spark-events/local-1430917381535_1
@@ -0,0 +1,5 @@
+{"Event":"SparkListenerLogStart","Spark Version":"1.4.0-SNAPSHOT"}
+{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"localhost","Port":61103},"Maximum Memory":278019440,"Timestamp":1430917380880}
+{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","Java Version":"1.8.0_25 (Oracle Corporation)","Scala Version":"version 2.10.4"},"Spark Properties":{"spark.driver.host":"192.168.1.102","spark.eventLog.enabled":"true","spark.driver.port":"61101","spark.repl.class.uri":"http://192.168.1.102:61100","spark.jars":"","spark.app.name":"Spark shell","spark.scheduler.mode":"FIFO","spark.executor.id":"driver","spark.master":"local[*]","spark.eventLog.dir":"/Users/irashid/github/kraps/core/src/test/resources/spark-events","spark.fileserver.uri":"http://192.168.1.102:61102","spark.tachyonStore.folderName":"spark-aaaf41b3-d1dd-447f-8951-acf51490758b","spark.app.id":"local-1430917381534"},"System Properties":{"java.io.tmpdir":"/var/folders/36/m29jw1z95qv4ywb1c4n0rz000000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle Corporation","java.vm.specification.version":"1.8","user.home":"/Users/irashid","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib","user.dir":"/Users/irashid/github/spark","java.library.path":"/Users/irashid/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"25.25-b02","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.8.0_25-b17","java.vm.info":"mixed mode","java.ext.dirs":"/Users/irashid/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"52.0","scala.usejavacp":"true","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.9.5","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"irashid","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --conf spark.eventLog.enabled=true --conf spark.eventLog.dir=/Users/irashid/github/kraps/core/src/test/resources/spark-events --class org.apache.spark.repl.Main spark-shell","java.home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","java.version":"1.8.0_25","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/etc/hadoop":"System Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-rdbms-3.2.9.jar":"System Classpath","/Users/irashid/github/spark/conf/":"System Classpath","/Users/irashid/github/spark/assembly/target/scala-2.10/spark-assembly-1.4.0-SNAPSHOT-hadoop2.5.0.jar":"System Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-core-3.2.10.jar":"System Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-api-jdo-3.2.6.jar":"System Classpath"}}
+{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"local-1430917381535","Timestamp":1430917380880,"User":"irashid","App Attempt ID":"1"}
+{"Event":"SparkListenerApplicationEnd","Timestamp":1430917380890}
\ No newline at end of file
diff --git a/core/src/test/resources/spark-events/local-1430917381535_2 b/core/src/test/resources/spark-events/local-1430917381535_2
new file mode 100644
index 0000000000000..abb637a22e1e3
--- /dev/null
+++ b/core/src/test/resources/spark-events/local-1430917381535_2
@@ -0,0 +1,5 @@
+{"Event":"SparkListenerLogStart","Spark Version":"1.4.0-SNAPSHOT"}
+{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"localhost","Port":61103},"Maximum Memory":278019440,"Timestamp":1430917380893}
+{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","Java Version":"1.8.0_25 (Oracle Corporation)","Scala Version":"version 2.10.4"},"Spark Properties":{"spark.driver.host":"192.168.1.102","spark.eventLog.enabled":"true","spark.driver.port":"61101","spark.repl.class.uri":"http://192.168.1.102:61100","spark.jars":"","spark.app.name":"Spark shell","spark.scheduler.mode":"FIFO","spark.executor.id":"driver","spark.master":"local[*]","spark.eventLog.dir":"/Users/irashid/github/kraps/core/src/test/resources/spark-events","spark.fileserver.uri":"http://192.168.1.102:61102","spark.tachyonStore.folderName":"spark-aaaf41b3-d1dd-447f-8951-acf51490758b","spark.app.id":"local-1430917381534"},"System Properties":{"java.io.tmpdir":"/var/folders/36/m29jw1z95qv4ywb1c4n0rz000000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle Corporation","java.vm.specification.version":"1.8","user.home":"/Users/irashid","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib","user.dir":"/Users/irashid/github/spark","java.library.path":"/Users/irashid/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"25.25-b02","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.8.0_25-b17","java.vm.info":"mixed mode","java.ext.dirs":"/Users/irashid/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"52.0","scala.usejavacp":"true","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.9.5","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"irashid","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --conf spark.eventLog.enabled=true --conf spark.eventLog.dir=/Users/irashid/github/kraps/core/src/test/resources/spark-events --class org.apache.spark.repl.Main spark-shell","java.home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","java.version":"1.8.0_25","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/etc/hadoop":"System Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-rdbms-3.2.9.jar":"System Classpath","/Users/irashid/github/spark/conf/":"System Classpath","/Users/irashid/github/spark/assembly/target/scala-2.10/spark-assembly-1.4.0-SNAPSHOT-hadoop2.5.0.jar":"System Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-core-3.2.10.jar":"System Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-api-jdo-3.2.6.jar":"System Classpath"}}
+{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"local-1430917381535","Timestamp":1430917380893,"User":"irashid","App Attempt ID":"2"}
+{"Event":"SparkListenerApplicationEnd","Timestamp":1430917380950}
\ No newline at end of file
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
index 0f6933df9e6bc..09075eeb539aa 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
@@ -17,12 +17,16 @@
 
 package org.apache.spark.deploy.history
 
-import java.io.{BufferedOutputStream, File, FileOutputStream, OutputStreamWriter}
+import java.io.{BufferedOutputStream, ByteArrayInputStream, ByteArrayOutputStream, File,
+  FileOutputStream, OutputStreamWriter}
 import java.net.URI
 import java.util.concurrent.TimeUnit
+import java.util.zip.{ZipInputStream, ZipOutputStream}
 
 import scala.io.Source
 
+import com.google.common.base.Charsets
+import com.google.common.io.{ByteStreams, Files}
 import org.apache.hadoop.fs.Path
 import org.json4s.jackson.JsonMethods._
 import org.scalatest.BeforeAndAfter
@@ -335,6 +339,40 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
     assert(!log2.exists())
   }
 
+  test("Event log copy") {
+    val provider = new FsHistoryProvider(createTestConf())
+    val logs = (1 to 2).map { i =>
+      val log = newLogFile("downloadApp1", Some(s"attempt$i"), inProgress = false)
+      writeFile(log, true, None,
+        SparkListenerApplicationStart(
+          "downloadApp1", Some("downloadApp1"), 5000 * i, "test", Some(s"attempt$i")),
+        SparkListenerApplicationEnd(5001 * i)
+      )
+      log
+    }
+    provider.checkForLogs()
+
+    (1 to 2).foreach { i =>
+      val underlyingStream = new ByteArrayOutputStream()
+      val outputStream = new ZipOutputStream(underlyingStream)
+      provider.writeEventLogs("downloadApp1", Some(s"attempt$i"), outputStream)
+      outputStream.close()
+      val inputStream = new ZipInputStream(new ByteArrayInputStream(underlyingStream.toByteArray))
+      var totalEntries = 0
+      var entry = inputStream.getNextEntry
+      entry should not be null
+      while (entry != null) {
+        val actual = new String(ByteStreams.toByteArray(inputStream), Charsets.UTF_8)
+        val expected = Files.toString(logs.find(_.getName == entry.getName).get, Charsets.UTF_8)
+        actual should be (expected)
+        totalEntries += 1
+        entry = inputStream.getNextEntry
+      }
+      totalEntries should be (1)
+      inputStream.close()
+    }
+  }
+
   /**
    * Asks the provider to check for logs and calls a function to perform checks on the updated
    * app list. Example:
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
index 14f2d1a5894b8..e5b5e1bb65337 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
@@ -16,10 +16,13 @@
  */
 package org.apache.spark.deploy.history
 
-import java.io.{File, FileInputStream, FileWriter, IOException}
+import java.io.{File, FileInputStream, FileWriter, InputStream, IOException}
 import java.net.{HttpURLConnection, URL}
+import java.util.zip.ZipInputStream
 import javax.servlet.http.{HttpServletRequest, HttpServletResponse}
 
+import com.google.common.base.Charsets
+import com.google.common.io.{ByteStreams, Files}
 import org.apache.commons.io.{FileUtils, IOUtils}
 import org.mockito.Mockito.when
 import org.scalatest.{BeforeAndAfter, Matchers}
@@ -147,6 +150,70 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers
     }
   }
 
+  test("download all logs for app with multiple attempts") {
+    doDownloadTest("local-1430917381535", None)
+  }
+
+  test("download one log for app with multiple attempts") {
+    (1 to 2).foreach { attemptId => doDownloadTest("local-1430917381535", Some(attemptId)) }
+  }
+
+  test("download legacy logs - all attempts") {
+    doDownloadTest("local-1426533911241", None, legacy = true)
+  }
+
+  test("download legacy logs - single  attempts") {
+    (1 to 2). foreach {
+      attemptId => doDownloadTest("local-1426533911241", Some(attemptId), legacy = true)
+    }
+  }
+
+  // Test that the files are downloaded correctly, and validate them.
+  def doDownloadTest(appId: String, attemptId: Option[Int], legacy: Boolean = false): Unit = {
+
+    val url = attemptId match {
+      case Some(id) =>
+        new URL(s"${generateURL(s"applications/$appId")}/$id/logs")
+      case None =>
+        new URL(s"${generateURL(s"applications/$appId")}/logs")
+    }
+
+    val (code, inputStream, error) = HistoryServerSuite.connectAndGetInputStream(url)
+    code should be (HttpServletResponse.SC_OK)
+    inputStream should not be None
+    error should be (None)
+
+    val zipStream = new ZipInputStream(inputStream.get)
+    var entry = zipStream.getNextEntry
+    entry should not be null
+    val totalFiles = {
+      if (legacy) {
+        attemptId.map { x => 3 }.getOrElse(6)
+      } else {
+        attemptId.map { x => 1 }.getOrElse(2)
+      }
+    }
+    var filesCompared = 0
+    while (entry != null) {
+      if (!entry.isDirectory) {
+        val expectedFile = {
+          if (legacy) {
+            val splits = entry.getName.split("/")
+            new File(new File(logDir, splits(0)), splits(1))
+          } else {
+            new File(logDir, entry.getName)
+          }
+        }
+        val expected = Files.toString(expectedFile, Charsets.UTF_8)
+        val actual = new String(ByteStreams.toByteArray(zipStream), Charsets.UTF_8)
+        actual should be (expected)
+        filesCompared += 1
+      }
+      entry = zipStream.getNextEntry
+    }
+    filesCompared should be (totalFiles)
+  }
+
   test("response codes on bad paths") {
     val badAppId = getContentAndCode("applications/foobar")
     badAppId._1 should be (HttpServletResponse.SC_NOT_FOUND)
@@ -202,7 +269,11 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers
   }
 
   def getUrl(path: String): String = {
-    HistoryServerSuite.getUrl(new URL(s"http://localhost:$port/api/v1/$path"))
+    HistoryServerSuite.getUrl(generateURL(path))
+  }
+
+  def generateURL(path: String): URL = {
+    new URL(s"http://localhost:$port/api/v1/$path")
   }
 
   def generateExpectation(name: String, path: String): Unit = {
@@ -233,13 +304,18 @@ object HistoryServerSuite {
   }
 
   def getContentAndCode(url: URL): (Int, Option[String], Option[String]) = {
+    val (code, in, errString) = connectAndGetInputStream(url)
+    val inString = in.map(IOUtils.toString)
+    (code, inString, errString)
+  }
+
+  def connectAndGetInputStream(url: URL): (Int, Option[InputStream], Option[String]) = {
     val connection = url.openConnection().asInstanceOf[HttpURLConnection]
     connection.setRequestMethod("GET")
     connection.connect()
     val code = connection.getResponseCode()
-    val inString = try {
-      val in = Option(connection.getInputStream())
-      in.map(IOUtils.toString)
+    val inStream = try {
+      Option(connection.getInputStream())
     } catch {
       case io: IOException => None
     }
@@ -249,7 +325,7 @@ object HistoryServerSuite {
     } catch {
       case io: IOException => None
     }
-    (code, inString, errString)
+    (code, inStream, errString)
   }
 
 
diff --git a/docs/monitoring.md b/docs/monitoring.md
index e75018499003a..31ecddc6dbbb9 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -228,6 +228,14 @@ for a running application, at `http://localhost:4040/api/v1`.
     <td><code>/applications/[app-id]/storage/rdd/[rdd-id]</code></td>
     <td>Details for the storage status of a given RDD</td>
   </tr>
+  <tr>
+    <td><code>/applications/[app-id]/logs</code></td>
+    <td>Download the event logs for all attempts of the given application as a zip file</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/[attempt-id/logs</code></td>
+    <td>Download the event logs for the specified attempt of the given application as a zip file</td>
+  </tr>
 </table>
 
 When running on Yarn, each application has multiple attempts, so `[app-id]` is actually

From 708c63bbbe9580eb774fe47e23ef61338103afda Mon Sep 17 00:00:00 2001
From: Sun Rui <rui.sun@intel.com>
Date: Wed, 3 Jun 2015 11:56:35 -0700
Subject: [PATCH 337/525] [SPARK-8063] [SPARKR] Spark master URL conflict
 between MASTER env variable and --master command line option.

Author: Sun Rui <rui.sun@intel.com>

Closes #6605 from sun-rui/SPARK-8063 and squashes the following commits:

51ca48b [Sun Rui] [SPARK-8063][SPARKR] Spark master URL conflict between MASTER env variable and --master command line option.
---
 R/pkg/inst/profile/shell.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/inst/profile/shell.R b/R/pkg/inst/profile/shell.R
index ca94f1d4e7fd5..773b6ecf582d9 100644
--- a/R/pkg/inst/profile/shell.R
+++ b/R/pkg/inst/profile/shell.R
@@ -24,7 +24,7 @@
   old <- getOption("defaultPackages")
   options(defaultPackages = c(old, "SparkR"))
 
-  sc <- SparkR::sparkR.init(Sys.getenv("MASTER", unset = ""))
+  sc <- SparkR::sparkR.init()
   assign("sc", sc, envir=.GlobalEnv)
   sqlContext <- SparkR::sparkRSQL.init(sc)
   assign("sqlContext", sqlContext, envir=.GlobalEnv)

From 939e4f3d8def16dfe03f0196be8e1c218a9daa32 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 3 Jun 2015 13:57:57 -0700
Subject: [PATCH 338/525] [SPARK-8074] Parquet should throw AnalysisException
 during setup for data type/name related failures.

Author: Reynold Xin <rxin@databricks.com>

Closes #6608 from rxin/parquet-analysis and squashes the following commits:

b5dc8e2 [Reynold Xin] Code review feedback.
5617cf6 [Reynold Xin] [SPARK-8074] Parquet should throw AnalysisException during setup for data type/name related failures.
---
 .../spark/sql/parquet/ParquetTypes.scala      | 20 +++++++++----------
 .../apache/spark/sql/parquet/newParquet.scala | 14 +++++++------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
index 6698b19c7477d..f8a5d84549336 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.parquet
 
 import java.io.IOException
 
-import scala.collection.mutable.ArrayBuffer
+import scala.collection.JavaConversions._
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configuration
@@ -33,12 +33,11 @@ import parquet.schema.PrimitiveType.{PrimitiveTypeName => ParquetPrimitiveTypeNa
 import parquet.schema.Type.Repetition
 import parquet.schema.{ConversionPatterns, DecimalMetadata, GroupType => ParquetGroupType, MessageType, OriginalType => ParquetOriginalType, PrimitiveType => ParquetPrimitiveType, Type => ParquetType, Types => ParquetTypes}
 
+import org.apache.spark.Logging
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.types._
-import org.apache.spark.{Logging, SparkException}
 
-// Implicits
-import scala.collection.JavaConversions._
 
 /** A class representing Parquet info fields we care about, for passing back to Parquet */
 private[parquet] case class ParquetTypeInfo(
@@ -73,13 +72,12 @@ private[parquet] object ParquetTypesConverter extends Logging {
       case ParquetPrimitiveTypeName.INT96 if int96AsTimestamp => TimestampType
       case ParquetPrimitiveTypeName.INT96 =>
         // TODO: add BigInteger type? TODO(andre) use DecimalType instead????
-        sys.error("Potential loss of precision: cannot convert INT96")
+        throw new AnalysisException("Potential loss of precision: cannot convert INT96")
       case ParquetPrimitiveTypeName.FIXED_LEN_BYTE_ARRAY
         if (originalType == ParquetOriginalType.DECIMAL && decimalInfo.getPrecision <= 18) =>
           // TODO: for now, our reader only supports decimals that fit in a Long
           DecimalType(decimalInfo.getPrecision, decimalInfo.getScale)
-      case _ => sys.error(
-        s"Unsupported parquet datatype $parquetType")
+      case _ => throw new AnalysisException(s"Unsupported parquet datatype $parquetType")
     }
   }
 
@@ -371,7 +369,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
             parquetKeyType,
             parquetValueType)
         }
-        case _ => sys.error(s"Unsupported datatype $ctype")
+        case _ => throw new AnalysisException(s"Unsupported datatype $ctype")
       }
     }
   }
@@ -403,7 +401,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
   def convertFromString(string: String): Seq[Attribute] = {
     Try(DataType.fromJson(string)).getOrElse(DataType.fromCaseClassString(string)) match {
       case s: StructType => s.toAttributes
-      case other => sys.error(s"Can convert $string to row")
+      case other => throw new AnalysisException(s"Can convert $string to row")
     }
   }
 
@@ -411,8 +409,8 @@ private[parquet] object ParquetTypesConverter extends Logging {
     // ,;{}()\n\t= and space character are special characters in Parquet schema
     schema.map(_.name).foreach { name =>
       if (name.matches(".*[ ,;{}()\n\t=].*")) {
-        sys.error(
-          s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\n\t=".
+        throw new AnalysisException(
+          s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\\n\\t=".
              |Please use alias to rename it.
            """.stripMargin.split("\n").mkString(" "))
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 824ae36968c32..bf55e2383ab56 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -39,6 +39,7 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD._
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.sql.{Row, SQLConf, SQLContext}
@@ -83,7 +84,7 @@ private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext
             case partFilePattern(id) => id.toInt
             case name if name.startsWith("_") => 0
             case name if name.startsWith(".") => 0
-            case name => sys.error(
+            case name => throw new AnalysisException(
               s"Trying to write Parquet files to directory $outputPath, " +
                 s"but found items with illegal name '$name'.")
           }.reduceOption(_ max _).getOrElse(0)
@@ -380,11 +381,12 @@ private[sql] class ParquetRelation2(
       // time-consuming.
       if (dataSchema == null) {
         dataSchema = {
-          val dataSchema0 =
-            maybeDataSchema
-              .orElse(readSchema())
-              .orElse(maybeMetastoreSchema)
-              .getOrElse(sys.error("Failed to get the schema."))
+          val dataSchema0 = maybeDataSchema
+            .orElse(readSchema())
+            .orElse(maybeMetastoreSchema)
+            .getOrElse(throw new AnalysisException(
+              s"Failed to discover schema of Parquet file(s) in the following location(s):\n" +
+                paths.mkString("\n\t")))
 
           // If this Parquet relation is converted from a Hive Metastore table, must reconcile case
           // case insensitivity issue and possible schema mismatch (probably caused by schema

From 2c5a06cafd2885ff5431fa96485db2564ae1cce3 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 3 Jun 2015 14:19:10 -0700
Subject: [PATCH 339/525] Update documentation for [SPARK-7980] [SQL] Support
 SQLContext.range(end)

---
 python/pyspark/sql/context.py                 |  2 ++
 .../org/apache/spark/sql/SQLContext.scala     | 20 +++++++++----------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 1bebfc48376b4..599c9ac5794a2 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -146,6 +146,8 @@ def range(self, start, end=None, step=1, numPartitions=None):
         >>> sqlContext.range(1, 7, 2).collect()
         [Row(id=1), Row(id=3), Row(id=5)]
 
+        If only one argument is specified, it will be used as the end value.
+
         >>> sqlContext.range(3).collect()
         [Row(id=0), Row(id=1), Row(id=2)]
         """
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index f08fb4fafe650..0aab7fa8709b8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -705,33 +705,33 @@ class SQLContext(@transient val sparkContext: SparkContext)
   /**
    * :: Experimental ::
    * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements
-   * in an range from `start` to `end`(exclusive) with step value 1.
+   * in an range from 0 to `end` (exclusive) with step value 1.
    *
-   * @since 1.4.0
+   * @since 1.4.1
    * @group dataframe
    */
   @Experimental
-  def range(start: Long, end: Long): DataFrame = {
-    createDataFrame(
-      sparkContext.range(start, end).map(Row(_)),
-      StructType(StructField("id", LongType, nullable = false) :: Nil))
-  }
+  def range(end: Long): DataFrame = range(0, end)
 
   /**
    * :: Experimental ::
    * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements
-   * in an range from 0 to `end`(exclusive) with step value 1.
+   * in an range from `start` to `end` (exclusive) with step value 1.
    *
    * @since 1.4.0
    * @group dataframe
    */
   @Experimental
-  def range(end: Long): DataFrame = range(0, end)
+  def range(start: Long, end: Long): DataFrame = {
+    createDataFrame(
+      sparkContext.range(start, end).map(Row(_)),
+      StructType(StructField("id", LongType, nullable = false) :: Nil))
+  }
 
   /**
    * :: Experimental ::
    * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements
-   * in an range from `start` to `end`(exclusive) with an step value, with partition number
+   * in an range from `start` to `end` (exclusive) with an step value, with partition number
    * specified.
    *
    * @since 1.4.0

From 20a26b595c74ac41cf7c19e6091d7e675e503321 Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Wed, 3 Jun 2015 14:34:20 -0700
Subject: [PATCH 340/525] [SPARK-8054] [MLLIB] Added several Java-friendly APIs
 + unit tests

Java-friendly APIs added:
* GaussianMixture.run()
* GaussianMixtureModel.predict()
* DistributedLDAModel.javaTopicDistributions()
* StreamingKMeans: trainOn, predictOn, predictOnValues
* Statistics.corr
* params
  * added doc to w() since Java docs do not inherit doc
  * removed non-Java-friendly w() from StringArrayParam and DoubleArrayParam
  * made DoubleArrayParam Java-friendly w() actually Java-friendly

I generated the doc and verified all changes.

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #6562 from jkbradley/java-api-1.4 and squashes the following commits:

c16821b [Joseph K. Bradley] Small fixes based on code review.
d955581 [Joseph K. Bradley] unit test fixes
29b6b0d [Joseph K. Bradley] small fixes
fe6dcfe [Joseph K. Bradley] Added several Java-friendly APIs + unit tests: NaiveBayes, GaussianMixture, LDA, StreamingKMeans, Statistics.corr, params
---
 .../org/apache/spark/ml/param/params.scala    | 20 ++---
 .../mllib/clustering/GaussianMixture.scala    |  4 +
 .../clustering/GaussianMixtureModel.scala     |  7 +-
 .../spark/mllib/clustering/LDAModel.scala     |  6 ++
 .../mllib/clustering/StreamingKMeans.scala    | 18 ++++
 .../apache/spark/mllib/stat/Statistics.scala  |  9 ++
 .../spark/ml/param/JavaParamsSuite.java       |  1 +
 .../apache/spark/ml/param/JavaTestParams.java | 29 +++++--
 .../JavaStreamingLogisticRegressionSuite.java |  3 +-
 .../clustering/JavaGaussianMixtureSuite.java  | 64 +++++++++++++++
 .../spark/mllib/clustering/JavaLDASuite.java  |  4 +
 .../clustering/JavaStreamingKMeansSuite.java  | 82 +++++++++++++++++++
 .../spark/mllib/stat/JavaStatisticsSuite.java | 56 +++++++++++++
 13 files changed, 284 insertions(+), 19 deletions(-)
 rename mllib/src/test/java/org/apache/spark/{ml => mllib}/classification/JavaStreamingLogisticRegressionSuite.java (95%)
 create mode 100644 mllib/src/test/java/org/apache/spark/mllib/clustering/JavaGaussianMixtureSuite.java
 create mode 100644 mllib/src/test/java/org/apache/spark/mllib/clustering/JavaStreamingKMeansSuite.java
 create mode 100644 mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 473488dce9b0d..ba94d6a3a80a9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -69,14 +69,10 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali
     }
   }
 
-  /**
-   * Creates a param pair with the given value (for Java).
-   */
+  /** Creates a param pair with the given value (for Java). */
   def w(value: T): ParamPair[T] = this -> value
 
-  /**
-   * Creates a param pair with the given value (for Scala).
-   */
+  /** Creates a param pair with the given value (for Scala). */
   def ->(value: T): ParamPair[T] = ParamPair(this, value)
 
   override final def toString: String = s"${parent}__$name"
@@ -190,6 +186,7 @@ class DoubleParam(parent: String, name: String, doc: String, isValid: Double =>
 
   def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
 
+  /** Creates a param pair with the given value (for Java). */
   override def w(value: Double): ParamPair[Double] = super.w(value)
 }
 
@@ -209,6 +206,7 @@ class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolea
 
   def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
 
+  /** Creates a param pair with the given value (for Java). */
   override def w(value: Int): ParamPair[Int] = super.w(value)
 }
 
@@ -228,6 +226,7 @@ class FloatParam(parent: String, name: String, doc: String, isValid: Float => Bo
 
   def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
 
+  /** Creates a param pair with the given value (for Java). */
   override def w(value: Float): ParamPair[Float] = super.w(value)
 }
 
@@ -247,6 +246,7 @@ class LongParam(parent: String, name: String, doc: String, isValid: Long => Bool
 
   def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
 
+  /** Creates a param pair with the given value (for Java). */
   override def w(value: Long): ParamPair[Long] = super.w(value)
 }
 
@@ -260,6 +260,7 @@ class BooleanParam(parent: String, name: String, doc: String) // No need for isV
 
   def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
 
+  /** Creates a param pair with the given value (for Java). */
   override def w(value: Boolean): ParamPair[Boolean] = super.w(value)
 }
 
@@ -274,8 +275,6 @@ class StringArrayParam(parent: Params, name: String, doc: String, isValid: Array
   def this(parent: Params, name: String, doc: String) =
     this(parent, name, doc, ParamValidators.alwaysTrue)
 
-  override def w(value: Array[String]): ParamPair[Array[String]] = super.w(value)
-
   /** Creates a param pair with a [[java.util.List]] of values (for Java and Python). */
   def w(value: java.util.List[String]): ParamPair[Array[String]] = w(value.asScala.toArray)
 }
@@ -291,10 +290,9 @@ class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array
   def this(parent: Params, name: String, doc: String) =
     this(parent, name, doc, ParamValidators.alwaysTrue)
 
-  override def w(value: Array[Double]): ParamPair[Array[Double]] = super.w(value)
-
   /** Creates a param pair with a [[java.util.List]] of values (for Java and Python). */
-  def w(value: java.util.List[Double]): ParamPair[Array[Double]] = w(value.asScala.toArray)
+  def w(value: java.util.List[java.lang.Double]): ParamPair[Array[Double]] =
+    w(value.asScala.map(_.asInstanceOf[Double]).toArray)
 }
 
 /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 70b0e40948e51..fc509d2ba1470 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -22,6 +22,7 @@ import scala.collection.mutable.IndexedSeq
 import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, Vector => BV}
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, Matrices, Vector, Vectors}
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.util.MLUtils
@@ -188,6 +189,9 @@ class GaussianMixture private (
     new GaussianMixtureModel(weights, gaussians)
   }
 
+  /** Java-friendly version of [[run()]] */
+  def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd)
+
   /** Average of dense breeze vectors */
   private def vectorMean(x: IndexedSeq[BV[Double]]): BDV[Double] = {
     val v = BDV.zeros[Double](x(0).length)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 5fc2cb1b62d33..cb807c8038101 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -25,6 +25,7 @@ import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.mllib.linalg.{Vector, Matrices, Matrix}
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.util.{MLUtils, Loader, Saveable}
@@ -46,7 +47,7 @@ import org.apache.spark.sql.{SQLContext, Row}
 @Experimental
 class GaussianMixtureModel(
   val weights: Array[Double],
-  val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable{
+  val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {
 
   require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")
 
@@ -65,6 +66,10 @@ class GaussianMixtureModel(
     responsibilityMatrix.map(r => r.indexOf(r.max))
   }
 
+  /** Java-friendly version of [[predict()]] */
+  def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
+    predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]]
+
   /**
    * Given the input vectors, return the membership value of each vector
    * to all mixture components.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 6cf26445f20a0..974b26924dfb8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering
 import breeze.linalg.{DenseMatrix => BDM, normalize, sum => brzSum}
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaPairRDD
 import org.apache.spark.graphx.{VertexId, EdgeContext, Graph}
 import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix}
 import org.apache.spark.rdd.RDD
@@ -345,6 +346,11 @@ class DistributedLDAModel private (
     }
   }
 
+  /** Java-friendly version of [[topicDistributions]] */
+  def javaTopicDistributions: JavaPairRDD[java.lang.Long, Vector] = {
+    JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, Vector)]])
+  }
+
   // TODO:
   // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ???
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index c21e4fe7dc9b6..d9b34cec64894 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -21,8 +21,10 @@ import scala.reflect.ClassTag
 
 import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaSparkContext._
 import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStream}
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.util.Utils
 import org.apache.spark.util.random.XORShiftRandom
@@ -234,6 +236,9 @@ class StreamingKMeans(
     }
   }
 
+  /** Java-friendly version of `trainOn`. */
+  def trainOn(data: JavaDStream[Vector]): Unit = trainOn(data.dstream)
+
   /**
    * Use the clustering model to make predictions on batches of data from a DStream.
    *
@@ -245,6 +250,11 @@ class StreamingKMeans(
     data.map(model.predict)
   }
 
+  /** Java-friendly version of `predictOn`. */
+  def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Integer] = {
+    JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Integer]])
+  }
+
   /**
    * Use the model to make predictions on the values of a DStream and carry over its keys.
    *
@@ -257,6 +267,14 @@ class StreamingKMeans(
     data.mapValues(model.predict)
   }
 
+  /** Java-friendly version of `predictOnValues`. */
+  def predictOnValues[K](
+      data: JavaPairDStream[K, Vector]): JavaPairDStream[K, java.lang.Integer] = {
+    implicit val tag = fakeClassTag[K]
+    JavaPairDStream.fromPairDStream(
+      predictOnValues(data.dstream).asInstanceOf[DStream[(K, java.lang.Integer)]])
+  }
+
   /** Check whether cluster centers have been initialized. */
   private[this] def assertInitialized(): Unit = {
     if (model.clusterCenters == null) {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index b3fad0c52d655..900007ec6bc74 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.mllib.stat
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.mllib.linalg.distributed.RowMatrix
 import org.apache.spark.mllib.linalg.{Matrix, Vector}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -80,6 +81,10 @@ object Statistics {
    */
   def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
 
+  /** Java-friendly version of [[corr()]] */
+  def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double =
+    corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]])
+
   /**
    * Compute the correlation for the input RDDs using the specified method.
    * Methods currently supported: `pearson` (default), `spearman`.
@@ -96,6 +101,10 @@ object Statistics {
    */
   def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
 
+  /** Java-friendly version of [[corr()]] */
+  def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double =
+    corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]], method)
+
   /**
    * Conduct Pearson's chi-squared goodness of fit test of the observed data against the
    * expected distribution.
diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java
index e7df10dfa63ac..9890155e9f865 100644
--- a/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java
@@ -50,6 +50,7 @@ public void testParams() {
     testParams.setMyIntParam(2).setMyDoubleParam(0.4).setMyStringParam("a");
     Assert.assertEquals(testParams.getMyDoubleParam(), 0.4, 0.0);
     Assert.assertEquals(testParams.getMyStringParam(), "a");
+    Assert.assertArrayEquals(testParams.getMyDoubleArrayParam(), new double[] {1.0, 2.0}, 0.0);
   }
 
   @Test
diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
index 947ae3a2ce06f..ff5929235ac2c 100644
--- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
+++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
@@ -51,7 +51,8 @@ public String uid() {
   public int getMyIntParam() { return (Integer)getOrDefault(myIntParam_); }
 
   public JavaTestParams setMyIntParam(int value) {
-    set(myIntParam_, value); return this;
+    set(myIntParam_, value);
+    return this;
   }
 
   private DoubleParam myDoubleParam_;
@@ -60,7 +61,8 @@ public JavaTestParams setMyIntParam(int value) {
   public double getMyDoubleParam() { return (Double)getOrDefault(myDoubleParam_); }
 
   public JavaTestParams setMyDoubleParam(double value) {
-    set(myDoubleParam_, value); return this;
+    set(myDoubleParam_, value);
+    return this;
   }
 
   private Param<String> myStringParam_;
@@ -69,7 +71,18 @@ public JavaTestParams setMyDoubleParam(double value) {
   public String getMyStringParam() { return getOrDefault(myStringParam_); }
 
   public JavaTestParams setMyStringParam(String value) {
-    set(myStringParam_, value); return this;
+    set(myStringParam_, value);
+    return this;
+  }
+
+  private DoubleArrayParam myDoubleArrayParam_;
+  public DoubleArrayParam myDoubleArrayParam() { return myDoubleArrayParam_; }
+
+  public double[] getMyDoubleArrayParam() { return getOrDefault(myDoubleArrayParam_); }
+
+  public JavaTestParams setMyDoubleArrayParam(double[] value) {
+    set(myDoubleArrayParam_, value);
+    return this;
   }
 
   private void init() {
@@ -79,8 +92,14 @@ private void init() {
     List<String> validStrings = Lists.newArrayList("a", "b");
     myStringParam_ = new Param<String>(this, "myStringParam", "this is a string param",
       ParamValidators.inArray(validStrings));
-    setDefault(myIntParam_, 1);
-    setDefault(myDoubleParam_, 0.5);
+    myDoubleArrayParam_ =
+      new DoubleArrayParam(this, "myDoubleArrayParam", "this is a double param");
+
+    setDefault(myIntParam(), 1);
+    setDefault(myIntParam().w(1));
+    setDefault(myDoubleParam(), 0.5);
     setDefault(myIntParam().w(1), myDoubleParam().w(0.5));
+    setDefault(myDoubleArrayParam(), new double[] {1.0, 2.0});
+    setDefault(myDoubleArrayParam().w(new double[] {1.0, 2.0}));
   }
 }
diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaStreamingLogisticRegressionSuite.java
similarity index 95%
rename from mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java
rename to mllib/src/test/java/org/apache/spark/mllib/classification/JavaStreamingLogisticRegressionSuite.java
index 640d2ec55e4e7..55787f8606d48 100644
--- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaStreamingLogisticRegressionSuite.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.spark.ml.classification;
+package org.apache.spark.mllib.classification;
 
 import java.io.Serializable;
 import java.util.List;
@@ -28,7 +28,6 @@
 import org.junit.Test;
 
 import org.apache.spark.SparkConf;
-import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD;
 import org.apache.spark.mllib.linalg.Vector;
 import org.apache.spark.mllib.linalg.Vectors;
 import org.apache.spark.mllib.regression.LabeledPoint;
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaGaussianMixtureSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaGaussianMixtureSuite.java
new file mode 100644
index 0000000000000..467a7a69e8f30
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaGaussianMixtureSuite.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.clustering;
+
+import java.io.Serializable;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+
+public class JavaGaussianMixtureSuite implements Serializable {
+  private transient JavaSparkContext sc;
+
+  @Before
+  public void setUp() {
+    sc = new JavaSparkContext("local", "JavaGaussianMixture");
+  }
+
+  @After
+  public void tearDown() {
+    sc.stop();
+    sc = null;
+  }
+
+  @Test
+  public void runGaussianMixture() {
+    List<Vector> points = Lists.newArrayList(
+      Vectors.dense(1.0, 2.0, 6.0),
+      Vectors.dense(1.0, 3.0, 0.0),
+      Vectors.dense(1.0, 4.0, 6.0)
+    );
+
+    JavaRDD<Vector> data = sc.parallelize(points, 2);
+    GaussianMixtureModel model = new GaussianMixture().setK(2).setMaxIterations(1).setSeed(1234)
+      .run(data);
+    assertEquals(model.gaussians().length, 2);
+    JavaRDD<Integer> predictions = model.predict(data);
+    predictions.first();
+  }
+}
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index 96c2da169961f..581c033f08ebe 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -107,6 +107,10 @@ public void distributedLDAModel() {
     // Check: log probabilities
     assert(model.logLikelihood() < 0.0);
     assert(model.logPrior() < 0.0);
+
+    // Check: topic distributions
+    JavaPairRDD<Long, Vector> topicDistributions = model.javaTopicDistributions();
+    assertEquals(topicDistributions.count(), corpus.count());
   }
 
   @Test
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaStreamingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaStreamingKMeansSuite.java
new file mode 100644
index 0000000000000..3b0e879eec77f
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaStreamingKMeansSuite.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.clustering;
+
+import java.io.Serializable;
+import java.util.List;
+
+import scala.Tuple2;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.apache.spark.streaming.JavaTestUtils.*;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.streaming.Duration;
+import org.apache.spark.streaming.api.java.JavaDStream;
+import org.apache.spark.streaming.api.java.JavaPairDStream;
+import org.apache.spark.streaming.api.java.JavaStreamingContext;
+
+public class JavaStreamingKMeansSuite implements Serializable {
+
+  protected transient JavaStreamingContext ssc;
+
+  @Before
+  public void setUp() {
+    SparkConf conf = new SparkConf()
+      .setMaster("local[2]")
+      .setAppName("test")
+      .set("spark.streaming.clock", "org.apache.spark.util.ManualClock");
+    ssc = new JavaStreamingContext(conf, new Duration(1000));
+    ssc.checkpoint("checkpoint");
+  }
+
+  @After
+  public void tearDown() {
+    ssc.stop();
+    ssc = null;
+  }
+
+  @Test
+  @SuppressWarnings("unchecked")
+  public void javaAPI() {
+    List<Vector> trainingBatch = Lists.newArrayList(
+      Vectors.dense(1.0),
+      Vectors.dense(0.0));
+    JavaDStream<Vector> training =
+      attachTestInputStream(ssc, Lists.newArrayList(trainingBatch, trainingBatch), 2);
+    List<Tuple2<Integer, Vector>> testBatch = Lists.newArrayList(
+      new Tuple2<Integer, Vector>(10, Vectors.dense(1.0)),
+      new Tuple2<Integer, Vector>(11, Vectors.dense(0.0)));
+    JavaPairDStream<Integer, Vector> test = JavaPairDStream.fromJavaDStream(
+      attachTestInputStream(ssc, Lists.newArrayList(testBatch, testBatch), 2));
+    StreamingKMeans skmeans = new StreamingKMeans()
+      .setK(1)
+      .setDecayFactor(1.0)
+      .setInitialCenters(new Vector[]{Vectors.dense(1.0)}, new double[]{0.0});
+    skmeans.trainOn(training);
+    JavaPairDStream<Integer, Integer> prediction = skmeans.predictOnValues(test);
+    attachTestOutputStream(prediction.count());
+    runStreams(ssc, 2, 2);
+  }
+}
diff --git a/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java
new file mode 100644
index 0000000000000..62f7f26b7c98f
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat;
+
+import java.io.Serializable;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+public class JavaStatisticsSuite implements Serializable {
+  private transient JavaSparkContext sc;
+
+  @Before
+  public void setUp() {
+    sc = new JavaSparkContext("local", "JavaStatistics");
+  }
+
+  @After
+  public void tearDown() {
+    sc.stop();
+    sc = null;
+  }
+
+  @Test
+  public void testCorr() {
+    JavaRDD<Double> x = sc.parallelize(Lists.newArrayList(1.0, 2.0, 3.0, 4.0));
+    JavaRDD<Double> y = sc.parallelize(Lists.newArrayList(1.1, 2.2, 3.1, 4.3));
+
+    Double corr1 = Statistics.corr(x, y);
+    Double corr2 = Statistics.corr(x, y, "pearson");
+    // Check default method
+    assertEquals(corr1, corr2);
+  }
+}

From c6a6dd0d0736d548ff9f255e5ed5df45b29c46c1 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Wed, 3 Jun 2015 12:10:12 -0700
Subject: [PATCH 341/525] [MINOR] [UI] Improve confusing message on log page

It's good practice to check if the input path is in the directory
we expect to avoid potentially confusing error messages.
---
 .../spark/deploy/worker/ui/LogPage.scala      |  9 +++
 .../scala/org/apache/spark/util/Utils.scala   | 16 +++++
 .../spark/deploy/worker/ui/LogPageSuite.scala | 36 ++++++----
 .../org/apache/spark/util/UtilsSuite.scala    | 65 +++++++++++++++++++
 4 files changed, 115 insertions(+), 11 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
index dc2bee6f2bdca..53f8f9a46cf8d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.deploy.worker.ui
 
+import java.io.File
+import java.net.URI
 import javax.servlet.http.HttpServletRequest
 
 import scala.xml.Node
@@ -135,6 +137,13 @@ private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with
       return ("Error: Log type must be one of " + supportedLogTypes.mkString(", "), 0, 0, 0)
     }
 
+    // Verify that the normalized path of the log directory is in the working directory
+    val normalizedUri = new URI(logDirectory).normalize()
+    val normalizedLogDir = new File(normalizedUri.getPath)
+    if (!Utils.isInDirectory(workDir, normalizedLogDir)) {
+      return ("Error: invalid log directory " + logDirectory, 0, 0, 0)
+    }
+
     try {
       val files = RollingFileAppender.getSortedRolledOverFiles(logDirectory, logType)
       logDebug(s"Sorted log files of type $logType in $logDirectory:\n${files.mkString("\n")}")
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 693e1a0a3d5f0..5f132410540fd 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2227,6 +2227,22 @@ private[spark] object Utils extends Logging {
     }
   }
 
+  /**
+   * Return whether the specified file is a parent directory of the child file.
+   */
+  def isInDirectory(parent: File, child: File): Boolean = {
+    if (child == null || parent == null) {
+      return false
+    }
+    if (!child.exists() || !parent.exists() || !parent.isDirectory()) {
+      return false
+    }
+    if (parent.equals(child)) {
+      return true
+    }
+    isInDirectory(parent, child.getParentFile)
+  }
+
 }
 
 private [util] class SparkShutdownHookManager {
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala
index 572360ddb95d4..72eaffb416981 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.deploy.worker.ui
 
 import java.io.{File, FileWriter}
 
-import org.mockito.Mockito.mock
+import org.mockito.Mockito.{mock, when}
 import org.scalatest.PrivateMethodTester
 
 import org.apache.spark.SparkFunSuite
@@ -28,33 +28,47 @@ class LogPageSuite extends SparkFunSuite with PrivateMethodTester {
 
   test("get logs simple") {
     val webui = mock(classOf[WorkerWebUI])
+    val tmpDir = new File(sys.props("java.io.tmpdir"))
+    val workDir = new File(tmpDir, "work-dir")
+    workDir.mkdir()
+    when(webui.workDir).thenReturn(workDir)
     val logPage = new LogPage(webui)
 
     // Prepare some fake log files to read later
     val out = "some stdout here"
     val err = "some stderr here"
-    val tmpDir = new File(sys.props("java.io.tmpdir"))
-    val tmpOut = new File(tmpDir, "stdout")
-    val tmpErr = new File(tmpDir, "stderr")
-    val tmpRand = new File(tmpDir, "random")
+    val tmpOut = new File(workDir, "stdout")
+    val tmpErr = new File(workDir, "stderr")
+    val tmpErrBad = new File(tmpDir, "stderr") // outside the working directory
+    val tmpOutBad = new File(tmpDir, "stdout")
+    val tmpRand = new File(workDir, "random")
     write(tmpOut, out)
     write(tmpErr, err)
+    write(tmpOutBad, out)
+    write(tmpErrBad, err)
     write(tmpRand, "1 6 4 5 2 7 8")
 
     // Get the logs. All log types other than "stderr" or "stdout" will be rejected
     val getLog = PrivateMethod[(String, Long, Long, Long)]('getLog)
     val (stdout, _, _, _) =
-      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "stdout", None, 100)
+      logPage invokePrivate getLog(workDir.getAbsolutePath, "stdout", None, 100)
     val (stderr, _, _, _) =
-      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "stderr", None, 100)
+      logPage invokePrivate getLog(workDir.getAbsolutePath, "stderr", None, 100)
     val (error1, _, _, _) =
-      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "random", None, 100)
+      logPage invokePrivate getLog(workDir.getAbsolutePath, "random", None, 100)
     val (error2, _, _, _) =
-      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "does-not-exist.txt", None, 100)
+      logPage invokePrivate getLog(workDir.getAbsolutePath, "does-not-exist.txt", None, 100)
+    // These files exist, but live outside the working directory
+    val (error3, _, _, _) =
+      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "stderr", None, 100)
+    val (error4, _, _, _) =
+      logPage invokePrivate getLog(tmpDir.getAbsolutePath, "stdout", None, 100)
     assert(stdout === out)
     assert(stderr === err)
-    assert(error1.startsWith("Error"))
-    assert(error2.startsWith("Error"))
+    assert(error1.startsWith("Error: Log type must be one of "))
+    assert(error2.startsWith("Error: Log type must be one of "))
+    assert(error3.startsWith("Error: invalid log directory"))
+    assert(error4.startsWith("Error: invalid log directory"))
   }
 
   /** Write the specified string to the file. */
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index a867cf83dc3f1..a61ea3918f46a 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -608,4 +608,69 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     manager.runAll()
     assert(output.toList === List(4, 3, 2))
   }
+
+  test("isInDirectory") {
+    val tmpDir = new File(sys.props("java.io.tmpdir"))
+    val parentDir = new File(tmpDir, "parent-dir")
+    val childDir1 = new File(parentDir, "child-dir-1")
+    val childDir1b = new File(parentDir, "child-dir-1b")
+    val childFile1 = new File(parentDir, "child-file-1.txt")
+    val childDir2 = new File(childDir1, "child-dir-2")
+    val childDir2b = new File(childDir1, "child-dir-2b")
+    val childFile2 = new File(childDir1, "child-file-2.txt")
+    val childFile3 = new File(childDir2, "child-file-3.txt")
+    val nullFile: File = null
+    parentDir.mkdir()
+    childDir1.mkdir()
+    childDir1b.mkdir()
+    childDir2.mkdir()
+    childDir2b.mkdir()
+    childFile1.createNewFile()
+    childFile2.createNewFile()
+    childFile3.createNewFile()
+
+    // Identity
+    assert(Utils.isInDirectory(parentDir, parentDir))
+    assert(Utils.isInDirectory(childDir1, childDir1))
+    assert(Utils.isInDirectory(childDir2, childDir2))
+
+    // Valid ancestor-descendant pairs
+    assert(Utils.isInDirectory(parentDir, childDir1))
+    assert(Utils.isInDirectory(parentDir, childFile1))
+    assert(Utils.isInDirectory(parentDir, childDir2))
+    assert(Utils.isInDirectory(parentDir, childFile2))
+    assert(Utils.isInDirectory(parentDir, childFile3))
+    assert(Utils.isInDirectory(childDir1, childDir2))
+    assert(Utils.isInDirectory(childDir1, childFile2))
+    assert(Utils.isInDirectory(childDir1, childFile3))
+    assert(Utils.isInDirectory(childDir2, childFile3))
+
+    // Inverted ancestor-descendant pairs should fail
+    assert(!Utils.isInDirectory(childDir1, parentDir))
+    assert(!Utils.isInDirectory(childDir2, parentDir))
+    assert(!Utils.isInDirectory(childDir2, childDir1))
+    assert(!Utils.isInDirectory(childFile1, parentDir))
+    assert(!Utils.isInDirectory(childFile2, parentDir))
+    assert(!Utils.isInDirectory(childFile3, parentDir))
+    assert(!Utils.isInDirectory(childFile2, childDir1))
+    assert(!Utils.isInDirectory(childFile3, childDir1))
+    assert(!Utils.isInDirectory(childFile3, childDir2))
+
+    // Non-existent files or directories should fail
+    assert(!Utils.isInDirectory(parentDir, new File(parentDir, "one.txt")))
+    assert(!Utils.isInDirectory(parentDir, new File(parentDir, "one/two.txt")))
+    assert(!Utils.isInDirectory(parentDir, new File(parentDir, "one/two/three.txt")))
+
+    // Siblings should fail
+    assert(!Utils.isInDirectory(childDir1, childDir1b))
+    assert(!Utils.isInDirectory(childDir1, childFile1))
+    assert(!Utils.isInDirectory(childDir2, childDir2b))
+    assert(!Utils.isInDirectory(childDir2, childFile2))
+
+    // Null files should fail without throwing NPE
+    assert(!Utils.isInDirectory(parentDir, nullFile))
+    assert(!Utils.isInDirectory(childFile3, nullFile))
+    assert(!Utils.isInDirectory(nullFile, parentDir))
+    assert(!Utils.isInDirectory(nullFile, childFile3))
+  }
 }

From bfbf12b349e998c7e674649a07b88c4658ae0711 Mon Sep 17 00:00:00 2001
From: Timothy Chen <tnachen@gmail.com>
Date: Wed, 3 Jun 2015 14:57:23 -0700
Subject: [PATCH 342/525] [SPARK-8083] [MESOS] Use the correct base path in
 mesos driver page.

Author: Timothy Chen <tnachen@gmail.com>

Closes #6615 from tnachen/mesos_driver_path and squashes the following commits:

4f47b7c [Timothy Chen] Use the correct base path in mesos driver page.
---
 .../scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala b/core/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
index be8560d10fc62..e8ef60bd5428a 100644
--- a/core/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
@@ -68,7 +68,7 @@ private[ui] class DriverPage(parent: MesosClusterUI) extends WebUIPage("driver")
         retryHeaders, retryRow, Iterable.apply(driverState.description.retryState))
     val content =
       <p>Driver state information for driver id {driverId}</p>
-        <a href="/">Back to Drivers</a>
+        <a href={UIUtils.prependBaseUri("/")}>Back to Drivers</a>
         <div class="row-fluid">
           <div class="span12">
             <h4>Driver state: {driverState.state}</h4>

From aa40c4420717aa06a7964bd30b428fb73548beb2 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Wed, 3 Jun 2015 14:59:30 -0700
Subject: [PATCH 343/525] [SPARK-8059] [YARN] Wake up allocation thread when
 new requests arrive.

This should help reduce latency for new executor allocations.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6600 from vanzin/SPARK-8059 and squashes the following commits:

8387a3a [Marcelo Vanzin] [SPARK-8059] [yarn] Wake up allocation thread when new requests arrive.
---
 .../spark/deploy/yarn/ApplicationMaster.scala    | 16 +++++++++++++---
 .../apache/spark/deploy/yarn/YarnAllocator.scala |  7 ++++++-
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 760e458972d98..002d7b6eaf498 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -67,6 +67,7 @@ private[spark] class ApplicationMaster(
 
   @volatile private var reporterThread: Thread = _
   @volatile private var allocator: YarnAllocator = _
+  private val allocatorLock = new Object()
 
   // Fields used in client mode.
   private var rpcEnv: RpcEnv = null
@@ -359,7 +360,9 @@ private[spark] class ApplicationMaster(
               }
             logDebug(s"Number of pending allocations is $numPendingAllocate. " +
                      s"Sleeping for $sleepInterval.")
-            Thread.sleep(sleepInterval)
+            allocatorLock.synchronized {
+              allocatorLock.wait(sleepInterval)
+            }
           } catch {
             case e: InterruptedException =>
           }
@@ -546,8 +549,15 @@ private[spark] class ApplicationMaster(
     override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
       case RequestExecutors(requestedTotal) =>
         Option(allocator) match {
-          case Some(a) => a.requestTotalExecutors(requestedTotal)
-          case None => logWarning("Container allocator is not ready to request executors yet.")
+          case Some(a) =>
+            allocatorLock.synchronized {
+              if (a.requestTotalExecutors(requestedTotal)) {
+                allocatorLock.notifyAll()
+              }
+            }
+
+          case None =>
+            logWarning("Container allocator is not ready to request executors yet.")
         }
         context.reply(true)
 
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
index 21193e7c625e3..940873fbd046c 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
@@ -146,11 +146,16 @@ private[yarn] class YarnAllocator(
    * Request as many executors from the ResourceManager as needed to reach the desired total. If
    * the requested total is smaller than the current number of running executors, no executors will
    * be killed.
+   *
+   * @return Whether the new requested total is different than the old value.
    */
-  def requestTotalExecutors(requestedTotal: Int): Unit = synchronized {
+  def requestTotalExecutors(requestedTotal: Int): Boolean = synchronized {
     if (requestedTotal != targetNumExecutors) {
       logInfo(s"Driver requested a total number of $requestedTotal executor(s).")
       targetNumExecutors = requestedTotal
+      true
+    } else {
+      false
     }
   }
 

From 1d8669f15c136cd81f494dd487400c62c9498602 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 3 Jun 2015 15:03:07 -0700
Subject: [PATCH 344/525] [SPARK-8001] [CORE] Make
 AsynchronousListenerBus.waitUntilEmpty throw TimeoutException if timeout

Some places forget to call `assert` to check the return value of `AsynchronousListenerBus.waitUntilEmpty`. Instead of adding `assert` in these places, I think it's better to make `AsynchronousListenerBus.waitUntilEmpty` throw `TimeoutException`.

Author: zsxwing <zsxwing@gmail.com>

Closes #6550 from zsxwing/SPARK-8001 and squashes the following commits:

607674a [zsxwing] Make AsynchronousListenerBus.waitUntilEmpty throw TimeoutException if timeout
---
 .../spark/util/AsynchronousListenerBus.scala  | 11 +++++-----
 .../spark/deploy/LogUrlsStandaloneSuite.scala |  4 ++--
 .../spark/scheduler/DAGSchedulerSuite.scala   | 18 +++++++--------
 .../spark/scheduler/SparkListenerSuite.scala  | 22 +++++++++----------
 .../SparkListenerWithClusterSuite.scala       |  2 +-
 .../spark/deploy/yarn/YarnClusterSuite.scala  |  2 +-
 6 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala b/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala
index 1861d38640102..61b5a4cecddce 100644
--- a/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala
+++ b/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala
@@ -120,21 +120,22 @@ private[spark] abstract class AsynchronousListenerBus[L <: AnyRef, E](name: Stri
 
   /**
    * For testing only. Wait until there are no more events in the queue, or until the specified
-   * time has elapsed. Return true if the queue has emptied and false is the specified time
-   * elapsed before the queue emptied.
+   * time has elapsed. Throw `TimeoutException` if the specified time elapsed before the queue
+   * emptied.
    */
   @VisibleForTesting
-  def waitUntilEmpty(timeoutMillis: Int): Boolean = {
+  @throws(classOf[TimeoutException])
+  def waitUntilEmpty(timeoutMillis: Long): Unit = {
     val finishTime = System.currentTimeMillis + timeoutMillis
     while (!queueIsEmpty) {
       if (System.currentTimeMillis > finishTime) {
-        return false
+        throw new TimeoutException(
+          s"The event queue is not empty after $timeoutMillis milliseconds")
       }
       /* Sleep rather than using wait/notify, because this is used only for testing and
        * wait/notify add overhead in the general case. */
       Thread.sleep(10)
     }
-    true
   }
 
   /**
diff --git a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
index c215b0582889f..ddc92814c0acf 100644
--- a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
@@ -41,7 +41,7 @@ class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext {
     // Trigger a job so that executors get added
     sc.parallelize(1 to 100, 4).map(_.toString).count()
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     listener.addedExecutorInfos.values.foreach { info =>
       assert(info.logUrlMap.nonEmpty)
       // Browse to each URL to check that it's valid
@@ -71,7 +71,7 @@ class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext {
     // Trigger a job so that executors get added
     sc.parallelize(1 to 100, 4).map(_.toString).count()
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo]
     assert(listeners.size === 1)
     val listener = listeners(0)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index bfcf918e06162..47b2868753c0e 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -254,7 +254,7 @@ class DAGSchedulerSuite
   test("[SPARK-3353] parent stage should have lower stage id") {
     sparkListener.stageByOrderOfExecution.clear()
     sc.parallelize(1 to 10).map(x => (x, x)).reduceByKey(_ + _, 4).count()
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.stageByOrderOfExecution.length === 2)
     assert(sparkListener.stageByOrderOfExecution(0) < sparkListener.stageByOrderOfExecution(1))
   }
@@ -389,7 +389,7 @@ class DAGSchedulerSuite
     submit(unserializableRdd, Array(0))
     assert(failure.getMessage.startsWith(
       "Job aborted due to stage failure: Task not serializable:"))
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.failedStages.contains(0))
     assert(sparkListener.failedStages.size === 1)
     assertDataStructuresEmpty()
@@ -399,7 +399,7 @@ class DAGSchedulerSuite
     submit(new MyRDD(sc, 1, Nil), Array(0))
     failed(taskSets(0), "some failure")
     assert(failure.getMessage === "Job aborted due to stage failure: some failure")
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.failedStages.contains(0))
     assert(sparkListener.failedStages.size === 1)
     assertDataStructuresEmpty()
@@ -410,7 +410,7 @@ class DAGSchedulerSuite
     val jobId = submit(rdd, Array(0))
     cancel(jobId)
     assert(failure.getMessage === s"Job $jobId cancelled ")
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.failedStages.contains(0))
     assert(sparkListener.failedStages.size === 1)
     assertDataStructuresEmpty()
@@ -462,7 +462,7 @@ class DAGSchedulerSuite
     assert(results === Map(0 -> 42))
     assertDataStructuresEmpty()
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.failedStages.isEmpty)
     assert(sparkListener.successfulStages.contains(0))
   }
@@ -531,7 +531,7 @@ class DAGSchedulerSuite
       Map[Long, Any](),
       createFakeTaskInfo(),
       null))
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.failedStages.contains(1))
 
     // The second ResultTask fails, with a fetch failure for the output from the second mapper.
@@ -543,7 +543,7 @@ class DAGSchedulerSuite
       createFakeTaskInfo(),
       null))
     // The SparkListener should not receive redundant failure events.
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.failedStages.size == 1)
   }
 
@@ -592,7 +592,7 @@ class DAGSchedulerSuite
 
     // Listener bus should get told about the map stage failing, but not the reduce stage
     // (since the reduce stage hasn't been started yet).
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.failedStages.toSet === Set(0))
 
     assertDataStructuresEmpty()
@@ -643,7 +643,7 @@ class DAGSchedulerSuite
     assert(cancelledStages.toSet === Set(0, 2))
 
     // Make sure the listeners got told about both failed stages.
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(sparkListener.successfulStages.isEmpty)
     assert(sparkListener.failedStages.toSet === Set(0, 2))
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
index 06fb909bf5419..651295b7344c5 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
@@ -47,7 +47,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
 
     // Starting listener bus should flush all buffered events
     bus.start(sc)
-    assert(bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(counter.count === 5)
 
     // After listener bus has stopped, posting events should not increment counter
@@ -131,7 +131,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     rdd2.setName("Target RDD")
     rdd2.count()
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
 
     listener.stageInfos.size should be {1}
     val (stageInfo, taskInfoMetrics) = listener.stageInfos.head
@@ -156,7 +156,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     rdd3.setName("Trois")
 
     rdd1.count()
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     listener.stageInfos.size should be {1}
     val stageInfo1 = listener.stageInfos.keys.find(_.stageId == 0).get
     stageInfo1.rddInfos.size should be {1} // ParallelCollectionRDD
@@ -165,7 +165,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     listener.stageInfos.clear()
 
     rdd2.count()
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     listener.stageInfos.size should be {1}
     val stageInfo2 = listener.stageInfos.keys.find(_.stageId == 1).get
     stageInfo2.rddInfos.size should be {3} // ParallelCollectionRDD, FilteredRDD, MappedRDD
@@ -174,7 +174,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     listener.stageInfos.clear()
 
     rdd3.count()
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     listener.stageInfos.size should be {2} // Shuffle map stage + result stage
     val stageInfo3 = listener.stageInfos.keys.find(_.stageId == 3).get
     stageInfo3.rddInfos.size should be {1} // ShuffledRDD
@@ -190,7 +190,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     val rdd2 = rdd1.map(_.toString)
     sc.runJob(rdd2, (items: Iterator[String]) => items.size, Seq(0, 1), true)
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
 
     listener.stageInfos.size should be {1}
     val (stageInfo, _) = listener.stageInfos.head
@@ -214,7 +214,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
 
     val d = sc.parallelize(0 to 1e4.toInt, 64).map(w)
     d.count()
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     listener.stageInfos.size should be (1)
 
     val d2 = d.map { i => w(i) -> i * 2 }.setName("shuffle input 1")
@@ -225,7 +225,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     d4.setName("A Cogroup")
     d4.collectAsMap()
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     listener.stageInfos.size should be (4)
     listener.stageInfos.foreach { case (stageInfo, taskInfoMetrics) =>
       /**
@@ -281,7 +281,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
       .reduce { case (x, y) => x }
     assert(result === 1.to(akkaFrameSize).toArray)
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     val TASK_INDEX = 0
     assert(listener.startedTasks.contains(TASK_INDEX))
     assert(listener.startedGettingResultTasks.contains(TASK_INDEX))
@@ -297,7 +297,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     val result = sc.parallelize(Seq(1), 1).map(2 * _).reduce { case (x, y) => x }
     assert(result === 2)
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     val TASK_INDEX = 0
     assert(listener.startedTasks.contains(TASK_INDEX))
     assert(listener.startedGettingResultTasks.isEmpty)
@@ -352,7 +352,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
 
     // Post events to all listeners, and wait until the queue is drained
     (1 to 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) }
-    assert(bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
 
     // The exception should be caught, and the event should be propagated to other listeners
     assert(bus.listenerThreadIsAlive)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
index c7f179e1483a5..50273bcc8ce5e 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
@@ -46,7 +46,7 @@ class SparkListenerWithClusterSuite extends SparkFunSuite with LocalSparkContext
     rdd2.setName("Target RDD")
     rdd2.count()
 
-    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
     assert(listener.addedExecutorInfo.size == 2)
     assert(listener.addedExecutorInfo("0").totalCores == 1)
     assert(listener.addedExecutorInfo("1").totalCores == 1)
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
index d8bc2534c1a6a..bc42e12dfafd7 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
@@ -326,7 +326,7 @@ private object YarnClusterDriver extends Logging with Matchers {
     var result = "failure"
     try {
       val data = sc.parallelize(1 to 4, 4).collect().toSet
-      assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
+      sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
       data should be (Set(1, 2, 3, 4))
       result = "success"
     } finally {

From f27134782ebb61c360330e2d6d5bb1aa02be3fb6 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 3 Jun 2015 15:04:20 -0700
Subject: [PATCH 345/525] [SPARK-7989] [CORE] [TESTS] Fix flaky tests in
 ExternalShuffleServiceSuite and SparkListenerWithClusterSuite

The flaky tests in ExternalShuffleServiceSuite and SparkListenerWithClusterSuite will fail if there are not enough executors up before running the jobs.

This PR adds `JobProgressListener.waitUntilExecutorsUp`. The tests for the cluster mode can use it to wait until the expected executors are up.

Author: zsxwing <zsxwing@gmail.com>

Closes #6546 from zsxwing/SPARK-7989 and squashes the following commits:

5560e09 [zsxwing] Fix a typo
3b69840 [zsxwing] Fix flaky tests in ExternalShuffleServiceSuite and SparkListenerWithClusterSuite
---
 .../spark/ui/jobs/JobProgressListener.scala   | 30 +++++++++++++++++++
 .../spark/ExternalShuffleServiceSuite.scala   |  8 +++++
 .../spark/broadcast/BroadcastSuite.scala      | 10 +------
 .../SparkListenerWithClusterSuite.scala       | 10 +++++--
 4 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
index f39e961772c46..1d31fce4c697b 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
@@ -17,8 +17,12 @@
 
 package org.apache.spark.ui.jobs
 
+import java.util.concurrent.TimeoutException
+
 import scala.collection.mutable.{HashMap, HashSet, ListBuffer}
 
+import com.google.common.annotations.VisibleForTesting
+
 import org.apache.spark._
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.executor.TaskMetrics
@@ -526,4 +530,30 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging {
   override def onApplicationStart(appStarted: SparkListenerApplicationStart) {
     startTime = appStarted.time
   }
+
+  /**
+   * For testing only. Wait until at least `numExecutors` executors are up, or throw
+   * `TimeoutException` if the waiting time elapsed before `numExecutors` executors up.
+   *
+   * @param numExecutors the number of executors to wait at least
+   * @param timeout time to wait in milliseconds
+   */
+  @VisibleForTesting
+  private[spark] def waitUntilExecutorsUp(numExecutors: Int, timeout: Long): Unit = {
+    val finishTime = System.currentTimeMillis() + timeout
+    while (System.currentTimeMillis() < finishTime) {
+      val numBlockManagers = synchronized {
+        blockManagerIds.size
+      }
+      if (numBlockManagers >= numExecutors + 1) {
+        // Need to count the block manager in driver
+        return
+      }
+      // Sleep rather than using wait/notify, because this is used only for testing and wait/notify
+      // add overhead in the general case.
+      Thread.sleep(10)
+    }
+    throw new TimeoutException(
+      s"Can't find $numExecutors executors before $timeout milliseconds elapsed")
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
index bac6fdbcdc976..5b127a070c07f 100644
--- a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
@@ -55,6 +55,14 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll {
     sc.env.blockManager.externalShuffleServiceEnabled should equal(true)
     sc.env.blockManager.shuffleClient.getClass should equal(classOf[ExternalShuffleClient])
 
+    // In a slow machine, one slave may register hundreds of milliseconds ahead of the other one.
+    // If we don't wait for all salves, it's possible that only one executor runs all jobs. Then
+    // all shuffle blocks will be in this executor, ShuffleBlockFetcherIterator will directly fetch
+    // local blocks from the local BlockManager and won't send requests to ExternalShuffleService.
+    // In this case, we won't receive FetchFailed. And it will make this test fail.
+    // Therefore, we should wait until all salves are up
+    sc.jobProgressListener.waitUntilExecutorsUp(2, 10000)
+
     val rdd = sc.parallelize(0 until 1000, 10).map(i => (i, 1)).reduceByKey(_ + _)
 
     rdd.count()
diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
index c05e8bb6538ba..c054c718075f8 100644
--- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
+++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark.broadcast
 
-import scala.concurrent.duration._
 import scala.util.Random
 
 import org.scalatest.Assertions
-import org.scalatest.concurrent.Eventually._
 
 import org.apache.spark._
 import org.apache.spark.io.SnappyCompressionCodec
@@ -312,13 +310,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext {
       val _sc =
         new SparkContext("local-cluster[%d, 1, 512]".format(numSlaves), "test", broadcastConf)
       // Wait until all salves are up
-      eventually(timeout(10.seconds), interval(10.milliseconds)) {
-        _sc.jobProgressListener.synchronized {
-          val numBlockManagers = _sc.jobProgressListener.blockManagerIds.size
-          assert(numBlockManagers == numSlaves + 1,
-            s"Expect ${numSlaves + 1} block managers, but was ${numBlockManagers}")
-        }
-      }
+      _sc.jobProgressListener.waitUntilExecutorsUp(numSlaves, 10000)
       _sc
     } else {
       new SparkContext("local", "test", broadcastConf)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
index 50273bcc8ce5e..d97fba00976d2 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
@@ -17,12 +17,12 @@
 
 package org.apache.spark.scheduler
 
-import org.apache.spark.scheduler.cluster.ExecutorInfo
-import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite}
+import scala.collection.mutable
 
 import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
 
-import scala.collection.mutable
+import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite}
+import org.apache.spark.scheduler.cluster.ExecutorInfo
 
 /**
  * Unit tests for SparkListener that require a local cluster.
@@ -41,6 +41,10 @@ class SparkListenerWithClusterSuite extends SparkFunSuite with LocalSparkContext
     val listener = new SaveExecutorInfo
     sc.addSparkListener(listener)
 
+    // This test will check if the number of executors received by "SparkListener" is same as the
+    // number of all executors, so we need to wait until all executors are up
+    sc.jobProgressListener.waitUntilExecutorsUp(2, 10000)
+
     val rdd1 = sc.parallelize(1 to 100, 4)
     val rdd2 = rdd1.map(_.toString)
     rdd2.setName("Target RDD")

From a8f1f1543e29fb2897e9ae6940581b9e4a3a13fb Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Wed, 3 Jun 2015 15:11:02 -0700
Subject: [PATCH 346/525] [HOTFIX] Fix Hadoop-1 build caused by #5792.

Replaced `fs.listFiles` with Hadoop-1 friendly `fs.listStatus` method.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #6619 from harishreedharan/evetlog-hadoop-1-fix and squashes the following commits:

6192078 [Hari Shreedharan] [HOTFIX] Fix Hadoop-1 build caused by #5972.
---
 .../apache/spark/deploy/history/FsHistoryProvider.scala   | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index 52b149b273e4b..5427a88f32ffd 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -255,12 +255,12 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
             // If this is a legacy directory, then add the directory to the zipStream and add
             // each file to that directory.
             if (isLegacyLogDirectory(fs.getFileStatus(logPath))) {
-              val files = fs.listFiles(logPath, false)
+              val files = fs.listStatus(logPath)
               zipStream.putNextEntry(new ZipEntry(attempt.logPath + "/"))
               zipStream.closeEntry()
-              while (files.hasNext) {
-                val file = files.next().getPath
-                zipFileToStream(file, attempt.logPath + Path.SEPARATOR + file.getName, zipStream)
+              files.foreach { file =>
+                val path = file.getPath
+                zipFileToStream(path, attempt.logPath + Path.SEPARATOR + path.getName, zipStream)
               }
             } else {
               zipFileToStream(new Path(logDir, attempt.logPath), attempt.logPath, zipStream)

From d3e026f8798f9875b90e8c372056ee3d71489be5 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Wed, 3 Jun 2015 15:14:38 -0700
Subject: [PATCH 347/525] [SPARK-3674] [EC2] Clear SPARK_WORKER_INSTANCES when
 using YARN

cc andrewor14

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6424 from shivaram/spark-worker-instances-yarn-ec2 and squashes the following commits:

db244ae [Shivaram Venkataraman] Make Python Lint happy
0593d1b [Shivaram Venkataraman] Clear SPARK_WORKER_INSTANCES when using YARN
---
 ec2/spark_ec2.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index ee0904c9e5d54..84629cb9a0ca0 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -219,7 +219,8 @@ def parse_args():
              "(default: %default).")
     parser.add_option(
         "--hadoop-major-version", default="1",
-        help="Major version of Hadoop (default: %default)")
+        help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.2.0), yarn " +
+             "(Hadoop 2.4.0) (default: %default)")
     parser.add_option(
         "-D", metavar="[ADDRESS:]PORT", dest="proxy_port",
         help="Use SSH dynamic port forwarding to create a SOCKS proxy at " +
@@ -271,7 +272,8 @@ def parse_args():
         help="Launch fresh slaves, but use an existing stopped master if possible")
     parser.add_option(
         "--worker-instances", type="int", default=1,
-        help="Number of instances per worker: variable SPARK_WORKER_INSTANCES (default: %default)")
+        help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " +
+             "is used as Hadoop major version (default: %default)")
     parser.add_option(
         "--master-opts", type="string", default="",
         help="Extra options to give to master through SPARK_MASTER_OPTS variable " +
@@ -761,6 +763,10 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
     if opts.ganglia:
         modules.append('ganglia')
 
+    # Clear SPARK_WORKER_INSTANCES if running on YARN
+    if opts.hadoop_major_version == "yarn":
+        opts.worker_instances = ""
+
     # NOTE: We should clone the repository before running deploy_files to
     # prevent ec2-variables.sh from being overwritten
     print("Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format(
@@ -998,6 +1004,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
 
     master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes]
     slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes]
+    worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else ""
     template_vars = {
         "master_list": '\n'.join(master_addresses),
         "active_master": active_master,
@@ -1011,7 +1018,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
         "spark_version": spark_v,
         "tachyon_version": tachyon_v,
         "hadoop_major_version": opts.hadoop_major_version,
-        "spark_worker_instances": "%d" % opts.worker_instances,
+        "spark_worker_instances": worker_instances_str,
         "spark_master_opts": opts.master_opts
     }
 

From 26c9d7a0f975009e22ec91e5c0b5cfcada79b35e Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 3 Jun 2015 15:16:24 -0700
Subject: [PATCH 348/525] [SPARK-8051] [MLLIB] make StringIndexerModel silent
 if input column does not exist

This is just a workaround to a bigger problem. Some pipeline stages may not be effective during prediction, and they should not complain about missing required columns, e.g. `StringIndexerModel`. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6595 from mengxr/SPARK-8051 and squashes the following commits:

b6a36b9 [Xiangrui Meng] add doc
f143fd4 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-8051
8ee7c7e [Xiangrui Meng] use SparkFunSuite
e112394 [Xiangrui Meng] make StringIndexerModel silent if input column does not exist
---
 .../apache/spark/ml/feature/StringIndexer.scala  | 16 +++++++++++++++-
 .../spark/ml/feature/StringIndexerSuite.scala    |  8 ++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index a2dc8a8b960c5..f4e250757560a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -88,6 +88,9 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod
 /**
  * :: Experimental ::
  * Model fitted by [[StringIndexer]].
+ * NOTE: During transformation, if the input column does not exist,
+ * [[StringIndexerModel.transform]] would return the input dataset unmodified.
+ * This is a temporary fix for the case when target labels do not exist during prediction.
  */
 @Experimental
 class StringIndexerModel private[ml] (
@@ -112,6 +115,12 @@ class StringIndexerModel private[ml] (
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
   override def transform(dataset: DataFrame): DataFrame = {
+    if (!dataset.schema.fieldNames.contains($(inputCol))) {
+      logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " +
+        "Skip StringIndexerModel.")
+      return dataset
+    }
+
     val indexer = udf { label: String =>
       if (labelToIndex.contains(label)) {
         labelToIndex(label)
@@ -128,6 +137,11 @@ class StringIndexerModel private[ml] (
   }
 
   override def transformSchema(schema: StructType): StructType = {
-    validateAndTransformSchema(schema)
+    if (schema.fieldNames.contains($(inputCol))) {
+      validateAndTransformSchema(schema)
+    } else {
+      // If the input column does not exist during transformation, we skip StringIndexerModel.
+      schema
+    }
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
index cbf1e8ddcb48a..5f557e16e5150 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -60,4 +60,12 @@ class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext {
     val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0))
     assert(output === expected)
   }
+
+  test("StringIndexerModel should keep silent if the input column does not exist.") {
+    val indexerModel = new StringIndexerModel("indexer", Array("a", "b", "c"))
+      .setInputCol("label")
+      .setOutputCol("labelIndex")
+    val df = sqlContext.range(0L, 10L)
+    assert(indexerModel.transform(df).eq(df))
+  }
 }

From d8662cd909a41575df6e0ea1630d2386d3711240 Mon Sep 17 00:00:00 2001
From: leahmcguire <lmcguire@salesforce.com>
Date: Wed, 3 Jun 2015 15:46:38 -0700
Subject: [PATCH 349/525] [SPARK-6164] [ML] CrossValidatorModel should keep
 stats from fitting

Added stats from cross validation as a val in the cross validation model to save them for user access.

Author: leahmcguire <lmcguire@salesforce.com>

Closes #5915 from leahmcguire/saveCVmetrics and squashes the following commits:

49b507b [leahmcguire] fixed tyle error
67537b1 [leahmcguire] rebased
85907f0 [leahmcguire] fixed name
59987cc [leahmcguire] changed param name and test according to comments
36e71e3 [leahmcguire] rebasing
4b8223e [leahmcguire] fixed name
4ddffc6 [leahmcguire] changed param name and test according to comments
3a995da [leahmcguire] Added stats from cross validation as a val in the cross validation model to save them for user access
---
 .../org/apache/spark/ml/tuning/CrossValidator.scala    | 10 +++++++---
 .../apache/spark/ml/tuning/CrossValidatorSuite.scala   |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 6434b64aed15d..cb29392e8bc63 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -135,7 +135,7 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM
     logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
     logInfo(s"Best cross-validation metric: $bestMetric.")
     val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
-    copyValues(new CrossValidatorModel(uid, bestModel).setParent(this))
+    copyValues(new CrossValidatorModel(uid, bestModel, metrics).setParent(this))
   }
 
   override def transformSchema(schema: StructType): StructType = {
@@ -158,7 +158,8 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM
 @Experimental
 class CrossValidatorModel private[ml] (
     override val uid: String,
-    val bestModel: Model[_])
+    val bestModel: Model[_],
+    val avgMetrics: Array[Double])
   extends Model[CrossValidatorModel] with CrossValidatorParams {
 
   override def validateParams(): Unit = {
@@ -175,7 +176,10 @@ class CrossValidatorModel private[ml] (
   }
 
   override def copy(extra: ParamMap): CrossValidatorModel = {
-    val copied = new CrossValidatorModel(uid, bestModel.copy(extra).asInstanceOf[Model[_]])
+    val copied = new CrossValidatorModel(
+      uid,
+      bestModel.copy(extra).asInstanceOf[Model[_]],
+      avgMetrics.clone())
     copyValues(copied, extra)
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 5ba469c7b10a0..9b3619f0046ea 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -56,6 +56,7 @@ class CrossValidatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     val parent = cvModel.bestModel.parent.asInstanceOf[LogisticRegression]
     assert(parent.getRegParam === 0.001)
     assert(parent.getMaxIter === 10)
+    assert(cvModel.avgMetrics.length === lrParamMaps.length)
   }
 
   test("validateParams should check estimatorParamMaps") {

From bfbdab12dd37587e5518dcbb76507b752759cace Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Wed, 3 Jun 2015 16:04:02 -0700
Subject: [PATCH 350/525] [HOTFIX] [TYPO] Fix typo in #6546

---
 .../scala/org/apache/spark/ExternalShuffleServiceSuite.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
index 5b127a070c07f..140012226fdbb 100644
--- a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
@@ -56,11 +56,11 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll {
     sc.env.blockManager.shuffleClient.getClass should equal(classOf[ExternalShuffleClient])
 
     // In a slow machine, one slave may register hundreds of milliseconds ahead of the other one.
-    // If we don't wait for all salves, it's possible that only one executor runs all jobs. Then
+    // If we don't wait for all slaves, it's possible that only one executor runs all jobs. Then
     // all shuffle blocks will be in this executor, ShuffleBlockFetcherIterator will directly fetch
     // local blocks from the local BlockManager and won't send requests to ExternalShuffleService.
     // In this case, we won't receive FetchFailed. And it will make this test fail.
-    // Therefore, we should wait until all salves are up
+    // Therefore, we should wait until all slaves are up
     sc.jobProgressListener.waitUntilExecutorsUp(2, 10000)
 
     val rdd = sc.parallelize(0 until 1000, 10).map(i => (i, 1)).reduceByKey(_ + _)

From 566cb5947925c79ef90af72346672ab7d27bf4df Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Wed, 3 Jun 2015 16:53:57 -0700
Subject: [PATCH 351/525] [HOTFIX] History Server API docs error fix.

Minor error in the monitoring docs. Also made indentation changes in `ApiRootResource`

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #6628 from harishreedharan/eventlog-formatting and squashes the following commits:

a12553d [Hari Shreedharan] Javadoc updates.
ca399b6 [Hari Shreedharan] [HOTFIX] History Server API docs error fix.
---
 .../apache/spark/status/api/v1/ApiRootResource.scala   | 10 +++++++---
 .../spark/status/api/v1/EventLogDownloadResource.scala |  2 +-
 docs/monitoring.md                                     |  2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
index 9af90ee5ecd9d..50b6ba67e9931 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
@@ -167,14 +167,14 @@ private[v1] class ApiRootResource extends UIRootFromServletContext {
 
   @Path("applications/{appId}/logs")
   def getEventLogs(
-    @PathParam("appId") appId: String): EventLogDownloadResource = {
+      @PathParam("appId") appId: String): EventLogDownloadResource = {
     new EventLogDownloadResource(uiRoot, appId, None)
   }
 
   @Path("applications/{appId}/{attemptId}/logs")
   def getEventLogs(
-    @PathParam("appId") appId: String,
-    @PathParam("attemptId") attemptId: String): EventLogDownloadResource = {
+      @PathParam("appId") appId: String,
+      @PathParam("attemptId") attemptId: String): EventLogDownloadResource = {
     new EventLogDownloadResource(uiRoot, appId, Some(attemptId))
   }
 }
@@ -206,6 +206,10 @@ private[spark] trait UIRoot {
   def getSparkUI(appKey: String): Option[SparkUI]
   def getApplicationInfoList: Iterator[ApplicationInfo]
 
+  /**
+   * Write the event logs for the given app to the [[ZipOutputStream]] instance. If attemptId is
+   * [[None]], event logs for all attempts of this application will be written out.
+   */
   def writeEventLogs(appId: String, attemptId: Option[String], zipStream: ZipOutputStream): Unit = {
     Response.serverError()
       .entity("Event logs are only available through the history server.")
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/EventLogDownloadResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/EventLogDownloadResource.scala
index d416dba8324d8..22e21f0c62a29 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/EventLogDownloadResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/EventLogDownloadResource.scala
@@ -44,7 +44,7 @@ private[v1] class EventLogDownloadResource(
       }
 
       val stream = new StreamingOutput {
-        override def write(output: OutputStream) = {
+        override def write(output: OutputStream): Unit = {
           val zipStream = new ZipOutputStream(output)
           try {
             uIRoot.writeEventLogs(appId, attemptId, zipStream)
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 31ecddc6dbbb9..bcf885fe4e681 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -233,7 +233,7 @@ for a running application, at `http://localhost:4040/api/v1`.
     <td>Download the event logs for all attempts of the given application as a zip file</td>
   </tr>
   <tr>
-    <td><code>/applications/[app-id]/[attempt-id/logs</code></td>
+    <td><code>/applications/[app-id]/[attempt-id]/logs</code></td>
     <td>Download the event logs for the specified attempt of the given application as a zip file</td>
   </tr>
 </table>

From 51898b5158ac7e7e67b0539bc062c9c16ce9a7ce Mon Sep 17 00:00:00 2001
From: Ryan Williams <ryan.blake.williams@gmail.com>
Date: Wed, 3 Jun 2015 16:54:46 -0700
Subject: [PATCH 352/525] [SPARK-8088] don't attempt to lower number of
 executors by 0

Author: Ryan Williams <ryan.blake.williams@gmail.com>

Closes #6624 from ryan-williams/execs and squashes the following commits:

b6f71d4 [Ryan Williams] don't attempt to lower number of executors by 0
---
 .../org/apache/spark/ExecutorAllocationManager.scala   | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
index 9514604752640..f7323a4d9db72 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
@@ -266,10 +266,14 @@ private[spark] class ExecutorAllocationManager(
       // executors and inform the cluster manager to cancel the extra pending requests
       val oldNumExecutorsTarget = numExecutorsTarget
       numExecutorsTarget = math.max(maxNeeded, minNumExecutors)
-      client.requestTotalExecutors(numExecutorsTarget)
       numExecutorsToAdd = 1
-      logInfo(s"Lowering target number of executors to $numExecutorsTarget because " +
-        s"not all requests are actually needed (previously $oldNumExecutorsTarget)")
+
+      // If the new target has not changed, avoid sending a message to the cluster manager
+      if (numExecutorsTarget < oldNumExecutorsTarget) {
+        client.requestTotalExecutors(numExecutorsTarget)
+        logInfo(s"Lowering target number of executors to $numExecutorsTarget (previously " +
+          s"$oldNumExecutorsTarget) because not all requested executors are actually needed")
+      }
       numExecutorsTarget - oldNumExecutorsTarget
     } else if (addTime != NOT_SET && now >= addTime) {
       val delta = addExecutors(maxNeeded)

From 0576c3c4ff9d9bbff208e915bee1ac0d4956548c Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Wed, 3 Jun 2015 17:02:16 -0700
Subject: [PATCH 353/525] [SPARK-8084] [SPARKR] Make SparkR scripts fail on
 error

cc shaneknapp pwendell JoshRosen

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6623 from shivaram/SPARK-8084 and squashes the following commits:

0ec5b26 [Shivaram Venkataraman] Make SparkR scripts fail on error
---
 R/create-docs.sh | 3 +++
 R/install-dev.sh | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/R/create-docs.sh b/R/create-docs.sh
index 4194172a2e115..af47c0863bdd0 100755
--- a/R/create-docs.sh
+++ b/R/create-docs.sh
@@ -23,6 +23,9 @@
 # After running this script the html docs can be found in 
 # $SPARK_HOME/R/pkg/html
 
+set -o pipefail
+set -e
+
 # Figure out where the script is
 export FWDIR="$(cd "`dirname "$0"`"; pwd)"
 pushd $FWDIR
diff --git a/R/install-dev.sh b/R/install-dev.sh
index 55ed6f4be1a4a..b9e2527035994 100755
--- a/R/install-dev.sh
+++ b/R/install-dev.sh
@@ -26,6 +26,8 @@
 # NOTE(shivaram): Right now we use $SPARK_HOME/R/lib to be the installation directory
 # to load the SparkR package on the worker nodes.
 
+set -o pipefail
+set -e
 
 FWDIR="$(cd `dirname $0`; pwd)"
 LIB_DIR="$FWDIR/lib"

From e35cd36e08faa43466759c412c420a9d8901d368 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Wed, 3 Jun 2015 17:40:14 -0700
Subject: [PATCH 354/525] [BUILD] Increase Jenkins test timeout

Currently hive tests alone take 40m. The right thing to do is
to reduce the test time. However, that is a bigger project and
we currently have PRs blocking on tests not timing out.
---
 dev/run-tests-jenkins | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins
index 8b2a44fd72ba5..3cbd8666c8d68 100755
--- a/dev/run-tests-jenkins
+++ b/dev/run-tests-jenkins
@@ -47,7 +47,9 @@ COMMIT_URL="https://github.com/apache/spark/commit/${ghprbActualCommit}"
 # GitHub doesn't auto-link short hashes when submitted via the API, unfortunately. :(
 SHORT_COMMIT_HASH="${ghprbActualCommit:0:7}"
 
-TESTS_TIMEOUT="150m" # format: http://linux.die.net/man/1/timeout
+# format: http://linux.die.net/man/1/timeout
+# must be less than the timeout configured on Jenkins (currently 180m)
+TESTS_TIMEOUT="175m"
 
 # Array to capture all tests to run on the pull request. These tests are held under the
 #+ dev/tests/ directory.

From 9cf740f357fef00b5251618b20501774852f8a28 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Wed, 3 Jun 2015 18:08:53 -0700
Subject: [PATCH 355/525] [BUILD] Use right branch when checking against Hive

Right now we always run hive tests in branch-1.4 PRs because we compare whether the diff against master involves hive changes. Really we should be comparing against the target branch itself.

Author: Andrew Or <andrew@databricks.com>

Closes #6629 from andrewor14/build-check-hive and squashes the following commits:

450fbbd [Andrew Or] [BUILD] Use right branch when checking against Hive
---
 dev/run-tests | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/dev/run-tests b/dev/run-tests
index 7dd8d31fd44e3..d178e2a4601ea 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -80,18 +80,19 @@ export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl"
 # Only run Hive tests if there are SQL changes.
 # Partial solution for SPARK-1455.
 if [ -n "$AMPLAB_JENKINS" ]; then
-  git fetch origin master:master
+  target_branch="$ghprbTargetBranch"
+  git fetch origin "$target_branch":"$target_branch"
 
   # AMP_JENKINS_PRB indicates if the current build is a pull request build.
   if [ -n "$AMP_JENKINS_PRB" ]; then
     # It is a pull request build.
     sql_diffs=$(
-      git diff --name-only master \
+      git diff --name-only "$target_branch" \
       | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
     )
 
     non_sql_diffs=$(
-      git diff --name-only master \
+      git diff --name-only "$target_branch" \
       | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
     )
 

From 984ad60147c933f2d5a2040c87ae687c14eb1724 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Wed, 3 Jun 2015 20:45:31 -0700
Subject: [PATCH 356/525] [BUILD] Fix Maven build for Kinesis

A necessary dependency that is transitively referenced is not
provided, causing compilation failures in builds that provide
the kinesis-asl profile.
---
 extras/kinesis-asl/pom.xml | 7 +++++++
 pom.xml                    | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml
index 4787991572b61..c6f60bc907438 100644
--- a/extras/kinesis-asl/pom.xml
+++ b/extras/kinesis-asl/pom.xml
@@ -40,6 +40,13 @@
       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
diff --git a/pom.xml b/pom.xml
index 0b1aaad7566bc..d03d33bf02468 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1438,6 +1438,8 @@
         <version>2.3</version>
         <configuration>
           <shadedArtifactAttached>false</shadedArtifactAttached>
+          <!-- Work around MSHADE-148 -->
+          <createDependencyReducedPom>false</createDependencyReducedPom>
           <artifactSet>
             <includes>
               <!-- At a minimum we must include this to force effective pom generation -->

From 9982d453c39e50aedae7d01e4c38fab1b2bc6be0 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Wed, 3 Jun 2015 23:45:06 -0700
Subject: [PATCH 357/525] MAINTENANCE: Automated closing of pull requests.

This commit exists to close the following pull requests on Github:

Closes #5976 (close requested by 'JoshRosen')
Closes #4576 (close requested by 'pwendell')
Closes #3430 (close requested by 'pwendell')
Closes #2495 (close requested by 'pwendell')

From 10ba1880878d0babcdc5c9b688df5458ea131531 Mon Sep 17 00:00:00 2001
From: Daniel Darabos <darabos.daniel@gmail.com>
Date: Thu, 4 Jun 2015 13:46:49 +0200
Subject: [PATCH 358/525] Fix maxTaskFailures comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If maxTaskFailures is 1, the task set is aborted after 1 task failure. Other documentation and the code supports this reading, I think it's just this comment that was off. It's easy to make this mistake — can you please double-check if I'm correct? Thanks!

Author: Daniel Darabos <darabos.daniel@gmail.com>

Closes #6621 from darabos/patch-2 and squashes the following commits:

dfebdec [Daniel Darabos] Fix comment.
---
 .../main/scala/org/apache/spark/scheduler/TaskSetManager.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
index 673cd0e19eba2..82455b0426a5d 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -46,7 +46,7 @@ import org.apache.spark.util.{Clock, SystemClock, Utils}
  *
  * @param sched           the TaskSchedulerImpl associated with the TaskSetManager
  * @param taskSet         the TaskSet to manage scheduling for
- * @param maxTaskFailures if any particular task fails more than this number of times, the entire
+ * @param maxTaskFailures if any particular task fails this number of times, the entire
  *                        task set will be aborted
  */
 private[spark] class TaskSetManager(

From c8709dcfd1237ffa19ee9286e99ddf2718a616d8 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 4 Jun 2015 10:28:59 -0700
Subject: [PATCH 359/525] [SPARK-7956] [SQL] Use Janino to compile SQL
 expressions into bytecode

In order to reduce the overhead of codegen, this PR switch to use Janino to compile SQL expressions into bytecode.

After this, the time used to compile a SQL expression is decreased from 100ms to 5ms, which is necessary to turn on codegen for general workload, also tests.

cc rxin

Author: Davies Liu <davies@databricks.com>

Closes #6479 from davies/janino and squashes the following commits:

cc689f5 [Davies Liu] remove globalLock
262d848 [Davies Liu] Merge branch 'master' of github.com:apache/spark into janino
eec3a33 [Davies Liu] address comments from Josh
f37c8c3 [Davies Liu] fix DecimalType and cast to String
202298b [Davies Liu] Merge branch 'master' of github.com:apache/spark into janino
a21e968 [Davies Liu] fix style
0ed3dc6 [Davies Liu] Merge branch 'master' of github.com:apache/spark into janino
551a851 [Davies Liu] fix tests
c3bdffa [Davies Liu] remove print
6089ce5 [Davies Liu] change logging level
7e46ac3 [Davies Liu] fix style
d8f0f6c [Davies Liu] Merge branch 'master' of github.com:apache/spark into janino
da4926a [Davies Liu] fix tests
03660f3 [Davies Liu] WIP: use Janino to compile Java source
f2629cd [Davies Liu] Merge branch 'master' of github.com:apache/spark into janino
f7d66cf [Davies Liu] use template based string for codegen
---
 .../spark/util/collection/OpenHashSet.scala   |  12 +-
 pom.xml                                       |  10 -
 project/SparkBuild.scala                      |  11 -
 sql/catalyst/pom.xml                          |  16 +-
 .../sql/catalyst/expressions/UnsafeRow.java   | 101 +--
 .../org/apache/spark/sql/BaseMutableRow.java  |  68 ++
 .../scala/org/apache/spark/sql/BaseRow.java   | 190 +++++
 .../expressions/codegen/CodeGenerator.scala   | 797 +++++++++---------
 .../codegen/GenerateMutableProjection.scala   |  87 +-
 .../codegen/GenerateOrdering.scala            | 146 ++--
 .../codegen/GeneratePredicate.scala           |  44 +-
 .../codegen/GenerateProjection.scala          | 316 ++++---
 .../expressions/codegen/package.scala         |   6 -
 .../ExpressionEvaluationSuite.scala           |  15 +-
 .../GeneratedEvaluationSuite.scala            |   5 +-
 .../GeneratedMutableEvaluationSuite.scala     |   7 +-
 .../org/apache/spark/sql/DataFrameSuite.scala |  11 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 162 ++--
 18 files changed, 1116 insertions(+), 888 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/BaseMutableRow.java
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java

diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
index 1501111a06655..64e7102e3654c 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
@@ -20,6 +20,8 @@ package org.apache.spark.util.collection
 import scala.reflect._
 import com.google.common.hash.Hashing
 
+import org.apache.spark.annotation.Private
+
 /**
  * A simple, fast hash set optimized for non-null insertion-only use case, where keys are never
  * removed.
@@ -37,7 +39,7 @@ import com.google.common.hash.Hashing
  * It uses quadratic probing with a power-of-2 hash table size, which is guaranteed
  * to explore all spaces for each key (see http://en.wikipedia.org/wiki/Quadratic_probing).
  */
-private[spark]
+@Private
 class OpenHashSet[@specialized(Long, Int) T: ClassTag](
     initialCapacity: Int,
     loadFactor: Double)
@@ -110,6 +112,14 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
     rehashIfNeeded(k, grow, move)
   }
 
+  def union(other: OpenHashSet[T]): OpenHashSet[T] = {
+    val iterator = other.iterator
+    while (iterator.hasNext) {
+      add(iterator.next())
+    }
+    this
+  }
+
   /**
    * Add an element to the set. This one differs from add in that it doesn't trigger rehashing.
    * The caller is responsible for calling rehashIfNeeded.
diff --git a/pom.xml b/pom.xml
index d03d33bf02468..bcb6ef96a1206 100644
--- a/pom.xml
+++ b/pom.xml
@@ -118,7 +118,6 @@
     <akka.version>2.3.4-spark</akka.version>
     <java.version>1.6</java.version>
     <sbt.project.name>spark</sbt.project.name>
-    <scala.macros.version>2.0.1</scala.macros.version>
     <mesos.version>0.21.1</mesos.version>
     <mesos.classifier>shaded-protobuf</mesos.classifier>
     <slf4j.version>1.7.10</slf4j.version>
@@ -1217,15 +1216,6 @@
               <javacArg>-target</javacArg>
               <javacArg>${java.version}</javacArg>
             </javacArgs>
-            <!-- The following plugin is required to use quasiquotes in Scala 2.10 and is used
-                 by Spark SQL for code generation. -->
-            <compilerPlugins>
-              <compilerPlugin>
-                <groupId>org.scalamacros</groupId>
-                <artifactId>paradise_${scala.version}</artifactId>
-                <version>${scala.macros.version}</version>
-              </compilerPlugin>
-            </compilerPlugins>
           </configuration>
         </plugin>
         <plugin>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 9a849639233bc..f65031fe25ac2 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -178,9 +178,6 @@ object SparkBuild extends PomBuild {
   /* Enable unidoc only for the root spark project */
   enable(Unidoc.settings)(spark)
 
-  /* Catalyst macro settings */
-  enable(Catalyst.settings)(catalyst)
-
   /* Spark SQL Core console settings */
   enable(SQL.settings)(sql)
 
@@ -275,14 +272,6 @@ object OldDeps {
   )
 }
 
-object Catalyst {
-  lazy val settings = Seq(
-    addCompilerPlugin("org.scalamacros" % "paradise" % "2.0.1" cross CrossVersion.full),
-    // Quasiquotes break compiling scala doc...
-    // TODO: Investigate fixing this.
-    sources in (Compile, doc) ~= (_ filter (_.getName contains "codegen")))
-}
-
 object SQL {
   lazy val settings = Seq(
     initialCommands in console :=
diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml
index bf0a7327a58a2..f4b1cc3a4ffe7 100644
--- a/sql/catalyst/pom.xml
+++ b/sql/catalyst/pom.xml
@@ -36,10 +36,6 @@
   </properties>
 
   <dependencies>
-    <dependency>
-      <groupId>org.scala-lang</groupId>
-      <artifactId>scala-compiler</artifactId>
-    </dependency>
     <dependency>
       <groupId>org.scala-lang</groupId>
       <artifactId>scala-reflect</artifactId>
@@ -67,6 +63,11 @@
       <artifactId>scalacheck_${scala.binary.version}</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.codehaus.janino</groupId>
+      <artifactId>janino</artifactId>
+      <version>2.7.8</version>
+    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
@@ -108,13 +109,6 @@
       <activation>
         <property><name>!scala-2.11</name></property>
       </activation>
-      <dependencies>
-        <dependency>
-          <groupId>org.scalamacros</groupId>
-          <artifactId>quasiquotes_${scala.binary.version}</artifactId>
-          <version>${scala.macros.version}</version>
-        </dependency>
-      </dependencies>
     </profile>
   </profiles>
 </project>
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index bb546b3086b33..ec97fe603c44f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -17,23 +17,25 @@
 
 package org.apache.spark.sql.catalyst.expressions;
 
-import scala.collection.Map;
+import javax.annotation.Nullable;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
 import scala.collection.Seq;
 import scala.collection.mutable.ArraySeq;
 
-import javax.annotation.Nullable;
-import java.math.BigDecimal;
-import java.sql.Date;
-import java.util.*;
-
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.BaseMutableRow;
 import org.apache.spark.sql.types.DataType;
-import static org.apache.spark.sql.types.DataTypes.*;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.types.UTF8String;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.bitset.BitSetMethods;
 
+import static org.apache.spark.sql.types.DataTypes.*;
+
 /**
  * An Unsafe implementation of Row which is backed by raw memory instead of Java objects.
  *
@@ -49,7 +51,7 @@
  *
  * Instances of `UnsafeRow` act as pointers to row data stored in this format.
  */
-public final class UnsafeRow implements MutableRow {
+public final class UnsafeRow extends BaseMutableRow {
 
   private Object baseObject;
   private long baseOffset;
@@ -227,21 +229,11 @@ public int size() {
     return numFields;
   }
 
-  @Override
-  public int length() {
-    return size();
-  }
-
   @Override
   public StructType schema() {
     return schema;
   }
 
-  @Override
-  public Object apply(int i) {
-    return get(i);
-  }
-
   @Override
   public Object get(int i) {
     assertIndexIsValid(i);
@@ -339,60 +331,7 @@ public String getString(int i) {
     return getUTF8String(i).toString();
   }
 
-  @Override
-  public BigDecimal getDecimal(int i) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public Date getDate(int i) {
-    throw new UnsupportedOperationException();
-  }
 
-  @Override
-  public <T> Seq<T> getSeq(int i) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public <T> List<T> getList(int i) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public <K, V> Map<K, V> getMap(int i) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public <T> scala.collection.immutable.Map<String, T> getValuesMap(Seq<String> fieldNames) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public <K, V> java.util.Map<K, V> getJavaMap(int i) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public Row getStruct(int i) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public <T> T getAs(int i) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public <T> T getAs(String fieldName) {
-    throw new UnsupportedOperationException();
-  }
-
-  @Override
-  public int fieldIndex(String name) {
-    throw new UnsupportedOperationException();
-  }
 
   @Override
   public Row copy() {
@@ -412,24 +351,4 @@ public Seq<Object> toSeq() {
     }
     return values;
   }
-
-  @Override
-  public String toString() {
-    return mkString("[", ",", "]");
-  }
-
-  @Override
-  public String mkString() {
-    return toSeq().mkString();
-  }
-
-  @Override
-  public String mkString(String sep) {
-    return toSeq().mkString(sep);
-  }
-
-  @Override
-  public String mkString(String start, String sep, String end) {
-    return toSeq().mkString(start, sep, end);
-  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseMutableRow.java b/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseMutableRow.java
new file mode 100644
index 0000000000000..acec2bf4520f2
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseMutableRow.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql;
+
+import org.apache.spark.sql.catalyst.expressions.MutableRow;
+
+public abstract class BaseMutableRow extends BaseRow implements MutableRow {
+
+  @Override
+  public void update(int ordinal, Object value) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void setInt(int ordinal, int value) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void setLong(int ordinal, long value) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void setDouble(int ordinal, double value) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void setBoolean(int ordinal, boolean value) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void setShort(int ordinal, short value) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void setByte(int ordinal, byte value) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void setFloat(int ordinal, float value) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void setString(int ordinal, String value) {
+    throw new UnsupportedOperationException();
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java b/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
new file mode 100644
index 0000000000000..d138b43a3482b
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql;
+
+import java.math.BigDecimal;
+import java.sql.Date;
+import java.util.List;
+
+import scala.collection.Seq;
+import scala.collection.mutable.ArraySeq;
+
+import org.apache.spark.sql.catalyst.expressions.GenericRow;
+import org.apache.spark.sql.types.StructType;
+
+public abstract class BaseRow implements Row {
+
+  @Override
+  final public int length() {
+    return size();
+  }
+
+  @Override
+  public boolean anyNull() {
+    final int n = size();
+    for (int i=0; i < n; i++) {
+      if (isNullAt(i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  @Override
+  public StructType schema() { throw new UnsupportedOperationException(); }
+
+  @Override
+  final public Object apply(int i) {
+    return get(i);
+  }
+
+  @Override
+  public int getInt(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public long getLong(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public float getFloat(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public double getDouble(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public byte getByte(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public short getShort(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public boolean getBoolean(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public String getString(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public BigDecimal getDecimal(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public Date getDate(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public <T> Seq<T> getSeq(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public <T> List<T> getList(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public <K, V> scala.collection.Map<K, V> getMap(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public <T> scala.collection.immutable.Map<String, T> getValuesMap(Seq<String> fieldNames) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public <K, V> java.util.Map<K, V> getJavaMap(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public Row getStruct(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public <T> T getAs(int i) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public <T> T getAs(String fieldName) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public int fieldIndex(String name) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public Row copy() {
+    final int n = size();
+    Object[] arr = new Object[n];
+    for (int i = 0; i < n; i++) {
+      arr[i] = get(i);
+    }
+    return new GenericRow(arr);
+  }
+
+  @Override
+  public Seq<Object> toSeq() {
+    final int n = size();
+    final ArraySeq<Object> values = new ArraySeq<Object>(n);
+    for (int i = 0; i < n; i++) {
+      values.update(i, get(i));
+    }
+    return values;
+  }
+
+  @Override
+  public String toString() {
+    return mkString("[", ",", "]");
+  }
+
+  @Override
+  public String mkString() {
+    return toSeq().mkString();
+  }
+
+  @Override
+  public String mkString(String sep) {
+    return toSeq().mkString(sep);
+  }
+
+  @Override
+  public String mkString(String start, String sep, String end) {
+    return toSeq().mkString(start, sep, end);
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 36964af68dd8d..cd604121b7dd9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -17,10 +17,12 @@
 
 package org.apache.spark.sql.catalyst.expressions.codegen
 
-import com.google.common.cache.{CacheLoader, CacheBuilder}
-
+import scala.collection.mutable
 import scala.language.existentials
 
+import com.google.common.cache.{CacheBuilder, CacheLoader}
+import org.codehaus.janino.ClassBodyEvaluator
+
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
@@ -36,23 +38,15 @@ class LongHashSet extends org.apache.spark.util.collection.OpenHashSet[Long]
  * expressions.
  */
 abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Logging {
-  import scala.reflect.runtime.{universe => ru}
-  import scala.reflect.runtime.universe._
-
-  import scala.tools.reflect.ToolBox
-
-  protected val toolBox = runtimeMirror(getClass.getClassLoader).mkToolBox()
 
-  protected val rowType = typeOf[Row]
-  protected val mutableRowType = typeOf[MutableRow]
-  protected val genericRowType = typeOf[GenericRow]
-  protected val genericMutableRowType = typeOf[GenericMutableRow]
-
-  protected val projectionType = typeOf[Projection]
-  protected val mutableProjectionType = typeOf[MutableProjection]
+  protected val rowType = classOf[Row].getName
+  protected val stringType = classOf[UTF8String].getName
+  protected val decimalType = classOf[Decimal].getName
+  protected val exprType = classOf[Expression].getName
+  protected val mutableRowType = classOf[MutableRow].getName
+  protected val genericMutableRowType = classOf[GenericMutableRow].getName
 
   private val curId = new java.util.concurrent.atomic.AtomicInteger()
-  private val javaSeparator = "$"
 
   /**
    * Can be flipped on manually in the console to add (expensive) expression evaluation trace code.
@@ -74,6 +68,20 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
   /** Binds an input expression to a given input schema */
   protected def bind(in: InType, inputSchema: Seq[Attribute]): InType
 
+  /**
+   * Compile the Java source code into a Java class, using Janino.
+   *
+   * It will track the time used to compile
+   */
+  protected def compile(code: String): Class[_] = {
+    val startTime = System.nanoTime()
+    val clazz = new ClassBodyEvaluator(code).getClazz()
+    val endTime = System.nanoTime()
+    def timeMs: Double = (endTime - startTime).toDouble / 1000000
+    logDebug(s"Compiled Java code (${code.size} bytes) in $timeMs ms")
+    clazz
+  }
+
   /**
    * A cache of generated classes.
    *
@@ -87,7 +95,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
     .maximumSize(1000)
     .build(
       new CacheLoader[InType, OutType]() {
-        override def load(in: InType): OutType = globalLock.synchronized {
+        override def load(in: InType): OutType = {
           val startTime = System.nanoTime()
           val result = create(in)
           val endTime = System.nanoTime()
@@ -110,8 +118,8 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
    * (Since we aren't in a macro context we do not seem to have access to the built in `freshName`
    * function.)
    */
-  protected def freshName(prefix: String): TermName = {
-    newTermName(s"$prefix$javaSeparator${curId.getAndIncrement}")
+  protected def freshName(prefix: String): String = {
+    s"$prefix${curId.getAndIncrement}"
   }
 
   /**
@@ -125,32 +133,51 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
    * @param objectTerm A possibly boxed version of the result of evaluating this expression.
    */
   protected case class EvaluatedExpression(
-      code: Seq[Tree],
-      nullTerm: TermName,
-      primitiveTerm: TermName,
-      objectTerm: TermName)
+      code: String,
+      nullTerm: String,
+      primitiveTerm: String,
+      objectTerm: String)
+
+  /**
+   * A context for codegen, which is used to bookkeeping the expressions those are not supported
+   * by codegen, then they are evaluated directly. The unsupported expression is appended at the
+   * end of `references`, the position of it is kept in the code, used to access and evaluate it.
+   */
+  protected class CodeGenContext {
+    /**
+     * Holding all the expressions those do not support codegen, will be evaluated directly.
+     */
+    val references: mutable.ArrayBuffer[Expression] = new mutable.ArrayBuffer[Expression]()
+  }
+
+  /**
+   * Create a new codegen context for expression evaluator, used to store those
+   * expressions that don't support codegen
+   */
+  def newCodeGenContext(): CodeGenContext = {
+    new CodeGenContext()
+  }
 
   /**
    * Given an expression tree returns an [[EvaluatedExpression]], which contains Scala trees that
    * can be used to determine the result of evaluating the expression on an input row.
    */
-  def expressionEvaluator(e: Expression): EvaluatedExpression = {
+  def expressionEvaluator(e: Expression, ctx: CodeGenContext): EvaluatedExpression = {
     val primitiveTerm = freshName("primitiveTerm")
     val nullTerm = freshName("nullTerm")
     val objectTerm = freshName("objectTerm")
 
     implicit class Evaluate1(e: Expression) {
-      def castOrNull(f: TermName => Tree, dataType: DataType): Seq[Tree] = {
-        val eval = expressionEvaluator(e)
-        eval.code ++
-        q"""
-          val $nullTerm = ${eval.nullTerm}
-          val $primitiveTerm =
-            if($nullTerm)
-              ${defaultPrimitive(dataType)}
-            else
-              ${f(eval.primitiveTerm)}
-        """.children
+      def castOrNull(f: String => String, dataType: DataType): String = {
+        val eval = expressionEvaluator(e, ctx)
+        eval.code +
+        s"""
+          boolean $nullTerm = ${eval.nullTerm};
+          ${primitiveForType(dataType)} $primitiveTerm = ${defaultPrimitive(dataType)};
+          if (!$nullTerm) {
+            $primitiveTerm = ${f(eval.primitiveTerm)};
+          }
+        """
       }
     }
 
@@ -163,529 +190,505 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
        *
        * @param f a function from two primitive term names to a tree that evaluates them.
        */
-      def evaluate(f: (TermName, TermName) => Tree): Seq[Tree] =
+      def evaluate(f: (String, String) => String): String =
         evaluateAs(expressions._1.dataType)(f)
 
-      def evaluateAs(resultType: DataType)(f: (TermName, TermName) => Tree): Seq[Tree] = {
+      def evaluateAs(resultType: DataType)(f: (String, String) => String): String = {
         // TODO: Right now some timestamp tests fail if we enforce this...
         if (expressions._1.dataType != expressions._2.dataType) {
           log.warn(s"${expressions._1.dataType} != ${expressions._2.dataType}")
         }
 
-        val eval1 = expressionEvaluator(expressions._1)
-        val eval2 = expressionEvaluator(expressions._2)
+        val eval1 = expressionEvaluator(expressions._1, ctx)
+        val eval2 = expressionEvaluator(expressions._2, ctx)
         val resultCode = f(eval1.primitiveTerm, eval2.primitiveTerm)
 
-        eval1.code ++ eval2.code ++
-        q"""
-          val $nullTerm = ${eval1.nullTerm} || ${eval2.nullTerm}
-          val $primitiveTerm: ${termForType(resultType)} =
-            if($nullTerm) {
-              ${defaultPrimitive(resultType)}
-            } else {
-              $resultCode.asInstanceOf[${termForType(resultType)}]
-            }
-        """.children : Seq[Tree]
+        eval1.code + eval2.code +
+        s"""
+          boolean $nullTerm = ${eval1.nullTerm} || ${eval2.nullTerm};
+          ${primitiveForType(resultType)} $primitiveTerm = ${defaultPrimitive(resultType)};
+          if(!$nullTerm) {
+            $primitiveTerm = (${primitiveForType(resultType)})($resultCode);
+          }
+        """
       }
     }
 
-    val inputTuple = newTermName(s"i")
+    val inputTuple = "i"
 
     // TODO: Skip generation of null handling code when expression are not nullable.
-    val primitiveEvaluation: PartialFunction[Expression, Seq[Tree]] = {
+    val primitiveEvaluation: PartialFunction[Expression, String] = {
       case b @ BoundReference(ordinal, dataType, nullable) =>
-        val nullValue = q"$inputTuple.isNullAt($ordinal)"
-        q"""
-          val $nullTerm: Boolean = $nullValue
-          val $primitiveTerm: ${termForType(dataType)} =
-            if($nullTerm)
-              ${defaultPrimitive(dataType)}
-            else
-              ${getColumn(inputTuple, dataType, ordinal)}
-         """.children
+        s"""
+          final boolean $nullTerm = $inputTuple.isNullAt($ordinal);
+          final ${primitiveForType(dataType)} $primitiveTerm = $nullTerm ?
+              ${defaultPrimitive(dataType)} : (${getColumn(inputTuple, dataType, ordinal)});
+         """
 
       case expressions.Literal(null, dataType) =>
-        q"""
-          val $nullTerm = true
-          val $primitiveTerm: ${termForType(dataType)} = null.asInstanceOf[${termForType(dataType)}]
-         """.children
-
-      case expressions.Literal(value: Boolean, dataType) =>
-        q"""
-          val $nullTerm = ${value == null}
-          val $primitiveTerm: ${termForType(dataType)} = $value
-         """.children
-
-      case expressions.Literal(value: UTF8String, dataType) =>
-        q"""
-          val $nullTerm = ${value == null}
-          val $primitiveTerm: ${termForType(dataType)} =
-            org.apache.spark.sql.types.UTF8String(${value.getBytes})
-         """.children
-
-      case expressions.Literal(value: Int, dataType) =>
-        q"""
-          val $nullTerm = ${value == null}
-          val $primitiveTerm: ${termForType(dataType)} = $value
-         """.children
-
-      case expressions.Literal(value: Long, dataType) =>
-        q"""
-          val $nullTerm = ${value == null}
-          val $primitiveTerm: ${termForType(dataType)} = $value
-         """.children
-
-      case Cast(e @ BinaryType(), StringType) =>
-        val eval = expressionEvaluator(e)
-        eval.code ++
-        q"""
-          val $nullTerm = ${eval.nullTerm}
-          val $primitiveTerm =
-            if($nullTerm)
-              ${defaultPrimitive(StringType)}
-            else
-              org.apache.spark.sql.types.UTF8String(${eval.primitiveTerm}.asInstanceOf[Array[Byte]])
-        """.children
+        s"""
+          final boolean $nullTerm = true;
+          ${primitiveForType(dataType)} $primitiveTerm = ${defaultPrimitive(dataType)};
+        """
+
+      case expressions.Literal(value: UTF8String, StringType) =>
+        val arr = s"new byte[]{${value.getBytes.map(_.toString).mkString(", ")}}"
+        s"""
+          final boolean $nullTerm = false;
+          ${stringType} $primitiveTerm =
+            new ${stringType}().set(${arr});
+         """
+
+      case expressions.Literal(value, FloatType) =>
+        s"""
+          final boolean $nullTerm = false;
+          float $primitiveTerm = ${value}f;
+         """
+
+      case expressions.Literal(value, dt @ DecimalType()) =>
+        s"""
+          final boolean $nullTerm = false;
+          ${primitiveForType(dt)} $primitiveTerm = new ${primitiveForType(dt)}().set($value);
+         """
+
+      case expressions.Literal(value, dataType) =>
+        s"""
+          final boolean $nullTerm = false;
+          ${primitiveForType(dataType)} $primitiveTerm = $value;
+         """
+
+      case Cast(child @ BinaryType(), StringType) =>
+        child.castOrNull(c =>
+          s"new ${stringType}().set($c)",
+          StringType)
 
       case Cast(child @ DateType(), StringType) =>
         child.castOrNull(c =>
-          q"""org.apache.spark.sql.types.UTF8String(
+          s"""new ${stringType}().set(
                 org.apache.spark.sql.catalyst.util.DateUtils.toString($c))""",
           StringType)
 
-      case Cast(child @ NumericType(), IntegerType) =>
-        child.castOrNull(c => q"$c.toInt", IntegerType)
+      case Cast(child @ BooleanType(), dt: NumericType)  if !dt.isInstanceOf[DecimalType] =>
+        child.castOrNull(c => s"(${primitiveForType(dt)})($c?1:0)", dt)
 
-      case Cast(child @ NumericType(), LongType) =>
-        child.castOrNull(c => q"$c.toLong", LongType)
+      case Cast(child @ DecimalType(), IntegerType) =>
+        child.castOrNull(c => s"($c).toInt()", IntegerType)
 
-      case Cast(child @ NumericType(), DoubleType) =>
-        child.castOrNull(c => q"$c.toDouble", DoubleType)
+      case Cast(child @ DecimalType(), dt: NumericType) if !dt.isInstanceOf[DecimalType] =>
+        child.castOrNull(c => s"($c).to${termForType(dt)}()", dt)
 
-      case Cast(child @ NumericType(), FloatType) =>
-        child.castOrNull(c => q"$c.toFloat", FloatType)
+      case Cast(child @ NumericType(), dt: NumericType) if !dt.isInstanceOf[DecimalType] =>
+        child.castOrNull(c => s"(${primitiveForType(dt)})($c)", dt)
 
       // Special handling required for timestamps in hive test cases since the toString function
       // does not match the expected output.
       case Cast(e, StringType) if e.dataType != TimestampType =>
-        val eval = expressionEvaluator(e)
-        eval.code ++
-        q"""
-          val $nullTerm = ${eval.nullTerm}
-          val $primitiveTerm =
-            if($nullTerm)
-              ${defaultPrimitive(StringType)}
-            else
-              org.apache.spark.sql.types.UTF8String(${eval.primitiveTerm}.toString)
-        """.children
+        e.castOrNull(c =>
+          s"new ${stringType}().set(String.valueOf($c))",
+          StringType)
 
       case EqualTo(e1 @ BinaryType(), e2 @ BinaryType()) =>
         (e1, e2).evaluateAs (BooleanType) {
           case (eval1, eval2) =>
-            q"""
-              java.util.Arrays.equals($eval1.asInstanceOf[Array[Byte]],
-                 $eval2.asInstanceOf[Array[Byte]])
-            """
+            s"java.util.Arrays.equals((byte[])$eval1, (byte[])$eval2)"
         }
 
       case EqualTo(e1, e2) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => q"$eval1 == $eval2" }
-
-      /* TODO: Fix null semantics.
-      case In(e1, list) if !list.exists(!_.isInstanceOf[expressions.Literal]) =>
-        val eval = expressionEvaluator(e1)
-
-        val checks = list.map {
-          case expressions.Literal(v: String, dataType) =>
-            q"if(${eval.primitiveTerm} == $v) return true"
-          case expressions.Literal(v: Int, dataType) =>
-            q"if(${eval.primitiveTerm} == $v) return true"
-        }
-
-        val funcName = newTermName(s"isIn${curId.getAndIncrement()}")
-
-        q"""
-            def $funcName: Boolean = {
-              ..${eval.code}
-              if(${eval.nullTerm}) return false
-              ..$checks
-              return false
-            }
-            val $nullTerm = false
-            val $primitiveTerm = $funcName
-        """.children
-      */
+        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 == $eval2" }
 
       case GreaterThan(e1 @ NumericType(), e2 @ NumericType()) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => q"$eval1 > $eval2" }
+        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 > $eval2" }
       case GreaterThanOrEqual(e1 @ NumericType(), e2 @ NumericType()) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => q"$eval1 >= $eval2" }
+        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 >= $eval2" }
       case LessThan(e1 @ NumericType(), e2 @ NumericType()) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => q"$eval1 < $eval2" }
+        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 < $eval2" }
       case LessThanOrEqual(e1 @ NumericType(), e2 @ NumericType()) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => q"$eval1 <= $eval2" }
+        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 <= $eval2" }
 
       case And(e1, e2) =>
-        val eval1 = expressionEvaluator(e1)
-        val eval2 = expressionEvaluator(e2)
-
-        q"""
-          ..${eval1.code}
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(BooleanType)} = false
-
-          if (!${eval1.nullTerm} && ${eval1.primitiveTerm} == false) {
+        val eval1 = expressionEvaluator(e1, ctx)
+        val eval2 = expressionEvaluator(e2, ctx)
+        s"""
+          ${eval1.code}
+          boolean $nullTerm = false;
+          boolean $primitiveTerm  = false;
+
+          if (!${eval1.nullTerm} && !${eval1.primitiveTerm}) {
           } else {
-            ..${eval2.code}
-            if (!${eval2.nullTerm} && ${eval2.primitiveTerm} == false) {
+            ${eval2.code}
+            if (!${eval2.nullTerm} && !${eval2.primitiveTerm}) {
             } else if (!${eval1.nullTerm} && !${eval2.nullTerm}) {
-              $primitiveTerm = true
+              $primitiveTerm = true;
             } else {
-              $nullTerm = true
+              $nullTerm = true;
             }
           }
-         """.children
+         """
 
       case Or(e1, e2) =>
-        val eval1 = expressionEvaluator(e1)
-        val eval2 = expressionEvaluator(e2)
+        val eval1 = expressionEvaluator(e1, ctx)
+        val eval2 = expressionEvaluator(e2, ctx)
 
-        q"""
-          ..${eval1.code}
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(BooleanType)} = false
+        s"""
+          ${eval1.code}
+          boolean $nullTerm = false;
+          boolean $primitiveTerm = false;
 
           if (!${eval1.nullTerm} && ${eval1.primitiveTerm}) {
-            $primitiveTerm = true
+            $primitiveTerm = true;
           } else {
-            ..${eval2.code}
+            ${eval2.code}
             if (!${eval2.nullTerm} && ${eval2.primitiveTerm}) {
-              $primitiveTerm = true
+              $primitiveTerm = true;
             } else if (!${eval1.nullTerm} && !${eval2.nullTerm}) {
-              $primitiveTerm = false
+              $primitiveTerm = false;
             } else {
-              $nullTerm = true
+              $nullTerm = true;
             }
           }
-         """.children
+         """
 
       case Not(child) =>
         // Uh, bad function name...
-        child.castOrNull(c => q"!$c", BooleanType)
-
-      case Add(e1, e2) => (e1, e2) evaluate { case (eval1, eval2) => q"$eval1 + $eval2" }
-      case Subtract(e1, e2) => (e1, e2) evaluate { case (eval1, eval2) => q"$eval1 - $eval2" }
-      case Multiply(e1, e2) => (e1, e2) evaluate { case (eval1, eval2) => q"$eval1 * $eval2" }
+        child.castOrNull(c => s"!$c", BooleanType)
+
+      case Add(e1 @ DecimalType(), e2 @ DecimalType()) =>
+        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1.$$plus($eval2)" }
+      case Subtract(e1 @ DecimalType(), e2 @ DecimalType()) =>
+        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1.$$minus($eval2)" }
+      case Multiply(e1 @ DecimalType(), e2 @ DecimalType()) =>
+        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1.$$times($eval2)" }
+      case Divide(e1 @ DecimalType(), e2 @ DecimalType()) =>
+        val eval1 = expressionEvaluator(e1, ctx)
+        val eval2 = expressionEvaluator(e2, ctx)
+        eval1.code + eval2.code +
+          s"""
+          boolean $nullTerm = false;
+          ${primitiveForType(e1.dataType)} $primitiveTerm = null;
+          if (${eval1.nullTerm} || ${eval2.nullTerm} || ${eval2.primitiveTerm}.isZero()) {
+            $nullTerm = true;
+          } else {
+            $primitiveTerm = ${eval1.primitiveTerm}.$$div${eval2.primitiveTerm});
+          }
+          """
+      case Remainder(e1 @ DecimalType(), e2 @ DecimalType()) =>
+        val eval1 = expressionEvaluator(e1, ctx)
+        val eval2 = expressionEvaluator(e2, ctx)
+        eval1.code + eval2.code +
+          s"""
+          boolean $nullTerm = false;
+          ${primitiveForType(e1.dataType)} $primitiveTerm = 0;
+          if (${eval1.nullTerm} || ${eval2.nullTerm} || ${eval2.primitiveTerm}.isZero()) {
+            $nullTerm = true;
+          } else {
+            $primitiveTerm = ${eval1.primitiveTerm}.remainder(${eval2.primitiveTerm});
+          }
+         """
+
+      case Add(e1, e2) =>
+        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1 + $eval2" }
+      case Subtract(e1, e2) =>
+        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1 - $eval2" }
+      case Multiply(e1, e2) =>
+        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1 * $eval2" }
       case Divide(e1, e2) =>
-        val eval1 = expressionEvaluator(e1)
-        val eval2 = expressionEvaluator(e2)
-
-        eval1.code ++ eval2.code ++
-        q"""
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(e1.dataType)} = 0
-
-          if (${eval1.nullTerm} || ${eval2.nullTerm} ) {
-            $nullTerm = true
-          } else if (${eval2.primitiveTerm} == 0)
-            $nullTerm = true
-          else {
-            $primitiveTerm = ${eval1.primitiveTerm} / ${eval2.primitiveTerm}
+        val eval1 = expressionEvaluator(e1, ctx)
+        val eval2 = expressionEvaluator(e2, ctx)
+        eval1.code + eval2.code +
+        s"""
+          boolean $nullTerm = false;
+          ${primitiveForType(e1.dataType)} $primitiveTerm = 0;
+          if (${eval1.nullTerm} || ${eval2.nullTerm} || ${eval2.primitiveTerm} == 0) {
+            $nullTerm = true;
+          } else {
+            $primitiveTerm = ${eval1.primitiveTerm} / ${eval2.primitiveTerm};
           }
-         """.children
-
+        """
       case Remainder(e1, e2) =>
-        val eval1 = expressionEvaluator(e1)
-        val eval2 = expressionEvaluator(e2)
-
-        eval1.code ++ eval2.code ++
-        q"""
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(e1.dataType)} = 0
-
-          if (${eval1.nullTerm} || ${eval2.nullTerm} ) {
-            $nullTerm = true
-          } else if (${eval2.primitiveTerm} == 0)
-            $nullTerm = true
-          else {
-            $nullTerm = false
-            $primitiveTerm = ${eval1.primitiveTerm} % ${eval2.primitiveTerm}
+        val eval1 = expressionEvaluator(e1, ctx)
+        val eval2 = expressionEvaluator(e2, ctx)
+        eval1.code + eval2.code +
+        s"""
+          boolean $nullTerm = false;
+          ${primitiveForType(e1.dataType)} $primitiveTerm = 0;
+          if (${eval1.nullTerm} || ${eval2.nullTerm} || ${eval2.primitiveTerm} == 0) {
+            $nullTerm = true;
+          } else {
+            $primitiveTerm = ${eval1.primitiveTerm} % ${eval2.primitiveTerm};
           }
-         """.children
+         """
 
       case IsNotNull(e) =>
-        val eval = expressionEvaluator(e)
-        q"""
-          ..${eval.code}
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(BooleanType)} = !${eval.nullTerm}
-        """.children
+        val eval = expressionEvaluator(e, ctx)
+        s"""
+          ${eval.code}
+          boolean $nullTerm = false;
+          boolean $primitiveTerm = !${eval.nullTerm};
+        """
 
       case IsNull(e) =>
-        val eval = expressionEvaluator(e)
-        q"""
-          ..${eval.code}
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(BooleanType)} = ${eval.nullTerm}
-        """.children
-
-      case c @ Coalesce(children) =>
-        q"""
-          var $nullTerm = true
-          var $primitiveTerm: ${termForType(c.dataType)} = ${defaultPrimitive(c.dataType)}
-        """.children ++
+        val eval = expressionEvaluator(e, ctx)
+        s"""
+          ${eval.code}
+          boolean $nullTerm = false;
+          boolean $primitiveTerm = ${eval.nullTerm};
+        """
+
+      case e @ Coalesce(children) =>
+        s"""
+          boolean $nullTerm = true;
+          ${primitiveForType(e.dataType)} $primitiveTerm = ${defaultPrimitive(e.dataType)};
+        """ +
         children.map { c =>
-          val eval = expressionEvaluator(c)
-          q"""
+          val eval = expressionEvaluator(c, ctx)
+          s"""
             if($nullTerm) {
-              ..${eval.code}
+              ${eval.code}
               if(!${eval.nullTerm}) {
-                $nullTerm = false
-                $primitiveTerm = ${eval.primitiveTerm}
+                $nullTerm = false;
+                $primitiveTerm = ${eval.primitiveTerm};
               }
             }
           """
-        }
+        }.mkString("\n")
 
-      case i @ expressions.If(condition, trueValue, falseValue) =>
-        val condEval = expressionEvaluator(condition)
-        val trueEval = expressionEvaluator(trueValue)
-        val falseEval = expressionEvaluator(falseValue)
+      case e @ expressions.If(condition, trueValue, falseValue) =>
+        val condEval = expressionEvaluator(condition, ctx)
+        val trueEval = expressionEvaluator(trueValue, ctx)
+        val falseEval = expressionEvaluator(falseValue, ctx)
 
-        q"""
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(i.dataType)} = ${defaultPrimitive(i.dataType)}
-          ..${condEval.code}
+        s"""
+          boolean $nullTerm = false;
+          ${primitiveForType(e.dataType)} $primitiveTerm = ${defaultPrimitive(e.dataType)};
+          ${condEval.code}
           if(!${condEval.nullTerm} && ${condEval.primitiveTerm}) {
-            ..${trueEval.code}
-            $nullTerm = ${trueEval.nullTerm}
-            $primitiveTerm = ${trueEval.primitiveTerm}
+            ${trueEval.code}
+            $nullTerm = ${trueEval.nullTerm};
+            $primitiveTerm = ${trueEval.primitiveTerm};
           } else {
-            ..${falseEval.code}
-            $nullTerm = ${falseEval.nullTerm}
-            $primitiveTerm = ${falseEval.primitiveTerm}
+            ${falseEval.code}
+            $nullTerm = ${falseEval.nullTerm};
+            $primitiveTerm = ${falseEval.primitiveTerm};
           }
-        """.children
+        """
 
       case NewSet(elementType) =>
-        q"""
-          val $nullTerm = false
-          val $primitiveTerm = new ${hashSetForType(elementType)}()
-        """.children
+        s"""
+          boolean $nullTerm = false;
+          ${hashSetForType(elementType)} $primitiveTerm = new ${hashSetForType(elementType)}();
+        """
 
       case AddItemToSet(item, set) =>
-        val itemEval = expressionEvaluator(item)
-        val setEval = expressionEvaluator(set)
+        val itemEval = expressionEvaluator(item, ctx)
+        val setEval = expressionEvaluator(set, ctx)
 
         val elementType = set.dataType.asInstanceOf[OpenHashSetUDT].elementType
+        val htype = hashSetForType(elementType)
 
-        itemEval.code ++ setEval.code ++
-        q"""
-           if (!${itemEval.nullTerm}) {
-             ${setEval.primitiveTerm}
-               .asInstanceOf[${hashSetForType(elementType)}]
-               .add(${itemEval.primitiveTerm})
+        itemEval.code + setEval.code +
+        s"""
+           if (!${itemEval.nullTerm} && !${setEval.nullTerm}) {
+             (($htype)${setEval.primitiveTerm}).add(${itemEval.primitiveTerm});
            }
-
-           val $nullTerm = false
-           val $primitiveTerm = ${setEval.primitiveTerm}
-         """.children
+           boolean $nullTerm = false;
+           ${htype} $primitiveTerm = ($htype)${setEval.primitiveTerm};
+         """
 
       case CombineSets(left, right) =>
-        val leftEval = expressionEvaluator(left)
-        val rightEval = expressionEvaluator(right)
+        val leftEval = expressionEvaluator(left, ctx)
+        val rightEval = expressionEvaluator(right, ctx)
 
         val elementType = left.dataType.asInstanceOf[OpenHashSetUDT].elementType
+        val htype = hashSetForType(elementType)
 
-        leftEval.code ++ rightEval.code ++
-        q"""
-          val $nullTerm = false
-          var $primitiveTerm: ${hashSetForType(elementType)} = null
-
-          {
-            val leftSet = ${leftEval.primitiveTerm}.asInstanceOf[${hashSetForType(elementType)}]
-            val rightSet = ${rightEval.primitiveTerm}.asInstanceOf[${hashSetForType(elementType)}]
-            val iterator = rightSet.iterator
-            while (iterator.hasNext) {
-              leftSet.add(iterator.next())
-            }
-            $primitiveTerm = leftSet
-          }
-        """.children
+        leftEval.code + rightEval.code +
+        s"""
+          boolean $nullTerm = false;
+          ${htype} $primitiveTerm =
+            (${htype})${leftEval.primitiveTerm};
+          $primitiveTerm.union((${htype})${rightEval.primitiveTerm});
+        """
 
-      case MaxOf(e1, e2) =>
-        val eval1 = expressionEvaluator(e1)
-        val eval2 = expressionEvaluator(e2)
+      case MaxOf(e1, e2) if !e1.dataType.isInstanceOf[DecimalType] =>
+        val eval1 = expressionEvaluator(e1, ctx)
+        val eval2 = expressionEvaluator(e2, ctx)
 
-        eval1.code ++ eval2.code ++
-        q"""
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(e1.dataType)} = ${defaultPrimitive(e1.dataType)}
+        eval1.code + eval2.code +
+        s"""
+          boolean $nullTerm = false;
+          ${primitiveForType(e1.dataType)} $primitiveTerm = ${defaultPrimitive(e1.dataType)};
 
           if (${eval1.nullTerm}) {
-            $nullTerm = ${eval2.nullTerm}
-            $primitiveTerm = ${eval2.primitiveTerm}
+            $nullTerm = ${eval2.nullTerm};
+            $primitiveTerm = ${eval2.primitiveTerm};
           } else if (${eval2.nullTerm}) {
-            $nullTerm = ${eval1.nullTerm}
-            $primitiveTerm = ${eval1.primitiveTerm}
+            $nullTerm = ${eval1.nullTerm};
+            $primitiveTerm = ${eval1.primitiveTerm};
           } else {
             if (${eval1.primitiveTerm} > ${eval2.primitiveTerm}) {
-              $primitiveTerm = ${eval1.primitiveTerm}
+              $primitiveTerm = ${eval1.primitiveTerm};
             } else {
-              $primitiveTerm = ${eval2.primitiveTerm}
+              $primitiveTerm = ${eval2.primitiveTerm};
             }
           }
-        """.children
+        """
 
-      case MinOf(e1, e2) =>
-        val eval1 = expressionEvaluator(e1)
-        val eval2 = expressionEvaluator(e2)
+      case MinOf(e1, e2) if !e1.dataType.isInstanceOf[DecimalType] =>
+        val eval1 = expressionEvaluator(e1, ctx)
+        val eval2 = expressionEvaluator(e2, ctx)
 
-        eval1.code ++ eval2.code ++
-        q"""
-          var $nullTerm = false
-          var $primitiveTerm: ${termForType(e1.dataType)} = ${defaultPrimitive(e1.dataType)}
+        eval1.code + eval2.code +
+        s"""
+          boolean $nullTerm = false;
+          ${primitiveForType(e1.dataType)} $primitiveTerm = ${defaultPrimitive(e1.dataType)};
 
           if (${eval1.nullTerm}) {
-            $nullTerm = ${eval2.nullTerm}
-            $primitiveTerm = ${eval2.primitiveTerm}
+            $nullTerm = ${eval2.nullTerm};
+            $primitiveTerm = ${eval2.primitiveTerm};
           } else if (${eval2.nullTerm}) {
-            $nullTerm = ${eval1.nullTerm}
-            $primitiveTerm = ${eval1.primitiveTerm}
+            $nullTerm = ${eval1.nullTerm};
+            $primitiveTerm = ${eval1.primitiveTerm};
           } else {
             if (${eval1.primitiveTerm} < ${eval2.primitiveTerm}) {
-              $primitiveTerm = ${eval1.primitiveTerm}
+              $primitiveTerm = ${eval1.primitiveTerm};
             } else {
-              $primitiveTerm = ${eval2.primitiveTerm}
+              $primitiveTerm = ${eval2.primitiveTerm};
             }
           }
-        """.children
+        """
 
       case UnscaledValue(child) =>
-        val childEval = expressionEvaluator(child)
-
-        childEval.code ++
-        q"""
-         var $nullTerm = ${childEval.nullTerm}
-         var $primitiveTerm: Long = if (!$nullTerm) {
-           ${childEval.primitiveTerm}.toUnscaledLong
-         } else {
-           ${defaultPrimitive(LongType)}
-         }
-         """.children
+        val childEval = expressionEvaluator(child, ctx)
+
+        childEval.code +
+        s"""
+         boolean $nullTerm = ${childEval.nullTerm};
+         long $primitiveTerm = $nullTerm ? -1 : ${childEval.primitiveTerm}.toUnscaledLong();
+         """
 
       case MakeDecimal(child, precision, scale) =>
-        val childEval = expressionEvaluator(child)
+        val eval = expressionEvaluator(child, ctx)
 
-        childEval.code ++
-        q"""
-         var $nullTerm = ${childEval.nullTerm}
-         var $primitiveTerm: org.apache.spark.sql.types.Decimal =
-           ${defaultPrimitive(DecimalType())}
+        eval.code +
+        s"""
+         boolean $nullTerm = ${eval.nullTerm};
+         org.apache.spark.sql.types.Decimal $primitiveTerm = ${defaultPrimitive(DecimalType())};
 
          if (!$nullTerm) {
-           $primitiveTerm = new org.apache.spark.sql.types.Decimal()
-           $primitiveTerm = $primitiveTerm.setOrNull(${childEval.primitiveTerm}, $precision, $scale)
-           $nullTerm = $primitiveTerm == null
+           $primitiveTerm = new org.apache.spark.sql.types.Decimal();
+           $primitiveTerm = $primitiveTerm.setOrNull(${eval.primitiveTerm}, $precision, $scale);
+           $nullTerm = $primitiveTerm == null;
          }
-         """.children
+         """
     }
 
     // If there was no match in the partial function above, we fall back on calling the interpreted
     // expression evaluator.
-    val code: Seq[Tree] =
+    val code: String =
       primitiveEvaluation.lift.apply(e).getOrElse {
-        log.debug(s"No rules to generate $e")
-        val tree = reify { e }
-        q"""
-          val $objectTerm = $tree.eval(i)
-          val $nullTerm = $objectTerm == null
-          val $primitiveTerm = $objectTerm.asInstanceOf[${termForType(e.dataType)}]
-         """.children
-      }
-
-    // Only inject debugging code if debugging is turned on.
-    val debugCode =
-      if (debugLogging) {
-        val localLogger = log
-        val localLoggerTree = reify { localLogger }
-        q"""
-          $localLoggerTree.debug(
-            ${e.toString} + ": " + (if ($nullTerm) "null" else $primitiveTerm.toString))
-        """ :: Nil
-      } else {
-        Nil
+        logError(s"No rules to generate $e")
+        ctx.references += e
+        s"""
+          /* expression: ${e} */
+          Object $objectTerm = expressions[${ctx.references.size - 1}].eval(i);
+          boolean $nullTerm = $objectTerm == null;
+          ${primitiveForType(e.dataType)} $primitiveTerm = ${defaultPrimitive(e.dataType)};
+          if (!$nullTerm) $primitiveTerm = (${termForType(e.dataType)})$objectTerm;
+         """
       }
 
-    EvaluatedExpression(code ++ debugCode, nullTerm, primitiveTerm, objectTerm)
+    EvaluatedExpression(code, nullTerm, primitiveTerm, objectTerm)
   }
 
-  protected def getColumn(inputRow: TermName, dataType: DataType, ordinal: Int) = {
+  protected def getColumn(inputRow: String, dataType: DataType, ordinal: Int) = {
     dataType match {
-      case StringType => q"$inputRow($ordinal).asInstanceOf[org.apache.spark.sql.types.UTF8String]"
-      case dt: DataType if isNativeType(dt) => q"$inputRow.${accessorForType(dt)}($ordinal)"
-      case _ => q"$inputRow.apply($ordinal).asInstanceOf[${termForType(dataType)}]"
+      case StringType => s"(${stringType})$inputRow.apply($ordinal)"
+      case dt: DataType if isNativeType(dt) => s"$inputRow.${accessorForType(dt)}($ordinal)"
+      case _ => s"(${termForType(dataType)})$inputRow.apply($ordinal)"
     }
   }
 
   protected def setColumn(
-      destinationRow: TermName,
+      destinationRow: String,
       dataType: DataType,
       ordinal: Int,
-      value: TermName) = {
+      value: String): String = {
     dataType match {
-      case StringType => q"$destinationRow.update($ordinal, $value)"
+      case StringType => s"$destinationRow.update($ordinal, $value)"
       case dt: DataType if isNativeType(dt) =>
-        q"$destinationRow.${mutatorForType(dt)}($ordinal, $value)"
-      case _ => q"$destinationRow.update($ordinal, $value)"
+        s"$destinationRow.${mutatorForType(dt)}($ordinal, $value)"
+      case _ => s"$destinationRow.update($ordinal, $value)"
     }
   }
 
-  protected def accessorForType(dt: DataType) = newTermName(s"get${primitiveForType(dt)}")
-  protected def mutatorForType(dt: DataType) = newTermName(s"set${primitiveForType(dt)}")
+  protected def accessorForType(dt: DataType) = dt match {
+    case IntegerType => "getInt"
+    case other => s"get${termForType(dt)}"
+  }
+
+  protected def mutatorForType(dt: DataType) = dt match {
+    case IntegerType => "setInt"
+    case other => s"set${termForType(dt)}"
+  }
 
-  protected def hashSetForType(dt: DataType) = dt match {
-    case IntegerType => typeOf[IntegerHashSet]
-    case LongType => typeOf[LongHashSet]
+  protected def hashSetForType(dt: DataType): String = dt match {
+    case IntegerType => classOf[IntegerHashSet].getName
+    case LongType => classOf[LongHashSet].getName
     case unsupportedType =>
       sys.error(s"Code generation not support for hashset of type $unsupportedType")
   }
 
-  protected def primitiveForType(dt: DataType) = dt match {
-    case IntegerType => "Int"
+  protected def primitiveForType(dt: DataType): String = dt match {
+    case IntegerType => "int"
+    case LongType => "long"
+    case ShortType => "short"
+    case ByteType => "byte"
+    case DoubleType => "double"
+    case FloatType => "float"
+    case BooleanType => "boolean"
+    case dt: DecimalType => decimalType
+    case BinaryType => "byte[]"
+    case StringType => stringType
+    case DateType => "int"
+    case TimestampType => "java.sql.Timestamp"
+    case _ => "Object"
+  }
+
+  protected def defaultPrimitive(dt: DataType): String = dt match {
+    case BooleanType => "false"
+    case FloatType => "-1.0f"
+    case ShortType => "-1"
+    case LongType => "-1"
+    case ByteType => "-1"
+    case DoubleType => "-1.0"
+    case IntegerType => "-1"
+    case DateType => "-1"
+    case dt: DecimalType => "null"
+    case StringType => "null"
+    case _ => "null"
+  }
+
+  protected def termForType(dt: DataType): String = dt match {
+    case IntegerType => "Integer"
     case LongType => "Long"
     case ShortType => "Short"
     case ByteType => "Byte"
     case DoubleType => "Double"
     case FloatType => "Float"
     case BooleanType => "Boolean"
-    case StringType => "org.apache.spark.sql.types.UTF8String"
-  }
-
-  protected def defaultPrimitive(dt: DataType) = dt match {
-    case BooleanType => ru.Literal(Constant(false))
-    case FloatType => ru.Literal(Constant(-1.0.toFloat))
-    case StringType => q"""org.apache.spark.sql.types.UTF8String("<uninit>")"""
-    case ShortType => ru.Literal(Constant(-1.toShort))
-    case LongType => ru.Literal(Constant(-1L))
-    case ByteType => ru.Literal(Constant(-1.toByte))
-    case DoubleType => ru.Literal(Constant(-1.toDouble))
-    case DecimalType() => q"org.apache.spark.sql.types.Decimal(-1)"
-    case IntegerType => ru.Literal(Constant(-1))
-    case DateType => ru.Literal(Constant(-1))
-    case _ => ru.Literal(Constant(null))
-  }
-
-  protected def termForType(dt: DataType) = dt match {
-    case n: AtomicType => n.tag
-    case _ => typeTag[Any]
+    case dt: DecimalType => decimalType
+    case BinaryType => "byte[]"
+    case StringType => stringType
+    case DateType => "Integer"
+    case TimestampType => "java.sql.Timestamp"
+    case _ => "Object"
   }
 
   /**
    * List of data types that have special accessors and setters in [[Row]].
    */
   protected val nativeTypes =
-    Seq(IntegerType, BooleanType, LongType, DoubleType, FloatType, ShortType, ByteType, StringType)
+    Seq(IntegerType, BooleanType, LongType, DoubleType, FloatType, ShortType, ByteType)
 
   /**
    * Returns true if the data type has a special accessor and setter in [[Row]].
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index 840260703ab74..638b53fe0fe2f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -19,15 +19,14 @@ package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.sql.catalyst.expressions._
 
+// MutableProjection is not accessible in Java
+abstract class BaseMutableProjection extends MutableProjection {}
+
 /**
  * Generates byte code that produces a [[MutableRow]] object that can update itself based on a new
  * input [[Row]] for a fixed set of [[Expression Expressions]].
  */
 object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => MutableProjection] {
-  import scala.reflect.runtime.{universe => ru}
-  import scala.reflect.runtime.universe._
-
-  val mutableRowName = newTermName("mutableRow")
 
   protected def canonicalize(in: Seq[Expression]): Seq[Expression] =
     in.map(ExpressionCanonicalizer.execute)
@@ -36,41 +35,61 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
     in.map(BindReferences.bindReference(_, inputSchema))
 
   protected def create(expressions: Seq[Expression]): (() => MutableProjection) = {
-    val projectionCode = expressions.zipWithIndex.flatMap { case (e, i) =>
-      val evaluationCode = expressionEvaluator(e)
-
-      evaluationCode.code :+
-      q"""
-        if(${evaluationCode.nullTerm})
-          mutableRow.setNullAt($i)
-        else
-          ${setColumn(mutableRowName, e.dataType, i, evaluationCode.primitiveTerm)}
-      """
-    }
+    val ctx = newCodeGenContext()
+    val projectionCode = expressions.zipWithIndex.map { case (e, i) =>
+      val evaluationCode = expressionEvaluator(e, ctx)
+      evaluationCode.code +
+        s"""
+          if(${evaluationCode.nullTerm})
+            mutableRow.setNullAt($i);
+          else
+            ${setColumn("mutableRow", e.dataType, i, evaluationCode.primitiveTerm)};
+        """
+    }.mkString("\n")
+    val code = s"""
+      import org.apache.spark.sql.Row;
+
+      public SpecificProjection generate($exprType[] expr) {
+        return new SpecificProjection(expr);
+      }
+
+      class SpecificProjection extends ${classOf[BaseMutableProjection].getName} {
 
-    val code =
-      q"""
-        () => { new $mutableProjectionType {
+        private $exprType[] expressions = null;
+        private $mutableRowType mutableRow = null;
 
-          private[this] var $mutableRowName: $mutableRowType =
-            new $genericMutableRowType(${expressions.size})
+        public SpecificProjection($exprType[] expr) {
+          expressions = expr;
+          mutableRow = new $genericMutableRowType(${expressions.size});
+        }
 
-          def target(row: $mutableRowType): $mutableProjectionType = {
-            $mutableRowName = row
-            this
-          }
+        public ${classOf[BaseMutableProjection].getName} target($mutableRowType row) {
+          mutableRow = row;
+          return this;
+        }
 
-          /* Provide immutable access to the last projected row. */
-          def currentValue: $rowType = mutableRow
+        /* Provide immutable access to the last projected row. */
+        public Row currentValue() {
+          return mutableRow;
+        }
 
-          def apply(i: $rowType): $rowType = {
-            ..$projectionCode
-            mutableRow
-          }
-        } }
-      """
+        public Object apply(Object _i) {
+          Row i = (Row) _i;
+          $projectionCode
 
-    log.debug(s"code for ${expressions.mkString(",")}:\n$code")
-    toolBox.eval(code).asInstanceOf[() => MutableProjection]
+          return mutableRow;
+        }
+      }
+    """
+
+
+    logDebug(s"code for ${expressions.mkString(",")}:\n$code")
+
+    val c = compile(code)
+    // fetch the only one method `generate(Expression[])`
+    val m = c.getDeclaredMethods()(0)
+    () => {
+      m.invoke(c.newInstance(), ctx.references.toArray).asInstanceOf[BaseMutableProjection]
+    }
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index b129c0d898bb7..0ff840dab393c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -18,18 +18,29 @@
 package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.Logging
+import org.apache.spark.annotation.Private
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{BinaryType, StringType, NumericType}
+import org.apache.spark.sql.types.{BinaryType, NumericType}
+
+/**
+ * Inherits some default implementation for Java from `Ordering[Row]`
+ */
+@Private
+class BaseOrdering extends Ordering[Row] {
+  def compare(a: Row, b: Row): Int = {
+    throw new UnsupportedOperationException
+  }
+}
 
 /**
  * Generates bytecode for an [[Ordering]] of [[Row Rows]] for a given set of
  * [[Expression Expressions]].
  */
 object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] with Logging {
-  import scala.reflect.runtime.{universe => ru}
   import scala.reflect.runtime.universe._
 
- protected def canonicalize(in: Seq[SortOrder]): Seq[SortOrder] =
+  protected def canonicalize(in: Seq[SortOrder]): Seq[SortOrder] =
     in.map(ExpressionCanonicalizer.execute(_).asInstanceOf[SortOrder])
 
   protected def bind(in: Seq[SortOrder], inputSchema: Seq[Attribute]): Seq[SortOrder] =
@@ -38,73 +49,90 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
   protected def create(ordering: Seq[SortOrder]): Ordering[Row] = {
     val a = newTermName("a")
     val b = newTermName("b")
-    val comparisons = ordering.zipWithIndex.map { case (order, i) =>
-      val evalA = expressionEvaluator(order.child)
-      val evalB = expressionEvaluator(order.child)
+    val ctx = newCodeGenContext()
 
+    val comparisons = ordering.zipWithIndex.map { case (order, i) =>
+      val evalA = expressionEvaluator(order.child, ctx)
+      val evalB = expressionEvaluator(order.child, ctx)
+      val asc = order.direction == Ascending
       val compare = order.child.dataType match {
         case BinaryType =>
-          q"""
-          val x = ${if (order.direction == Ascending) evalA.primitiveTerm else evalB.primitiveTerm}
-          val y = ${if (order.direction != Ascending) evalB.primitiveTerm else evalA.primitiveTerm}
-          var i = 0
-          while (i < x.length && i < y.length) {
-            val res = x(i).compareTo(y(i))
-            if (res != 0) return res
-            i = i+1
-          }
-          return x.length - y.length
-          """
+          s"""
+            {
+              byte[] x = ${if (asc) evalA.primitiveTerm else evalB.primitiveTerm};
+              byte[] y = ${if (!asc) evalB.primitiveTerm else evalA.primitiveTerm};
+              int j = 0;
+              while (j < x.length && j < y.length) {
+                if (x[j] != y[j]) return x[j] - y[j];
+                j = j + 1;
+              }
+              int d = x.length - y.length;
+              if (d != 0) {
+                return d;
+              }
+            }"""
         case _: NumericType =>
-          q"""
-          val comp = ${evalA.primitiveTerm} - ${evalB.primitiveTerm}
-          if(comp != 0) {
-            return ${if (order.direction == Ascending) q"comp.toInt" else q"-comp.toInt"}
-          }
-          """
-        case StringType =>
-          if (order.direction == Ascending) {
-            q"""return ${evalA.primitiveTerm}.compare(${evalB.primitiveTerm})"""
+          s"""
+            if (${evalA.primitiveTerm} != ${evalB.primitiveTerm}) {
+              if (${evalA.primitiveTerm} > ${evalB.primitiveTerm}) {
+                return ${if (asc) "1" else "-1"};
+              } else {
+                return ${if (asc) "-1" else "1"};
+              }
+            }"""
+        case _ =>
+          s"""
+            int comp = ${evalA.primitiveTerm}.compare(${evalB.primitiveTerm});
+            if (comp != 0) {
+              return ${if (asc) "comp" else "-comp"};
+            }"""
+      }
+
+      s"""
+          i = $a;
+          ${evalA.code}
+          i = $b;
+          ${evalB.code}
+          if (${evalA.nullTerm} && ${evalB.nullTerm}) {
+            // Nothing
+          } else if (${evalA.nullTerm}) {
+            return ${if (order.direction == Ascending) "-1" else "1"};
+          } else if (${evalB.nullTerm}) {
+            return ${if (order.direction == Ascending) "1" else "-1"};
           } else {
-            q"""return ${evalB.primitiveTerm}.compare(${evalA.primitiveTerm})"""
+            $compare
           }
+      """
+    }.mkString("\n")
+
+    val code = s"""
+      import org.apache.spark.sql.Row;
+
+      public SpecificOrdering generate($exprType[] expr) {
+        return new SpecificOrdering(expr);
       }
 
-      q"""
-        i = $a
-        ..${evalA.code}
-        i = $b
-        ..${evalB.code}
-        if (${evalA.nullTerm} && ${evalB.nullTerm}) {
-          // Nothing
-        } else if (${evalA.nullTerm}) {
-          return ${if (order.direction == Ascending) q"-1" else q"1"}
-        } else if (${evalB.nullTerm}) {
-          return ${if (order.direction == Ascending) q"1" else q"-1"}
-        } else {
-          $compare
+      class SpecificOrdering extends ${typeOf[BaseOrdering]} {
+
+        private $exprType[] expressions = null;
+
+        public SpecificOrdering($exprType[] expr) {
+          expressions = expr;
         }
-      """
-    }
 
-    val q"class $orderingName extends $orderingType { ..$body }" = reify {
-      class SpecificOrdering extends Ordering[Row] {
-        val o = ordering
-      }
-    }.tree.children.head
-
-    val code = q"""
-      class $orderingName extends $orderingType {
-        ..$body
-        def compare(a: $rowType, b: $rowType): Int = {
-          var i: $rowType = null // Holds current row being evaluated.
-          ..$comparisons
-          return 0
+        @Override
+        public int compare(Row a, Row b) {
+          Row i = null;  // Holds current row being evaluated.
+          $comparisons
+          return 0;
         }
-      }
-      new $orderingName()
-      """
+      }"""
+
     logDebug(s"Generated Ordering: $code")
-    toolBox.eval(code).asInstanceOf[Ordering[Row]]
+
+    val c = compile(code)
+    // fetch the only one method `generate(Expression[])`
+    val m = c.getDeclaredMethods()(0)
+    m.invoke(c.newInstance(), ctx.references.toArray).asInstanceOf[BaseOrdering]
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
index 40e163024360e..fb18769f00da3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
@@ -19,12 +19,17 @@ package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.sql.catalyst.expressions._
 
+/**
+ * Interface for generated predicate
+ */
+abstract class Predicate {
+  def eval(r: Row): Boolean
+}
+
 /**
  * Generates bytecode that evaluates a boolean [[Expression]] on a given input [[Row]].
  */
 object GeneratePredicate extends CodeGenerator[Expression, (Row) => Boolean] {
-  import scala.reflect.runtime.{universe => ru}
-  import scala.reflect.runtime.universe._
 
   protected def canonicalize(in: Expression): Expression = ExpressionCanonicalizer.execute(in)
 
@@ -32,17 +37,34 @@ object GeneratePredicate extends CodeGenerator[Expression, (Row) => Boolean] {
     BindReferences.bindReference(in, inputSchema)
 
   protected def create(predicate: Expression): ((Row) => Boolean) = {
-    val cEval = expressionEvaluator(predicate)
+    val ctx = newCodeGenContext()
+    val eval = expressionEvaluator(predicate, ctx)
+    val code = s"""
+      import org.apache.spark.sql.Row;
 
-    val code =
-      q"""
-        (i: $rowType) => {
-          ..${cEval.code}
-          if (${cEval.nullTerm}) false else ${cEval.primitiveTerm}
+      public SpecificPredicate generate($exprType[] expr) {
+        return new SpecificPredicate(expr);
+      }
+
+      class SpecificPredicate extends ${classOf[Predicate].getName} {
+        private final $exprType[] expressions;
+        public SpecificPredicate($exprType[] expr) {
+          expressions = expr;
+        }
+
+        @Override
+        public boolean eval(Row i) {
+          ${eval.code}
+          return !${eval.nullTerm} && ${eval.primitiveTerm};
         }
-      """
+      }"""
+
+    logDebug(s"Generated predicate '$predicate':\n$code")
 
-    log.debug(s"Generated predicate '$predicate':\n$code")
-    toolBox.eval(code).asInstanceOf[Row => Boolean]
+    val c = compile(code)
+    // fetch the only one method `generate(Expression[])`
+    val m = c.getDeclaredMethods()(0)
+    val p = m.invoke(c.newInstance(), ctx.references.toArray).asInstanceOf[Predicate]
+    (r: Row) => p.eval(r)
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 31c63a79ebc8c..d5be1fc12e0f0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -17,9 +17,14 @@
 
 package org.apache.spark.sql.catalyst.expressions.codegen
 
+import org.apache.spark.sql.BaseMutableRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
 
+/**
+ * Java can not access Projection (in package object)
+ */
+abstract class BaseProject extends Projection {}
 
 /**
  * Generates bytecode that produces a new [[Row]] object based on a fixed set of input
@@ -27,7 +32,6 @@ import org.apache.spark.sql.types._
  * generated based on the output types of the [[Expression]] to avoid boxing of primitive values.
  */
 object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
-  import scala.reflect.runtime.{universe => ru}
   import scala.reflect.runtime.universe._
 
   protected def canonicalize(in: Seq[Expression]): Seq[Expression] =
@@ -38,201 +42,183 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
   // Make Mutablility optional...
   protected def create(expressions: Seq[Expression]): Projection = {
-    val tupleLength = ru.Literal(Constant(expressions.length))
-    val lengthDef = q"final val length = $tupleLength"
-
-    /* TODO: Configurable...
-    val nullFunctions =
-      q"""
-        private final val nullSet = new org.apache.spark.util.collection.BitSet(length)
-        final def setNullAt(i: Int) = nullSet.set(i)
-        final def isNullAt(i: Int) = nullSet.get(i)
-      """
-     */
-
-    val nullFunctions =
-      q"""
-        private[this] var nullBits = new Array[Boolean](${expressions.size})
-        override def setNullAt(i: Int) = { nullBits(i) = true }
-        override def isNullAt(i: Int) = nullBits(i)
-      """.children
-
-    val tupleElements = expressions.zipWithIndex.flatMap {
+    val ctx = newCodeGenContext()
+    val columns = expressions.zipWithIndex.map {
       case (e, i) =>
-        val elementName = newTermName(s"c$i")
-        val evaluatedExpression = expressionEvaluator(e)
-        val iLit = ru.Literal(Constant(i))
+        s"private ${primitiveForType(e.dataType)} c$i = ${defaultPrimitive(e.dataType)};\n"
+    }.mkString("\n      ")
 
-        q"""
-        var ${newTermName(s"c$i")}: ${termForType(e.dataType)} = _
+    val initColumns = expressions.zipWithIndex.map {
+      case (e, i) =>
+        val eval = expressionEvaluator(e, ctx)
+        s"""
         {
-          ..${evaluatedExpression.code}
-          if(${evaluatedExpression.nullTerm})
-            setNullAt($iLit)
-          else {
-            nullBits($iLit) = false
-            $elementName = ${evaluatedExpression.primitiveTerm}
+          // column$i
+          ${eval.code}
+          nullBits[$i] = ${eval.nullTerm};
+          if(!${eval.nullTerm}) {
+            c$i = ${eval.primitiveTerm};
           }
         }
-        """.children : Seq[Tree]
-    }
+        """
+    }.mkString("\n")
 
-    val accessorFailure = q"""scala.sys.error("Invalid ordinal:" + i)"""
-    val applyFunction = {
-      val cases = (0 until expressions.size).map { i =>
-        val ordinal = ru.Literal(Constant(i))
-        val elementName = newTermName(s"c$i")
-        val iLit = ru.Literal(Constant(i))
+    val getCases = (0 until expressions.size).map { i =>
+      s"case $i: return c$i;"
+    }.mkString("\n        ")
 
-        q"if(i == $ordinal) { if(isNullAt($i)) return null else return $elementName }"
-      }
-      q"override def apply(i: Int): Any = { ..$cases; $accessorFailure }"
-    }
-
-    val updateFunction = {
-      val cases = expressions.zipWithIndex.map {case (e, i) =>
-        val ordinal = ru.Literal(Constant(i))
-        val elementName = newTermName(s"c$i")
-        val iLit = ru.Literal(Constant(i))
-
-        q"""
-          if(i == $ordinal) {
-            if(value == null) {
-              setNullAt(i)
-            } else {
-              nullBits(i) = false
-              $elementName = value.asInstanceOf[${termForType(e.dataType)}]
-            }
-            return
-          }"""
-      }
-      q"override def update(i: Int, value: Any): Unit = { ..$cases; $accessorFailure }"
-    }
+    val updateCases = expressions.zipWithIndex.map { case (e, i) =>
+      s"case $i: { c$i = (${termForType(e.dataType)})value; return;}"
+    }.mkString("\n        ")
 
     val specificAccessorFunctions = nativeTypes.map { dataType =>
-      val ifStatements = expressions.zipWithIndex.flatMap {
-        // getString() is not used by expressions
-        case (e, i) if e.dataType == dataType && dataType != StringType =>
-          val elementName = newTermName(s"c$i")
-          // TODO: The string of ifs gets pretty inefficient as the row grows in size.
-          // TODO: Optional null checks?
-          q"if(i == $i) return $elementName" :: Nil
-        case _ => Nil
-      }
-      dataType match {
-        // Row() need this interface to compile
-        case StringType =>
-          q"""
-          override def getString(i: Int): String = {
-            $accessorFailure
-          }"""
-        case other =>
-          q"""
-          override def ${accessorForType(dataType)}(i: Int): ${termForType(dataType)} = {
-            ..$ifStatements;
-            $accessorFailure
-          }"""
+      val cases = expressions.zipWithIndex.map {
+        case (e, i) if e.dataType == dataType =>
+          s"case $i: return c$i;"
+        case _ => ""
+      }.mkString("\n        ")
+      if (cases.count(_ != '\n') > 0) {
+        s"""
+      @Override
+      public ${primitiveForType(dataType)} ${accessorForType(dataType)}(int i) {
+        if (isNullAt(i)) {
+          return ${defaultPrimitive(dataType)};
+        }
+        switch (i) {
+        $cases
+        }
+        return ${defaultPrimitive(dataType)};
+      }"""
+      } else {
+        ""
       }
-    }
+    }.mkString("\n")
 
     val specificMutatorFunctions = nativeTypes.map { dataType =>
-      val ifStatements = expressions.zipWithIndex.flatMap {
-        // setString() is not used by expressions
-        case (e, i) if e.dataType == dataType && dataType != StringType =>
-          val elementName = newTermName(s"c$i")
-          // TODO: The string of ifs gets pretty inefficient as the row grows in size.
-          // TODO: Optional null checks?
-          q"if(i == $i) { nullBits($i) = false; $elementName = value; return }" :: Nil
-        case _ => Nil
-      }
-      dataType match {
-        case StringType =>
-          // MutableRow() need this interface to compile
-          q"""
-          override def setString(i: Int, value: String) {
-            $accessorFailure
-          }"""
-        case other =>
-          q"""
-          override def ${mutatorForType(dataType)}(i: Int, value: ${termForType(dataType)}) {
-            ..$ifStatements;
-            $accessorFailure
-          }"""
+      val cases = expressions.zipWithIndex.map {
+        case (e, i) if e.dataType == dataType =>
+          s"case $i: { c$i = value; return; }"
+        case _ => ""
+      }.mkString("\n")
+      if (cases.count(_ != '\n') > 0) {
+        s"""
+      @Override
+      public void ${mutatorForType(dataType)}(int i, ${primitiveForType(dataType)} value) {
+        nullBits[i] = false;
+        switch (i) {
+        $cases
+        }
+      }"""
+      } else {
+        ""
       }
-    }
+    }.mkString("\n")
 
     val hashValues = expressions.zipWithIndex.map { case (e, i) =>
-      val elementName = newTermName(s"c$i")
+      val col = newTermName(s"c$i")
       val nonNull = e.dataType match {
-        case BooleanType => q"if ($elementName) 0 else 1"
-        case ByteType | ShortType | IntegerType => q"$elementName.toInt"
-        case LongType => q"($elementName ^ ($elementName >>> 32)).toInt"
-        case FloatType => q"java.lang.Float.floatToIntBits($elementName)"
+        case BooleanType => s"$col ? 0 : 1"
+        case ByteType | ShortType | IntegerType | DateType => s"$col"
+        case LongType => s"$col ^ ($col >>> 32)"
+        case FloatType => s"Float.floatToIntBits($col)"
         case DoubleType =>
-          q"{ val b = java.lang.Double.doubleToLongBits($elementName); (b ^ (b >>>32)).toInt }"
-        case _ => q"$elementName.hashCode"
+          s"Double.doubleToLongBits($col) ^ (Double.doubleToLongBits($col) >>> 32)"
+        case _ => s"$col.hashCode()"
       }
-      q"if (isNullAt($i)) 0 else $nonNull"
+      s"isNullAt($i) ? 0 : ($nonNull)"
     }
 
-    val hashUpdates: Seq[Tree] = hashValues.map(v => q"""result = 37 * result + $v""": Tree)
+    val hashUpdates: String = hashValues.map( v =>
+      s"""
+        result *= 37; result += $v;"""
+    ).mkString("\n")
 
-    val hashCodeFunction =
-      q"""
-        override def hashCode(): Int = {
-          var result: Int = 37
-          ..$hashUpdates
-          result
-        }
+    val columnChecks = expressions.zipWithIndex.map { case (e, i) =>
+      s"""
+          if (isNullAt($i) != row.isNullAt($i) || !isNullAt($i) && !get($i).equals(row.get($i))) {
+            return false;
+          }
       """
+    }.mkString("\n")
 
-    val columnChecks = (0 until expressions.size).map { i =>
-      val elementName = newTermName(s"c$i")
-      q"if (this.$elementName != specificType.$elementName) return false"
+    val code = s"""
+    import org.apache.spark.sql.Row;
+
+    public SpecificProjection generate($exprType[] expr) {
+      return new SpecificProjection(expr);
     }
 
-    val equalsFunction =
-      q"""
-        override def equals(other: Any): Boolean = other match {
-          case specificType: SpecificRow =>
-            ..$columnChecks
-            return true
-          case other => super.equals(other)
-        }
-      """
+    class SpecificProjection extends ${typeOf[BaseProject]} {
+      private $exprType[] expressions = null;
+
+      public SpecificProjection($exprType[] expr) {
+        expressions = expr;
+      }
 
-    val allColumns = (0 until expressions.size).map { i =>
-      val iLit = ru.Literal(Constant(i))
-      q"if(isNullAt($iLit)) { null } else { ${newTermName(s"c$i")} }"
+      @Override
+      public Object apply(Object r) {
+        return new SpecificRow(expressions, (Row) r);
+      }
     }
 
-    val copyFunction =
-      q"override def copy() = new $genericRowType(Array[Any](..$allColumns))"
-
-    val toSeqFunction =
-      q"override def toSeq: Seq[Any] = Seq(..$allColumns)"
-
-    val classBody =
-      nullFunctions ++ (
-        lengthDef +:
-        applyFunction +:
-        updateFunction +:
-        equalsFunction +:
-        hashCodeFunction +:
-        copyFunction +:
-        toSeqFunction +:
-        (tupleElements ++ specificAccessorFunctions ++ specificMutatorFunctions))
-
-    val code = q"""
-      final class SpecificRow(i: $rowType) extends $mutableRowType {
-        ..$classBody
+    final class SpecificRow extends ${typeOf[BaseMutableRow]} {
+
+      $columns
+
+      public SpecificRow($exprType[] expressions, Row i) {
+        $initColumns
+      }
+
+      public int size() { return ${expressions.length};}
+      private boolean[] nullBits = new boolean[${expressions.length}];
+      public void setNullAt(int i) { nullBits[i] = true; }
+      public boolean isNullAt(int i) { return nullBits[i]; }
+
+      public Object get(int i) {
+        if (isNullAt(i)) return null;
+        switch (i) {
+        $getCases
+        }
+        return null;
+      }
+      public void update(int i, Object value) {
+        if (value == null) {
+          setNullAt(i);
+          return;
+        }
+        nullBits[i] = false;
+        switch (i) {
+        $updateCases
+        }
+      }
+      $specificAccessorFunctions
+      $specificMutatorFunctions
+
+      @Override
+      public int hashCode() {
+        int result = 37;
+        $hashUpdates
+        return result;
       }
 
-      new $projectionType { def apply(r: $rowType) = new SpecificRow(r) }
+      @Override
+      public boolean equals(Object other) {
+        if (other instanceof Row) {
+          Row row = (Row) other;
+          if (row.length() != size()) return false;
+          $columnChecks
+          return true;
+        }
+        return super.equals(other);
+      }
+    }
     """
 
-    log.debug(
-      s"MutableRow, initExprs: ${expressions.mkString(",")} code:\n${toolBox.typeCheck(code)}")
-    toolBox.eval(code).asInstanceOf[Projection]
+    logDebug(s"MutableRow, initExprs: ${expressions.mkString(",")} code:\n${code}")
+
+    val c = compile(code)
+    // fetch the only one method `generate(Expression[])`
+    val m = c.getDeclaredMethods()(0)
+    m.invoke(c.newInstance(), ctx.references.toArray).asInstanceOf[Projection]
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
index 528e38a50a740..7f1b12cdd5800 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
@@ -27,12 +27,6 @@ import org.apache.spark.util.Utils
  */
 package object codegen {
 
-  /**
-   * A lock to protect invoking the scala compiler at runtime, since it is not thread safe in Scala
-   * 2.10.
-   */
-  protected[codegen] val globalLock = org.apache.spark.sql.catalyst.ScalaReflectionLock
-
   /** Canonicalizes an expression so those that differ only by names can reuse the same code. */
   object ExpressionCanonicalizer extends rules.RuleExecutor[Expression] {
     val batches =
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index b6927485f42bf..5df528770ca6e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -344,7 +344,7 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
     checkEvaluation("abdef" cast TimestampType, null)
     checkEvaluation("12.65" cast DecimalType.Unlimited, Decimal(12.65))
 
-    checkEvaluation(Literal(1) cast LongType, 1)
+    checkEvaluation(Literal(1) cast LongType, 1.toLong)
     checkEvaluation(Cast(Literal(1000) cast TimestampType, LongType), 1.toLong)
     checkEvaluation(Cast(Literal(-1200) cast TimestampType, LongType), -2.toLong)
     checkEvaluation(Cast(Literal(1.toDouble) cast TimestampType, DoubleType), 1.toDouble)
@@ -363,13 +363,16 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
     checkEvaluation(Cast("abdef" cast BinaryType, StringType), "abdef")
 
     checkEvaluation(Cast(Cast(Cast(Cast(
-      Cast("5" cast ByteType, ShortType), IntegerType), FloatType), DoubleType), LongType), 5)
+      Cast("5" cast ByteType, ShortType), IntegerType), FloatType), DoubleType), LongType),
+      5.toLong)
     checkEvaluation(Cast(Cast(Cast(Cast(Cast("5" cast
-      ByteType, TimestampType), DecimalType.Unlimited), LongType), StringType), ShortType), 0)
+      ByteType, TimestampType), DecimalType.Unlimited), LongType), StringType), ShortType),
+      0.toShort)
     checkEvaluation(Cast(Cast(Cast(Cast(Cast("5" cast
       TimestampType, ByteType), DecimalType.Unlimited), LongType), StringType), ShortType), null)
     checkEvaluation(Cast(Cast(Cast(Cast(Cast("5" cast
-      DecimalType.Unlimited, ByteType), TimestampType), LongType), StringType), ShortType), 0)
+      DecimalType.Unlimited, ByteType), TimestampType), LongType), StringType), ShortType),
+      0.toShort)
     checkEvaluation(Literal(true) cast IntegerType, 1)
     checkEvaluation(Literal(false) cast IntegerType, 0)
     checkEvaluation(Literal(true) cast StringType, "true")
@@ -509,9 +512,9 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
     val seconds = millis * 1000 + 2
     val ts = new Timestamp(millis)
     val tss = new Timestamp(seconds)
-    checkEvaluation(Cast(ts, ShortType), 15)
+    checkEvaluation(Cast(ts, ShortType), 15.toShort)
     checkEvaluation(Cast(ts, IntegerType), 15)
-    checkEvaluation(Cast(ts, LongType), 15)
+    checkEvaluation(Cast(ts, LongType), 15.toLong)
     checkEvaluation(Cast(ts, FloatType), 15.002f)
     checkEvaluation(Cast(ts, DoubleType), 15.002)
     checkEvaluation(Cast(Cast(tss, ShortType), TimestampType), ts)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
index d7c437095e395..8cfd853afa35f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
@@ -32,11 +32,12 @@ class GeneratedEvaluationSuite extends ExpressionEvaluationSuite {
       GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
     } catch {
       case e: Throwable =>
-        val evaluated = GenerateProjection.expressionEvaluator(expression)
+        val ctx = GenerateProjection.newCodeGenContext()
+        val evaluated = GenerateProjection.expressionEvaluator(expression, ctx)
         fail(
           s"""
             |Code generation of $expression failed:
-            |${evaluated.code.mkString("\n")}
+            |${evaluated.code}
             |$e
           """.stripMargin)
     }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala
index a40324b008e16..9ab1f7d7ad0db 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala
@@ -28,7 +28,8 @@ class GeneratedMutableEvaluationSuite extends ExpressionEvaluationSuite {
       expression: Expression,
       expected: Any,
       inputRow: Row = EmptyRow): Unit = {
-    lazy val evaluated = GenerateProjection.expressionEvaluator(expression)
+    val ctx = GenerateProjection.newCodeGenContext()
+    lazy val evaluated = GenerateProjection.expressionEvaluator(expression, ctx)
 
     val plan = try {
       GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
@@ -37,7 +38,7 @@ class GeneratedMutableEvaluationSuite extends ExpressionEvaluationSuite {
         fail(
           s"""
             |Code generation of $expression failed:
-            |${evaluated.code.mkString("\n")}
+            |${evaluated.code}
             |$e
           """.stripMargin)
     }
@@ -49,7 +50,7 @@ class GeneratedMutableEvaluationSuite extends ExpressionEvaluationSuite {
         s"""
           |Mismatched hashCodes for values: $actual, $expectedRow
           |Hash Codes: ${actual.hashCode()} != ${expectedRow.hashCode()}
-          |${evaluated.code.mkString("\n")}
+          |${evaluated.code}
         """.stripMargin)
     }
     if (actual != expectedRow) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 9aaec2b064d76..b41b1b77d049e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -451,10 +451,13 @@ class DataFrameSuite extends QueryTest {
   test("SPARK-6899") {
     val originalValue = TestSQLContext.conf.codegenEnabled
     TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, "true")
-    checkAnswer(
-      decimalData.agg(avg('a)),
-      Row(new java.math.BigDecimal(2.0)))
-    TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
+    try{
+      checkAnswer(
+        decimalData.agg(avg('a)),
+        Row(new java.math.BigDecimal(2.0)))
+    } finally {
+      TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
+    }
   }
 
   test("SPARK-7133: Implement struct, array, and map field accessor") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 63f7d314fb699..55b68d8e2283c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -184,77 +184,79 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       checkAnswer(df, expectedResults)
     }
 
-    // Just to group rows.
-    testCodeGen(
-      "SELECT key FROM testData3x GROUP BY key",
-      (1 to 100).map(Row(_)))
-    // COUNT
-    testCodeGen(
-      "SELECT key, count(value) FROM testData3x GROUP BY key",
-      (1 to 100).map(i => Row(i, 3)))
-    testCodeGen(
-      "SELECT count(key) FROM testData3x",
-      Row(300) :: Nil)
-    // COUNT DISTINCT ON int
-    testCodeGen(
-      "SELECT value, count(distinct key) FROM testData3x GROUP BY value",
-      (1 to 100).map(i => Row(i.toString, 1)))
-    testCodeGen(
-      "SELECT count(distinct key) FROM testData3x",
-      Row(100) :: Nil)
-    // SUM
-    testCodeGen(
-      "SELECT value, sum(key) FROM testData3x GROUP BY value",
-      (1 to 100).map(i => Row(i.toString, 3 * i)))
-    testCodeGen(
-      "SELECT sum(key), SUM(CAST(key as Double)) FROM testData3x",
-      Row(5050 * 3, 5050 * 3.0) :: Nil)
-    // AVERAGE
-    testCodeGen(
-      "SELECT value, avg(key) FROM testData3x GROUP BY value",
-      (1 to 100).map(i => Row(i.toString, i)))
-    testCodeGen(
-      "SELECT avg(key) FROM testData3x",
-      Row(50.5) :: Nil)
-    // MAX
-    testCodeGen(
-      "SELECT value, max(key) FROM testData3x GROUP BY value",
-      (1 to 100).map(i => Row(i.toString, i)))
-    testCodeGen(
-      "SELECT max(key) FROM testData3x",
-      Row(100) :: Nil)
-    // MIN
-    testCodeGen(
-      "SELECT value, min(key) FROM testData3x GROUP BY value",
-      (1 to 100).map(i => Row(i.toString, i)))
-    testCodeGen(
-      "SELECT min(key) FROM testData3x",
-      Row(1) :: Nil)
-    // Some combinations.
-    testCodeGen(
-      """
-        |SELECT
-        |  value,
-        |  sum(key),
-        |  max(key),
-        |  min(key),
-        |  avg(key),
-        |  count(key),
-        |  count(distinct key)
-        |FROM testData3x
-        |GROUP BY value
-      """.stripMargin,
-      (1 to 100).map(i => Row(i.toString, i*3, i, i, i, 3, 1)))
-    testCodeGen(
-      "SELECT max(key), min(key), avg(key), count(key), count(distinct key) FROM testData3x",
-      Row(100, 1, 50.5, 300, 100) :: Nil)
-    // Aggregate with Code generation handling all null values
-    testCodeGen(
-      "SELECT  sum('a'), avg('a'), count(null) FROM testData",
-      Row(0, null, 0) :: Nil)
-
-    dropTempTable("testData3x")
-    setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
+    try {
+      // Just to group rows.
+      testCodeGen(
+        "SELECT key FROM testData3x GROUP BY key",
+        (1 to 100).map(Row(_)))
+      // COUNT
+      testCodeGen(
+        "SELECT key, count(value) FROM testData3x GROUP BY key",
+        (1 to 100).map(i => Row(i, 3)))
+      testCodeGen(
+        "SELECT count(key) FROM testData3x",
+        Row(300) :: Nil)
+      // COUNT DISTINCT ON int
+      testCodeGen(
+        "SELECT value, count(distinct key) FROM testData3x GROUP BY value",
+        (1 to 100).map(i => Row(i.toString, 1)))
+      testCodeGen(
+        "SELECT count(distinct key) FROM testData3x",
+        Row(100) :: Nil)
+      // SUM
+      testCodeGen(
+        "SELECT value, sum(key) FROM testData3x GROUP BY value",
+        (1 to 100).map(i => Row(i.toString, 3 * i)))
+      testCodeGen(
+        "SELECT sum(key), SUM(CAST(key as Double)) FROM testData3x",
+        Row(5050 * 3, 5050 * 3.0) :: Nil)
+      // AVERAGE
+      testCodeGen(
+        "SELECT value, avg(key) FROM testData3x GROUP BY value",
+        (1 to 100).map(i => Row(i.toString, i)))
+      testCodeGen(
+        "SELECT avg(key) FROM testData3x",
+        Row(50.5) :: Nil)
+      // MAX
+      testCodeGen(
+        "SELECT value, max(key) FROM testData3x GROUP BY value",
+        (1 to 100).map(i => Row(i.toString, i)))
+      testCodeGen(
+        "SELECT max(key) FROM testData3x",
+        Row(100) :: Nil)
+      // MIN
+      testCodeGen(
+        "SELECT value, min(key) FROM testData3x GROUP BY value",
+        (1 to 100).map(i => Row(i.toString, i)))
+      testCodeGen(
+        "SELECT min(key) FROM testData3x",
+        Row(1) :: Nil)
+      // Some combinations.
+      testCodeGen(
+        """
+          |SELECT
+          |  value,
+          |  sum(key),
+          |  max(key),
+          |  min(key),
+          |  avg(key),
+          |  count(key),
+          |  count(distinct key)
+          |FROM testData3x
+          |GROUP BY value
+        """.stripMargin,
+        (1 to 100).map(i => Row(i.toString, i*3, i, i, i, 3, 1)))
+      testCodeGen(
+        "SELECT max(key), min(key), avg(key), count(key), count(distinct key) FROM testData3x",
+        Row(100, 1, 50.5, 300, 100) :: Nil)
+      // Aggregate with Code generation handling all null values
+      testCodeGen(
+        "SELECT  sum('a'), avg('a'), count(null) FROM testData",
+        Row(0, null, 0) :: Nil)
+    } finally {
+      dropTempTable("testData3x")
+      setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
+    }
   }
 
   test("Add Parser of SQL COALESCE()") {
@@ -463,9 +465,12 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     val codegenbefore = conf.codegenEnabled
     setConf(SQLConf.EXTERNAL_SORT, "false")
     setConf(SQLConf.CODEGEN_ENABLED, "true")
-    sortTest()
-    setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
-    setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
+    try{
+      sortTest()
+    } finally {
+      setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
+      setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
+    }
   }
 
   test("SPARK-6927 external sorting with codegen on") {
@@ -473,9 +478,12 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     val codegenbefore = conf.codegenEnabled
     setConf(SQLConf.CODEGEN_ENABLED, "true")
     setConf(SQLConf.EXTERNAL_SORT, "true")
-    sortTest()
-    setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
-    setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
+    try {
+      sortTest()
+    } finally {
+      setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
+      setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
+    }
   }
 
   test("limit") {

From df7da07a86a30c684d5b07d955f1045a66715e3a Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Thu, 4 Jun 2015 11:30:07 -0700
Subject: [PATCH 360/525] [SPARK-7969] [SQL] Added a DataFrame.drop function
 that accepts a Column reference.

Added a `DataFrame.drop` function that accepts a `Column` reference rather than a `String`, and added associated unit tests.  Basically iterates through the `DataFrame` to find a column with an expression that is equivalent to that of the `Column` argument supplied to the function.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6585 from dusenberrymw/SPARK-7969_Drop_method_on_Dataframes_should_handle_Column and squashes the following commits:

514727a [Mike Dusenberry] Updating the @since tag of the drop(Column) function doc to reflect version 1.4.1 instead of 1.4.0.
2f1bb4e [Mike Dusenberry] Adding an additional assert statement to the 'drop column after join' unit test in order to make sure the correct column was indeed left over.
6bf7c0e [Mike Dusenberry] Minor code formatting change.
e583888 [Mike Dusenberry] Adding more Python doctests for the df.drop with column reference function to test joined datasets that have columns with the same name.
5f74401 [Mike Dusenberry] Updating DataFrame.drop with column reference function to use logicalPlan.output to prevent ambiguities resulting from columns with the same name. Also added associated unit tests for joined datasets with duplicate column names.
4b8bbe8 [Mike Dusenberry] Adding Python support for Dataframe.drop with a Column reference.
986129c [Mike Dusenberry] Added a DataFrame.drop function that accepts a Column reference rather than a String, and added associated unit tests.  Basically iterates through the DataFrame to find a column with an expression that is equivalent to one supplied to the function.
---
 python/pyspark/sql/dataframe.py               | 21 +++++++--
 .../org/apache/spark/sql/DataFrame.scala      | 16 +++++++
 .../org/apache/spark/sql/DataFrameSuite.scala | 45 +++++++++++++++++++
 3 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 7673153abe0e2..03b01a1136e45 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1189,15 +1189,30 @@ def withColumnRenamed(self, existing, new):
 
     @since(1.4)
     @ignore_unicode_prefix
-    def drop(self, colName):
+    def drop(self, col):
         """Returns a new :class:`DataFrame` that drops the specified column.
 
-        :param colName: string, name of the column to drop.
+        :param col: a string name of the column to drop, or a
+            :class:`Column` to drop.
 
         >>> df.drop('age').collect()
         [Row(name=u'Alice'), Row(name=u'Bob')]
+
+        >>> df.drop(df.age).collect()
+        [Row(name=u'Alice'), Row(name=u'Bob')]
+
+        >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
+        [Row(age=5, height=85, name=u'Bob')]
+
+        >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()
+        [Row(age=5, name=u'Bob', height=85)]
         """
-        jdf = self._jdf.drop(colName)
+        if isinstance(col, basestring):
+            jdf = self._jdf.drop(col)
+        elif isinstance(col, Column):
+            jdf = self._jdf.drop(col._jc)
+        else:
+            raise TypeError("col should be a string or a Column")
         return DataFrame(jdf, self.sql_ctx)
 
     @since(1.3)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 034d887901975..d1a54ada7b191 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1082,6 +1082,22 @@ class DataFrame private[sql](
     }
   }
 
+  /**
+   * Returns a new [[DataFrame]] with a column dropped.
+   * This version of drop accepts a Column rather than a name.
+   * This is a no-op if the DataFrame doesn't have a column
+   * with an equivalent expression.
+   * @group dfops
+   * @since 1.4.1
+   */
+  def drop(col: Column): DataFrame = {
+    val attrs = this.logicalPlan.output
+    val colsAfterDrop = attrs.filter { attr =>
+      attr != col.expr
+    }.map(attr => Column(attr))
+    select(colsAfterDrop : _*)
+  }
+
   /**
    * Returns a new [[DataFrame]] that contains only the unique rows from this [[DataFrame]].
    * This is an alias for `distinct`.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index b41b1b77d049e..8e81dacb8660f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -334,6 +334,51 @@ class DataFrameSuite extends QueryTest {
     assert(df.schema.map(_.name) === Seq("key", "value"))
   }
 
+  test("drop column using drop with column reference") {
+    val col = testData("key")
+    val df = testData.drop(col)
+    checkAnswer(
+      df,
+      testData.collect().map(x => Row(x.getString(1))).toSeq)
+    assert(df.schema.map(_.name) === Seq("value"))
+  }
+
+  test("drop unknown column (no-op) with column reference") {
+    val col = Column("random")
+    val df = testData.drop(col)
+    checkAnswer(
+      df,
+      testData.collect().toSeq)
+    assert(df.schema.map(_.name) === Seq("key", "value"))
+  }
+
+  test("drop unknown column with same name (no-op) with column reference") {
+    val col = Column("key")
+    val df = testData.drop(col)
+    checkAnswer(
+      df,
+      testData.collect().toSeq)
+    assert(df.schema.map(_.name) === Seq("key", "value"))
+  }
+
+  test("drop column after join with duplicate columns using column reference") {
+    val newSalary = salary.withColumnRenamed("personId", "id")
+    val col = newSalary("id")
+    // this join will result in duplicate "id" columns
+    val joinedDf = person.join(newSalary,
+      person("id") === newSalary("id"), "inner")
+    // remove only the "id" column that was associated with newSalary
+    val df = joinedDf.drop(col)
+    checkAnswer(
+      df,
+      joinedDf.collect().map {
+        case Row(id: Int, name: String, age: Int, idToDrop: Int, salary: Double) =>
+          Row(id, name, age, salary)
+      }.toSeq)
+    assert(df.schema.map(_.name) === Seq("id", "name", "age", "salary"))
+    assert(df("id") == person("id"))
+  }
+
   test("withColumnRenamed") {
     val df = testData.toDF().withColumn("newCol", col("key") + 1)
       .withColumnRenamed("value", "valueRenamed")

From cd3176bd86eafa09a5e11baf3636861c1f46e844 Mon Sep 17 00:00:00 2001
From: Thomas Omans <tomans@cj.com>
Date: Thu, 4 Jun 2015 11:32:03 -0700
Subject: [PATCH 361/525] [SPARK-7743] [SQL] Parquet 1.7

Resolves [SPARK-7743](https://issues.apache.org/jira/browse/SPARK-7743).

Trivial changes of versions, package names, as well as a small issue in `ParquetTableOperations.scala`

```diff
-    val readContext = getReadSupport(configuration).init(
+    val readContext = ParquetInputFormat.getReadSupportInstance(configuration).init(
```

Since ParquetInputFormat.getReadSupport was made package private in the latest release.

Thanks
-- Thomas Omans

Author: Thomas Omans <tomans@cj.com>

Closes #6597 from eggsby/SPARK-7743 and squashes the following commits:

2df0d1b [Thomas Omans] [SPARK-7743] [SQL] Upgrading parquet version to 1.7.0
---
 .../src/main/python/parquet_inputformat.py    |  2 +-
 pom.xml                                       |  6 ++--
 sql/core/pom.xml                              |  4 +--
 .../DirectParquetOutputCommitter.scala        |  6 ++--
 .../spark/sql/parquet/ParquetConverter.scala  |  6 ++--
 .../spark/sql/parquet/ParquetFilters.scala    | 10 +++---
 .../spark/sql/parquet/ParquetRelation.scala   | 10 +++---
 .../sql/parquet/ParquetTableOperations.scala  | 34 +++++++++----------
 .../sql/parquet/ParquetTableSupport.scala     | 12 +++----
 .../spark/sql/parquet/ParquetTypes.scala      | 14 ++++----
 .../apache/spark/sql/parquet/newParquet.scala |  8 ++---
 .../sql/parquet/timestamp/NanoTime.scala      |  4 +--
 .../apache/spark/sql/sources/commands.scala   |  2 +-
 sql/core/src/test/resources/log4j.properties  | 10 +++---
 .../sql/parquet/ParquetFilterSuite.scala      |  4 +--
 .../spark/sql/parquet/ParquetIOSuite.scala    | 18 +++++-----
 .../sql/parquet/ParquetSchemaSuite.scala      |  2 +-
 17 files changed, 76 insertions(+), 76 deletions(-)

diff --git a/examples/src/main/python/parquet_inputformat.py b/examples/src/main/python/parquet_inputformat.py
index 96ddac761d698..e1fd85b082c08 100644
--- a/examples/src/main/python/parquet_inputformat.py
+++ b/examples/src/main/python/parquet_inputformat.py
@@ -51,7 +51,7 @@
 
     parquet_rdd = sc.newAPIHadoopFile(
         path,
-        'parquet.avro.AvroParquetInputFormat',
+        'org.apache.parquet.avro.AvroParquetInputFormat',
         'java.lang.Void',
         'org.apache.avro.generic.IndexedRecord',
         valueConverter='org.apache.spark.examples.pythonconverters.IndexedRecordToJavaConverter')
diff --git a/pom.xml b/pom.xml
index bcb6ef96a1206..abb9b55400340 100644
--- a/pom.xml
+++ b/pom.xml
@@ -136,7 +136,7 @@
     <!-- Version used for internal directory structure -->
     <hive.version.short>0.13.1</hive.version.short>
     <derby.version>10.10.1.1</derby.version>
-    <parquet.version>1.6.0rc3</parquet.version>
+    <parquet.version>1.7.0</parquet.version>
     <jblas.version>1.2.4</jblas.version>
     <jetty.version>8.1.14.v20131031</jetty.version>
     <orbit.version>3.0.0.v201112011016</orbit.version>
@@ -1080,13 +1080,13 @@
         </exclusions>
       </dependency>
       <dependency>
-        <groupId>com.twitter</groupId>
+        <groupId>org.apache.parquet</groupId>
         <artifactId>parquet-column</artifactId>
         <version>${parquet.version}</version>
         <scope>${parquet.deps.scope}</scope>
       </dependency>
       <dependency>
-        <groupId>com.twitter</groupId>
+        <groupId>org.apache.parquet</groupId>
         <artifactId>parquet-hadoop</artifactId>
         <version>${parquet.version}</version>
         <scope>${parquet.deps.scope}</scope>
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 3192f81ffaecd..ed75475a87067 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -61,11 +61,11 @@
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-column</artifactId>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-hadoop</artifactId>
     </dependency>
     <dependency>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala
index f5ce2718bec4a..62c4e92ebec68 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala
@@ -21,9 +21,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
-import parquet.Log
-import parquet.hadoop.util.ContextUtil
-import parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}
+import org.apache.parquet.Log
+import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}
 
 private[parquet] class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
   extends ParquetOutputCommitter(outputPath, context) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index caa9f045537d0..85c2ce740fe52 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -23,9 +23,9 @@ import java.util.{TimeZone, Calendar}
 import scala.collection.mutable.{Buffer, ArrayBuffer, HashMap}
 
 import jodd.datetime.JDateTime
-import parquet.column.Dictionary
-import parquet.io.api.{PrimitiveConverter, GroupConverter, Binary, Converter}
-import parquet.schema.MessageType
+import org.apache.parquet.column.Dictionary
+import org.apache.parquet.io.api.{PrimitiveConverter, GroupConverter, Binary, Converter}
+import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.parquet.CatalystConverter.FieldType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
index f0f4e7d147e75..88ae88e9684c8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
@@ -21,11 +21,11 @@ import java.nio.ByteBuffer
 
 import com.google.common.io.BaseEncoding
 import org.apache.hadoop.conf.Configuration
-import parquet.filter2.compat.FilterCompat
-import parquet.filter2.compat.FilterCompat._
-import parquet.filter2.predicate.FilterApi._
-import parquet.filter2.predicate.{FilterApi, FilterPredicate}
-import parquet.io.api.Binary
+import org.apache.parquet.filter2.compat.FilterCompat
+import org.apache.parquet.filter2.compat.FilterCompat._
+import org.apache.parquet.filter2.predicate.FilterApi._
+import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
+import org.apache.parquet.io.api.Binary
 
 import org.apache.spark.SparkEnv
 import org.apache.spark.sql.catalyst.expressions._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
index fcb9513ab66f6..09088ee91106c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -24,9 +24,9 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.fs.permission.FsAction
 import org.apache.spark.sql.types.{StructType, DataType}
-import parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat}
-import parquet.hadoop.metadata.CompressionCodecName
-import parquet.schema.MessageType
+import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat}
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, UnresolvedException}
@@ -107,7 +107,7 @@ private[sql] object ParquetRelation {
     //
     // Therefore we need to force the class to be loaded.
     // This should really be resolved by Parquet.
-    Class.forName(classOf[parquet.Log].getName)
+    Class.forName(classOf[org.apache.parquet.Log].getName)
 
     // Note: Logger.getLogger("parquet") has a default logger
     // that appends to Console which needs to be cleared.
@@ -127,7 +127,7 @@ private[sql] object ParquetRelation {
   type RowType = org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 
   // The compression type
-  type CompressionType = parquet.hadoop.metadata.CompressionCodecName
+  type CompressionType = org.apache.parquet.hadoop.metadata.CompressionCodecName
 
   // The parquet compression short names
   val shortParquetCompressionCodecNames = Map(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index cb7ae246d0d75..1e694f2feabee 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -33,13 +33,13 @@ import org.apache.hadoop.fs.{BlockLocation, FileStatus, Path}
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat}
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, FileOutputFormat => NewFileOutputFormat}
-import parquet.hadoop._
-import parquet.hadoop.api.ReadSupport.ReadContext
-import parquet.hadoop.api.{InitContext, ReadSupport}
-import parquet.hadoop.metadata.GlobalMetaData
-import parquet.hadoop.util.ContextUtil
-import parquet.io.ParquetDecodingException
-import parquet.schema.MessageType
+import org.apache.parquet.hadoop._
+import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
+import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
+import org.apache.parquet.hadoop.metadata.GlobalMetaData
+import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.parquet.io.ParquetDecodingException
+import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
@@ -78,7 +78,7 @@ private[sql] case class ParquetTableScan(
   }.toArray
 
   protected override def doExecute(): RDD[Row] = {
-    import parquet.filter2.compat.FilterCompat.FilterPredicateCompat
+    import org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat
 
     val sc = sqlContext.sparkContext
     val job = new Job(sc.hadoopConfiguration)
@@ -136,7 +136,7 @@ private[sql] case class ParquetTableScan(
       baseRDD.mapPartitionsWithInputSplit { case (split, iter) =>
         val partValue = "([^=]+)=([^=]+)".r
         val partValues =
-          split.asInstanceOf[parquet.hadoop.ParquetInputSplit]
+          split.asInstanceOf[org.apache.parquet.hadoop.ParquetInputSplit]
             .getPath
             .toString
             .split("/")
@@ -378,7 +378,7 @@ private[sql] case class InsertIntoParquetTable(
  * to imported ones.
  */
 private[parquet] class AppendingParquetOutputFormat(offset: Int)
-  extends parquet.hadoop.ParquetOutputFormat[Row] {
+  extends org.apache.parquet.hadoop.ParquetOutputFormat[Row] {
   // override to accept existing directories as valid output directory
   override def checkOutputSpecs(job: JobContext): Unit = {}
   var committer: OutputCommitter = null
@@ -431,7 +431,7 @@ private[parquet] class AppendingParquetOutputFormat(offset: Int)
  * RecordFilter we want to use.
  */
 private[parquet] class FilteringParquetRowInputFormat
-  extends parquet.hadoop.ParquetInputFormat[Row] with Logging {
+  extends org.apache.parquet.hadoop.ParquetInputFormat[Row] with Logging {
 
   private var fileStatuses = Map.empty[Path, FileStatus]
 
@@ -439,7 +439,7 @@ private[parquet] class FilteringParquetRowInputFormat
       inputSplit: InputSplit,
       taskAttemptContext: TaskAttemptContext): RecordReader[Void, Row] = {
 
-    import parquet.filter2.compat.FilterCompat.NoOpFilter
+    import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter
 
     val readSupport: ReadSupport[Row] = new RowReadSupport()
 
@@ -501,7 +501,7 @@ private[parquet] class FilteringParquetRowInputFormat
     globalMetaData = new GlobalMetaData(globalMetaData.getSchema,
       mergedMetadata, globalMetaData.getCreatedBy)
 
-    val readContext = getReadSupport(configuration).init(
+    val readContext = ParquetInputFormat.getReadSupportInstance(configuration).init(
       new InitContext(configuration,
         globalMetaData.getKeyValueMetaData,
         globalMetaData.getSchema))
@@ -531,8 +531,8 @@ private[parquet] class FilteringParquetRowInputFormat
     minSplitSize: JLong,
     readContext: ReadContext): JList[ParquetInputSplit] = {
 
-    import parquet.filter2.compat.FilterCompat.Filter
-    import parquet.filter2.compat.RowGroupFilter
+    import org.apache.parquet.filter2.compat.FilterCompat.Filter
+    import org.apache.parquet.filter2.compat.RowGroupFilter
 
     import org.apache.spark.sql.parquet.FilteringParquetRowInputFormat.blockLocationCache
 
@@ -547,7 +547,7 @@ private[parquet] class FilteringParquetRowInputFormat
     // https://github.com/apache/incubator-parquet-mr/pull/17
     // is resolved
     val generateSplits =
-      Class.forName("parquet.hadoop.ClientSideMetadataSplitStrategy")
+      Class.forName("org.apache.parquet.hadoop.ClientSideMetadataSplitStrategy")
        .getDeclaredMethods.find(_.getName == "generateSplits").getOrElse(
          sys.error(s"Failed to reflectively invoke ClientSideMetadataSplitStrategy.generateSplits"))
     generateSplits.setAccessible(true)
@@ -612,7 +612,7 @@ private[parquet] class FilteringParquetRowInputFormat
     // https://github.com/apache/incubator-parquet-mr/pull/17
     // is resolved
     val generateSplits =
-      Class.forName("parquet.hadoop.TaskSideMetadataSplitStrategy")
+      Class.forName("org.apache.parquet.hadoop.TaskSideMetadataSplitStrategy")
        .getDeclaredMethods.find(_.getName == "generateTaskSideMDSplits").getOrElse(
          sys.error(
            s"Failed to reflectively invoke TaskSideMetadataSplitStrategy.generateTaskSideMDSplits"))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 70a220cc43ab9..89db408b1c382 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -20,12 +20,12 @@ package org.apache.spark.sql.parquet
 import java.util.{HashMap => JHashMap}
 
 import org.apache.hadoop.conf.Configuration
-import parquet.column.ParquetProperties
-import parquet.hadoop.ParquetOutputFormat
-import parquet.hadoop.api.ReadSupport.ReadContext
-import parquet.hadoop.api.{ReadSupport, WriteSupport}
-import parquet.io.api._
-import parquet.schema.MessageType
+import org.apache.parquet.column.ParquetProperties
+import org.apache.parquet.hadoop.ParquetOutputFormat
+import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
+import org.apache.parquet.hadoop.api.{ReadSupport, WriteSupport}
+import org.apache.parquet.io.api._
+import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
index f8a5d84549336..ba2a35b74ef82 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
@@ -25,13 +25,13 @@ import scala.util.Try
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.mapreduce.Job
-import parquet.format.converter.ParquetMetadataConverter
-import parquet.hadoop.metadata.{FileMetaData, ParquetMetadata}
-import parquet.hadoop.util.ContextUtil
-import parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter}
-import parquet.schema.PrimitiveType.{PrimitiveTypeName => ParquetPrimitiveTypeName}
-import parquet.schema.Type.Repetition
-import parquet.schema.{ConversionPatterns, DecimalMetadata, GroupType => ParquetGroupType, MessageType, OriginalType => ParquetOriginalType, PrimitiveType => ParquetPrimitiveType, Type => ParquetType, Types => ParquetTypes}
+import org.apache.parquet.format.converter.ParquetMetadataConverter
+import org.apache.parquet.hadoop.metadata.{FileMetaData, ParquetMetadata}
+import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter}
+import org.apache.parquet.schema.PrimitiveType.{PrimitiveTypeName => ParquetPrimitiveTypeName}
+import org.apache.parquet.schema.Type.Repetition
+import org.apache.parquet.schema.{ConversionPatterns, DecimalMetadata, GroupType => ParquetGroupType, MessageType, OriginalType => ParquetOriginalType, PrimitiveType => ParquetPrimitiveType, Type => ParquetType, Types => ParquetTypes}
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.AnalysisException
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index bf55e2383ab56..5dda440240e60 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -29,10 +29,10 @@ import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
-import parquet.filter2.predicate.FilterApi
-import parquet.hadoop._
-import parquet.hadoop.metadata.CompressionCodecName
-import parquet.hadoop.util.ContextUtil
+import org.apache.parquet.filter2.predicate.FilterApi
+import org.apache.parquet.hadoop._
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.apache.parquet.hadoop.util.ContextUtil
 
 import org.apache.spark.{Partition => SparkPartition, SerializableWritable, Logging, SparkException}
 import org.apache.spark.broadcast.Broadcast
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/timestamp/NanoTime.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/timestamp/NanoTime.scala
index 70bcca7526aae..4d5ed211ad0c0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/timestamp/NanoTime.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/timestamp/NanoTime.scala
@@ -19,8 +19,8 @@ package org.apache.spark.sql.parquet.timestamp
 
 import java.nio.{ByteBuffer, ByteOrder}
 
-import parquet.Preconditions
-import parquet.io.api.{Binary, RecordConsumer}
+import org.apache.parquet.Preconditions
+import org.apache.parquet.io.api.{Binary, RecordConsumer}
 
 private[parquet] class NanoTime extends Serializable {
   private var julianDay = 0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index 71f016b1f14de..e9932c09107db 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -24,7 +24,7 @@ import scala.collection.mutable
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat}
-import parquet.hadoop.util.ContextUtil
+import org.apache.parquet.hadoop.util.ContextUtil
 
 import org.apache.spark._
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
diff --git a/sql/core/src/test/resources/log4j.properties b/sql/core/src/test/resources/log4j.properties
index 28e90b9520b2c..12fb128149d32 100644
--- a/sql/core/src/test/resources/log4j.properties
+++ b/sql/core/src/test/resources/log4j.properties
@@ -36,11 +36,11 @@ log4j.appender.FA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %t %p %c{1}: %m%n
 log4j.appender.FA.Threshold = INFO
 
 # Some packages are noisy for no good reason.
-log4j.additivity.parquet.hadoop.ParquetRecordReader=false
-log4j.logger.parquet.hadoop.ParquetRecordReader=OFF
+log4j.additivity.org.apache.parquet.hadoop.ParquetRecordReader=false
+log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=OFF
 
-log4j.additivity.parquet.hadoop.ParquetOutputCommitter=false
-log4j.logger.parquet.hadoop.ParquetOutputCommitter=OFF
+log4j.additivity.org.apache.parquet.hadoop.ParquetOutputCommitter=false
+log4j.logger.org.apache.parquet.hadoop.ParquetOutputCommitter=OFF
 
 log4j.additivity.org.apache.hadoop.hive.serde2.lazy.LazyStruct=false
 log4j.logger.org.apache.hadoop.hive.serde2.lazy.LazyStruct=OFF
@@ -52,5 +52,5 @@ log4j.additivity.hive.ql.metadata.Hive=false
 log4j.logger.hive.ql.metadata.Hive=OFF
 
 # Parquet related logging
-log4j.logger.parquet.hadoop=WARN
+log4j.logger.org.apache.parquet.hadoop=WARN
 log4j.logger.org.apache.spark.sql.parquet=INFO
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
index bdc2ebabc5e9a..4aa5bcb7fdbca 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
@@ -18,8 +18,8 @@
 package org.apache.spark.sql.parquet
 
 import org.scalatest.BeforeAndAfterAll
-import parquet.filter2.predicate.Operators._
-import parquet.filter2.predicate.{FilterPredicate, Operators}
+import org.apache.parquet.filter2.predicate.Operators._
+import org.apache.parquet.filter2.predicate.{FilterPredicate, Operators}
 
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions._
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index dd48bb350f26d..7f7c2cc1a6c26 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -24,14 +24,14 @@ import scala.reflect.runtime.universe.TypeTag
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.scalatest.BeforeAndAfterAll
-import parquet.example.data.simple.SimpleGroup
-import parquet.example.data.{Group, GroupWriter}
-import parquet.hadoop.api.WriteSupport
-import parquet.hadoop.api.WriteSupport.WriteContext
-import parquet.hadoop.metadata.{ParquetMetadata, FileMetaData, CompressionCodecName}
-import parquet.hadoop.{Footer, ParquetFileWriter, ParquetWriter}
-import parquet.io.api.RecordConsumer
-import parquet.schema.{MessageType, MessageTypeParser}
+import org.apache.parquet.example.data.simple.SimpleGroup
+import org.apache.parquet.example.data.{Group, GroupWriter}
+import org.apache.parquet.hadoop.api.WriteSupport
+import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
+import org.apache.parquet.hadoop.metadata.{ParquetMetadata, FileMetaData, CompressionCodecName}
+import org.apache.parquet.hadoop.{Footer, ParquetFileWriter, ParquetWriter}
+import org.apache.parquet.io.api.RecordConsumer
+import org.apache.parquet.schema.{MessageType, MessageTypeParser}
 
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.expressions.Row
@@ -400,7 +400,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     }
     finally {
       configuration.set("spark.sql.parquet.output.committer.class",
-        "parquet.hadoop.ParquetOutputCommitter")
+        "org.apache.parquet.hadoop.ParquetOutputCommitter")
     }
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index caec2a6f25489..8b1745124b8e1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.parquet
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
 
-import parquet.schema.MessageTypeParser
+import org.apache.parquet.schema.MessageTypeParser
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.ScalaReflection

From 3dc005282a694e105f40e429b28b0a677743341f Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Thu, 4 Jun 2015 12:52:16 -0700
Subject: [PATCH 362/525] [SPARK-8027] [SPARKR] Move man pages creation to
 install-dev.sh

This also helps us get rid of the sparkr-docs maven profile as docs are now built by just using -Psparkr when the roxygen2 package is available

Related to discussion in #6567

cc pwendell srowen -- Let me know if this looks better

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6593 from shivaram/sparkr-pom-cleanup and squashes the following commits:

b282241 [Shivaram Venkataraman] Remove sparkr-docs from release script as well
8f100a5 [Shivaram Venkataraman] Move man pages creation to install-dev.sh This also helps us get rid of the sparkr-docs maven profile as docs are now built by just using -Psparkr when the roxygen2 package is available
---
 R/create-docs.sh                     |  5 +----
 R/install-dev.sh                     |  9 ++++++++-
 core/pom.xml                         | 23 -----------------------
 dev/create-release/create-release.sh | 16 ++++++++--------
 4 files changed, 17 insertions(+), 36 deletions(-)

diff --git a/R/create-docs.sh b/R/create-docs.sh
index af47c0863bdd0..6a4687b06ecb9 100755
--- a/R/create-docs.sh
+++ b/R/create-docs.sh
@@ -30,10 +30,7 @@ set -e
 export FWDIR="$(cd "`dirname "$0"`"; pwd)"
 pushd $FWDIR
 
-# Generate Rd file
-Rscript -e 'library(devtools); devtools::document(pkg="./pkg", roclets=c("rd"))'
-
-# Install the package
+# Install the package (this will also generate the Rd files)
 ./install-dev.sh
 
 # Now create HTML files
diff --git a/R/install-dev.sh b/R/install-dev.sh
index b9e2527035994..1edd551f8d243 100755
--- a/R/install-dev.sh
+++ b/R/install-dev.sh
@@ -34,5 +34,12 @@ LIB_DIR="$FWDIR/lib"
 
 mkdir -p $LIB_DIR
 
-# Install R
+pushd $FWDIR
+
+# Generate Rd files if devtools is installed
+Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
+
+# Install SparkR to $LIB_DIR
 R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
+
+popd
diff --git a/core/pom.xml b/core/pom.xml
index e35694e9e98b4..40a64beccdc24 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -481,29 +481,6 @@
         </plugins>
       </build>
     </profile>
-    <profile>
-      <id>sparkr-docs</id>
-      <build>
-        <plugins>
-          <plugin>
-            <groupId>org.codehaus.mojo</groupId>
-            <artifactId>exec-maven-plugin</artifactId>
-            <executions>
-              <execution>
-                <id>sparkr-pkg-docs</id>
-                <phase>compile</phase>
-                <goals>
-                  <goal>exec</goal>
-                </goals>
-              </execution>
-            </executions>
-            <configuration>
-              <executable>..${path.separator}R${path.separator}create-docs${script.extension}</executable>
-            </configuration>
-          </plugin>
-        </plugins>
-      </build>
-    </profile>
   </profiles>
 
 </project>
diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh
index 0b14a618e755c..54274a83f6d66 100755
--- a/dev/create-release/create-release.sh
+++ b/dev/create-release/create-release.sh
@@ -228,14 +228,14 @@ if [[ ! "$@" =~ --skip-package ]]; then
 
   # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds
   # share the same Zinc server.
-  make_binary_release "hadoop1" "-Psparkr -Psparkr-docs -Phadoop-1 -Phive -Phive-thriftserver" "3030" &
-  make_binary_release "hadoop1-scala2.11" "-Psparkr -Psparkr-docs -Phadoop-1 -Phive -Dscala-2.11" "3031" &
-  make_binary_release "cdh4" "-Psparkr -Psparkr-docs -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
-  make_binary_release "hadoop2.3" "-Psparkr -Psparkr-docs  -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" &
-  make_binary_release "hadoop2.4" "-Psparkr -Psparkr-docs -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" &
-  make_binary_release "mapr3" "-Pmapr3 -Psparkr -Psparkr-docs -Phive -Phive-thriftserver" "3035" &
-  make_binary_release "mapr4" "-Pmapr4 -Psparkr -Psparkr-docs -Pyarn -Phive -Phive-thriftserver" "3036" &
-  make_binary_release "hadoop2.4-without-hive" "-Psparkr -Psparkr-docs -Phadoop-2.4 -Pyarn" "3037" &
+  make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" &
+  make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" &
+  make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
+  make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" &
+  make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" &
+  make_binary_release "mapr3" "-Pmapr3 -Psparkr -Phive -Phive-thriftserver" "3035" &
+  make_binary_release "mapr4" "-Pmapr4 -Psparkr -Pyarn -Phive -Phive-thriftserver" "3036" &
+  make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" &
   wait
   rm -rf spark-$RELEASE_VERSION-bin-*/
 

From 0526fea483066086dfc27d1606f74220fe822f7f Mon Sep 17 00:00:00 2001
From: Cheolsoo Park <cheolsoop@netflix.com>
Date: Thu, 4 Jun 2015 13:27:35 -0700
Subject: [PATCH 363/525] [SPARK-6909][SQL] Remove Hive Shim code

This is a follow-up on #6393. I am removing the following files in this PR.
```
./sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala
./sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala
```
Basically, I re-factored the shim code as follows-
* Rewrote code directly with Hive 0.13 methods, or
* Converted code into private methods, or
* Extracted code into separate classes

But for leftover code that didn't fit in any of these cases, I created a HiveShim object. For eg, helper functions which wrap Hive 0.13 methods to work around Hive bugs are placed here.

Author: Cheolsoo Park <cheolsoop@netflix.com>

Closes #6604 from piaozhexiu/SPARK-6909 and squashes the following commits:

5dccc20 [Cheolsoo Park] Remove hive shim code
---
 .../hive/thriftserver/HiveThriftServer2.scala |  10 +-
 .../SparkExecuteStatementOperation.scala}     | 102 +---
 .../hive/thriftserver/SparkSQLCLIDriver.scala |   6 +-
 .../thriftserver/SparkSQLCLIService.scala     |   7 +-
 ...rkSQLDriver.scala => SparkSQLDriver.scala} |  20 +-
 .../sql/hive/thriftserver/SparkSQLEnv.scala   |   4 +-
 .../thriftserver/SparkSQLSessionManager.scala |  75 +++
 .../HiveThriftServer2Suites.scala             |   8 +-
 .../execution/HiveCompatibilitySuite.scala    |   3 +-
 .../apache/spark/sql/hive/HiveContext.scala   |  23 +-
 .../spark/sql/hive/HiveInspectors.scala       | 187 +++++--
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  22 +-
 .../org/apache/spark/sql/hive/HiveQl.scala    |   4 +-
 .../org/apache/spark/sql/hive/HiveShim.scala  | 247 ++++++++++
 .../apache/spark/sql/hive/TableReader.scala   |  11 +-
 .../hive/execution/InsertIntoHiveTable.scala  |  12 +-
 .../org/apache/spark/sql/hive/hiveUdfs.scala  |   1 +
 .../spark/sql/hive/hiveWriterContainers.scala |   3 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala  |  46 +-
 .../spark/sql/hive/StatisticsSuite.scala      |   4 -
 .../sql/hive/execution/HiveQuerySuite.scala   |  25 +-
 .../sql/hive/execution/SQLQuerySuite.scala    |  58 ++-
 .../org/apache/spark/sql/hive/Shim13.scala    | 457 ------------------
 23 files changed, 619 insertions(+), 716 deletions(-)
 rename sql/hive-thriftserver/{v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala => src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala} (66%)
 rename sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/{AbstractSparkSQLDriver.scala => SparkSQLDriver.scala} (86%)
 create mode 100644 sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala
 create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
 delete mode 100644 sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala

diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
index 94687eeda4179..5b391d3dce882 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
@@ -17,9 +17,6 @@
 
 package org.apache.spark.sql.hive.thriftserver
 
-import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
-
 import org.apache.commons.logging.LogFactory
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
@@ -29,12 +26,15 @@ import org.apache.hive.service.server.{HiveServer2, ServerOptionsProcessor}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd, SparkListenerJobStart}
 import org.apache.spark.sql.SQLConf
+import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab
-import org.apache.spark.sql.hive.{HiveContext, HiveShim}
 import org.apache.spark.util.Utils
 import org.apache.spark.{Logging, SparkContext}
 
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
 /**
  * The main entry point for the Spark SQL port of HiveServer2.  Starts up a `SparkSQLContext` and a
  * `HiveThriftServer2` thrift server.
@@ -51,7 +51,7 @@ object HiveThriftServer2 extends Logging {
   @DeveloperApi
   def startWithContext(sqlContext: HiveContext): Unit = {
     val server = new HiveThriftServer2(sqlContext)
-    sqlContext.setConf("spark.sql.hive.version", HiveShim.version)
+    sqlContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion)
     server.init(sqlContext.hiveconf)
     server.start()
     listener = new HiveThriftServer2Listener(server, sqlContext.conf)
diff --git a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
similarity index 66%
rename from sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala
rename to sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
index b9d4f1c58c982..c0d1266212cdd 100644
--- a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
@@ -18,66 +18,31 @@
 package org.apache.spark.sql.hive.thriftserver
 
 import java.sql.{Date, Timestamp}
-import java.util.concurrent.Executors
-import java.util.{ArrayList => JArrayList, List => JList, Map => JMap, UUID}
-
-import org.apache.commons.logging.Log
-import org.apache.hadoop.hive.conf.HiveConf
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.apache.hive.service.cli.thrift.TProtocolVersion
-import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager
-
-import scala.collection.JavaConversions._
-import scala.collection.mutable.{ArrayBuffer, Map => SMap}
+import java.util.{Map => JMap, UUID}
 
 import org.apache.hadoop.hive.metastore.api.FieldSchema
-import org.apache.hadoop.security.UserGroupInformation
 import org.apache.hive.service.cli._
 import org.apache.hive.service.cli.operation.ExecuteStatementOperation
-import org.apache.hive.service.cli.session.{SessionManager, HiveSession}
+import org.apache.hive.service.cli.session.HiveSession
 
-import org.apache.spark.{SparkContext, Logging}
-import org.apache.spark.sql.{DataFrame, Row => SparkRow, SQLConf}
+import org.apache.spark.Logging
 import org.apache.spark.sql.execution.SetCommand
-import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
 import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.{DataFrame, Row => SparkRow, SQLConf}
 
-/**
- * A compatibility layer for interacting with Hive version 0.13.1.
- */
-private[thriftserver] object HiveThriftServerShim {
-  val version = "0.13.1"
-
-  def setServerUserName(
-      sparkServiceUGI: UserGroupInformation,
-      sparkCliService:SparkSQLCLIService) = {
-    setSuperField(sparkCliService, "serviceUGI", sparkServiceUGI)
-  }
-}
-
-private[hive] class SparkSQLDriver(val _context: HiveContext = SparkSQLEnv.hiveContext)
-  extends AbstractSparkSQLDriver(_context) {
-  override def getResults(res: JList[_]): Boolean = {
-    if (hiveResponse == null) {
-      false
-    } else {
-      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse)
-      hiveResponse = null
-      true
-    }
-  }
-}
+import scala.collection.JavaConversions._
+import scala.collection.mutable.{ArrayBuffer, Map => SMap}
 
 private[hive] class SparkExecuteStatementOperation(
     parentSession: HiveSession,
     statement: String,
     confOverlay: JMap[String, String],
-    runInBackground: Boolean = true)(
-    hiveContext: HiveContext,
-    sessionToActivePool: SMap[SessionHandle, String])
+    runInBackground: Boolean = true)
+    (hiveContext: HiveContext, sessionToActivePool: SMap[SessionHandle, String])
   // NOTE: `runInBackground` is set to `false` intentionally to disable asynchronous execution
-  extends ExecuteStatementOperation(parentSession, statement, confOverlay, false) with Logging {
+  extends ExecuteStatementOperation(parentSession, statement, confOverlay, false)
+  with Logging {
 
   private var result: DataFrame = _
   private var iter: Iterator[SparkRow] = _
@@ -88,7 +53,7 @@ private[hive] class SparkExecuteStatementOperation(
     logDebug("CLOSING")
   }
 
-  def addNonNullColumnValue(from: SparkRow, to: ArrayBuffer[Any],  ordinal: Int) {
+  def addNonNullColumnValue(from: SparkRow, to: ArrayBuffer[Any], ordinal: Int) {
     dataTypes(ordinal) match {
       case StringType =>
         to += from.getString(ordinal)
@@ -209,48 +174,3 @@ private[hive] class SparkExecuteStatementOperation(
     HiveThriftServer2.listener.onStatementFinish(statementId)
   }
 }
-
-private[hive] class SparkSQLSessionManager(hiveContext: HiveContext)
-  extends SessionManager
-  with ReflectedCompositeService {
-
-  private lazy val sparkSqlOperationManager = new SparkSQLOperationManager(hiveContext)
-
-  override def init(hiveConf: HiveConf) {
-    setSuperField(this, "hiveConf", hiveConf)
-
-    val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS)
-    setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize))
-    getAncestorField[Log](this, 3, "LOG").info(
-      s"HiveServer2: Async execution pool size $backgroundPoolSize")
-
-    setSuperField(this, "operationManager", sparkSqlOperationManager)
-    addService(sparkSqlOperationManager)
-
-    initCompositeService(hiveConf)
-  }
-
-  override def openSession(
-      protocol: TProtocolVersion,
-      username: String,
-      passwd: String,
-      sessionConf: java.util.Map[String, String],
-      withImpersonation: Boolean,
-      delegationToken: String): SessionHandle = {
-    hiveContext.openSession()
-    val sessionHandle = super.openSession(
-      protocol, username, passwd, sessionConf, withImpersonation, delegationToken)
-    val session = super.getSession(sessionHandle)
-    HiveThriftServer2.listener.onSessionCreated(
-      session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername)
-    sessionHandle
-  }
-
-  override def closeSession(sessionHandle: SessionHandle) {
-    HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString)
-    super.closeSession(sessionHandle)
-    sparkSqlOperationManager.sessionToActivePool -= sessionHandle
-
-    hiveContext.detachSession()
-  }
-}
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
index 14f6f658d9b75..039cfa40d26b3 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
@@ -32,12 +32,12 @@ import org.apache.hadoop.hive.common.{HiveInterruptCallback, HiveInterruptUtils}
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.ql.Driver
 import org.apache.hadoop.hive.ql.exec.Utilities
-import org.apache.hadoop.hive.ql.processors.{AddResourceProcessor, SetProcessor, CommandProcessor}
+import org.apache.hadoop.hive.ql.processors.{AddResourceProcessor, SetProcessor, CommandProcessor, CommandProcessorFactory}
 import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.thrift.transport.TSocket
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.hive.{HiveContext, HiveShim}
+import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.util.Utils
 
 private[hive] object SparkSQLCLIDriver {
@@ -267,7 +267,7 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging {
     } else {
       var ret = 0
       val hconf = conf.asInstanceOf[HiveConf]
-      val proc: CommandProcessor = HiveShim.getCommandProcessor(Array(tokens(0)), hconf)
+      val proc: CommandProcessor = CommandProcessorFactory.get(Array(tokens(0)), hconf)
 
       if (proc != null) {
         if (proc.isInstanceOf[Driver] || proc.isInstanceOf[SetProcessor] ||
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala
index 499e077d7294a..41f647d5f8c5a 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala
@@ -21,8 +21,6 @@ import java.io.IOException
 import java.util.{List => JList}
 import javax.security.auth.login.LoginException
 
-import scala.collection.JavaConversions._
-
 import org.apache.commons.logging.Log
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.shims.ShimLoader
@@ -34,7 +32,8 @@ import org.apache.hive.service.{AbstractService, Service, ServiceException}
 
 import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
-import org.apache.spark.util.Utils
+
+import scala.collection.JavaConversions._
 
 private[hive] class SparkSQLCLIService(hiveContext: HiveContext)
   extends CLIService
@@ -52,7 +51,7 @@ private[hive] class SparkSQLCLIService(hiveContext: HiveContext)
       try {
         HiveAuthFactory.loginFromKeytab(hiveConf)
         sparkServiceUGI = ShimLoader.getHadoopShims.getUGIForConf(hiveConf)
-        HiveThriftServerShim.setServerUserName(sparkServiceUGI, this)
+        setSuperField(this, "serviceUGI", sparkServiceUGI)
       } catch {
         case e @ (_: IOException | _: LoginException) =>
           throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala
similarity index 86%
rename from sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala
rename to sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala
index 48ac9062af96a..77272aecf2835 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.hive.thriftserver
 
-import scala.collection.JavaConversions._
+import java.util.{ArrayList => JArrayList, List => JList}
 
 import org.apache.commons.lang3.exception.ExceptionUtils
 import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
@@ -27,8 +27,12 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse
 import org.apache.spark.Logging
 import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}
 
-private[hive] abstract class AbstractSparkSQLDriver(
-    val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver with Logging {
+import scala.collection.JavaConversions._
+
+private[hive] class SparkSQLDriver(
+    val context: HiveContext = SparkSQLEnv.hiveContext)
+  extends Driver
+  with Logging {
 
   private[hive] var tableSchema: Schema = _
   private[hive] var hiveResponse: Seq[String] = _
@@ -71,6 +75,16 @@ private[hive] abstract class AbstractSparkSQLDriver(
     0
   }
 
+  override def getResults(res: JList[_]): Boolean = {
+    if (hiveResponse == null) {
+      false
+    } else {
+      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse)
+      hiveResponse = null
+      true
+    }
+  }
+
   override def getSchema: Schema = tableSchema
 
   override def destroy() {
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala
index 7c0c505e2d61e..79eda1f5123bf 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala
@@ -22,7 +22,7 @@ import java.io.PrintStream
 import scala.collection.JavaConversions._
 
 import org.apache.spark.scheduler.StatsReportListener
-import org.apache.spark.sql.hive.{HiveShim, HiveContext}
+import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.{Logging, SparkConf, SparkContext}
 import org.apache.spark.util.Utils
 
@@ -56,7 +56,7 @@ private[hive] object SparkSQLEnv extends Logging {
       hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8"))
       hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8"))
 
-      hiveContext.setConf("spark.sql.hive.version", HiveShim.version)
+      hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion)
 
       if (log.isDebugEnabled) {
         hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) =>
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala
new file mode 100644
index 0000000000000..357b27f7401a3
--- /dev/null
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.thriftserver
+
+import java.util.concurrent.Executors
+
+import org.apache.commons.logging.Log
+import org.apache.hadoop.hive.conf.HiveConf
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars
+import org.apache.hive.service.cli.SessionHandle
+import org.apache.hive.service.cli.session.SessionManager
+import org.apache.hive.service.cli.thrift.TProtocolVersion
+
+import org.apache.spark.sql.hive.HiveContext
+import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
+import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager
+
+private[hive] class SparkSQLSessionManager(hiveContext: HiveContext)
+  extends SessionManager
+  with ReflectedCompositeService {
+
+  private lazy val sparkSqlOperationManager = new SparkSQLOperationManager(hiveContext)
+
+  override def init(hiveConf: HiveConf) {
+    setSuperField(this, "hiveConf", hiveConf)
+
+    val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS)
+    setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize))
+    getAncestorField[Log](this, 3, "LOG").info(
+      s"HiveServer2: Async execution pool size $backgroundPoolSize")
+
+    setSuperField(this, "operationManager", sparkSqlOperationManager)
+    addService(sparkSqlOperationManager)
+
+    initCompositeService(hiveConf)
+  }
+
+  override def openSession(protocol: TProtocolVersion,
+                           username: String,
+                           passwd: String,
+                           sessionConf: java.util.Map[String, String],
+                           withImpersonation: Boolean,
+                           delegationToken: String): SessionHandle = {
+    hiveContext.openSession()
+    val sessionHandle = super.openSession(
+      protocol, username, passwd, sessionConf, withImpersonation, delegationToken)
+    val session = super.getSession(sessionHandle)
+    HiveThriftServer2.listener.onSessionCreated(
+      session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername)
+    sessionHandle
+  }
+
+  override def closeSession(sessionHandle: SessionHandle) {
+    HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString)
+    super.closeSession(sessionHandle)
+    sparkSqlOperationManager.sessionToActivePool -= sessionHandle
+
+    hiveContext.detachSession()
+  }
+}
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
index a93a3dee43511..f57c7083ea504 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
@@ -40,7 +40,7 @@ import org.apache.thrift.transport.TSocket
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.{Logging, SparkFunSuite}
-import org.apache.spark.sql.hive.HiveShim
+import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.util.Utils
 
 object TestData {
@@ -111,7 +111,8 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest {
     withJdbcStatement { statement =>
       val resultSet = statement.executeQuery("SET spark.sql.hive.version")
       resultSet.next()
-      assert(resultSet.getString(1) === s"spark.sql.hive.version=${HiveShim.version}")
+      assert(resultSet.getString(1) ===
+        s"spark.sql.hive.version=${HiveContext.hiveExecutionVersion}")
     }
   }
 
@@ -365,7 +366,8 @@ class HiveThriftHttpServerSuite extends HiveThriftJdbcTest {
     withJdbcStatement { statement =>
       val resultSet = statement.executeQuery("SET spark.sql.hive.version")
       resultSet.next()
-      assert(resultSet.getString(1) === s"spark.sql.hive.version=${HiveShim.version}")
+      assert(resultSet.getString(1) ===
+        s"spark.sql.hive.version=${HiveContext.hiveExecutionVersion}")
     }
   }
 }
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 0b1917a392901..048f78b4daa8d 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -23,7 +23,6 @@ import java.util.{Locale, TimeZone}
 import org.scalatest.BeforeAndAfter
 
 import org.apache.spark.sql.SQLConf
-import org.apache.spark.sql.hive.HiveShim
 import org.apache.spark.sql.hive.test.TestHive
 
 /**
@@ -254,7 +253,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
 
     // the answer is sensitive for jdk version
     "udf_java_method"
-  ) ++ HiveShim.compatibilityBlackList
+  )
 
   /**
    * The set of tests that are believed to be working in catalyst. Tests not on whiteList or
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index fbf2c7d8cbc06..800f51c5e2e86 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -17,37 +17,34 @@
 
 package org.apache.spark.sql.hive
 
-import java.io.{BufferedReader, File, InputStreamReader, PrintStream}
+import java.io.File
 import java.net.{URL, URLClassLoader}
 import java.sql.Timestamp
-import java.util.{ArrayList => JArrayList}
 
-import org.apache.hadoop.hive.ql.parse.VariableSubstitution
+import org.apache.hadoop.hive.common.StatsSetupConst
+import org.apache.hadoop.hive.common.`type`.HiveDecimal
 import org.apache.spark.sql.catalyst.ParserDialect
 
 import scala.collection.JavaConversions._
-import scala.collection.mutable.{ArrayBuffer, HashMap}
+import scala.collection.mutable.HashMap
 import scala.language.implicitConversions
 
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.hive.conf.HiveConf
-import org.apache.hadoop.hive.ql.Driver
 import org.apache.hadoop.hive.ql.metadata.Table
 import org.apache.hadoop.hive.ql.parse.VariableSubstitution
-import org.apache.hadoop.hive.ql.processors._
 import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable}
 
-import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateSubQueries, OverrideCatalog, OverrideFunctionRegistry}
 import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, QueryExecutionException, SetCommand}
+import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, SetCommand}
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand}
-import org.apache.spark.sql.sources.{DDLParser, DataSourceStrategy}
+import org.apache.spark.sql.sources.DataSourceStrategy
 import org.apache.spark.sql.catalyst.CatalystConf
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -331,7 +328,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
 
         val tableParameters = relation.hiveQlTable.getParameters
         val oldTotalSize =
-          Option(tableParameters.get(HiveShim.getStatsSetupConstTotalSize))
+          Option(tableParameters.get(StatsSetupConst.TOTAL_SIZE))
             .map(_.toLong)
             .getOrElse(0L)
         val newTotalSize = getFileSizeForTable(hiveconf, relation.hiveQlTable)
@@ -342,7 +339,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
           catalog.client.alterTable(
             relation.table.copy(
               properties = relation.table.properties +
-                (HiveShim.getStatsSetupConstTotalSize -> newTotalSize.toString)))
+                (StatsSetupConst.TOTAL_SIZE -> newTotalSize.toString)))
         }
       case otherRelation =>
         throw new UnsupportedOperationException(
@@ -564,7 +561,7 @@ private[hive] object HiveContext {
     case (bin: Array[Byte], BinaryType) => new String(bin, "UTF-8")
     case (decimal: java.math.BigDecimal, DecimalType()) =>
       // Hive strips trailing zeros so use its toString
-      HiveShim.createDecimal(decimal).toString
+      HiveDecimal.create(decimal).toString
     case (other, tpe) if primitiveTypes contains tpe => other.toString
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 24cd335082639..c466203cd0220 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.hive
 import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar}
 import org.apache.hadoop.hive.serde2.objectinspector.primitive._
 import org.apache.hadoop.hive.serde2.objectinspector.{StructField => HiveStructField, _}
+import org.apache.hadoop.hive.serde2.typeinfo.{DecimalTypeInfo, TypeInfoFactory}
 import org.apache.hadoop.hive.serde2.{io => hiveIo}
 import org.apache.hadoop.{io => hadoopIo}
 
@@ -350,7 +351,7 @@ private[hive] trait HiveInspectors {
         new HiveVarchar(s, s.size)
 
     case _: JavaHiveDecimalObjectInspector =>
-      (o: Any) => HiveShim.createDecimal(o.asInstanceOf[Decimal].toJavaBigDecimal)
+      (o: Any) => HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal)
 
     case _: JavaDateObjectInspector =>
       (o: Any) => DateUtils.toJavaDate(o.asInstanceOf[Int])
@@ -439,31 +440,31 @@ private[hive] trait HiveInspectors {
     case _ if a == null => null
     case x: PrimitiveObjectInspector => x match {
       // TODO we don't support the HiveVarcharObjectInspector yet.
-      case _: StringObjectInspector if x.preferWritable() => HiveShim.getStringWritable(a)
+      case _: StringObjectInspector if x.preferWritable() => getStringWritable(a)
       case _: StringObjectInspector => a.asInstanceOf[UTF8String].toString()
-      case _: IntObjectInspector if x.preferWritable() => HiveShim.getIntWritable(a)
+      case _: IntObjectInspector if x.preferWritable() => getIntWritable(a)
       case _: IntObjectInspector => a.asInstanceOf[java.lang.Integer]
-      case _: BooleanObjectInspector if x.preferWritable() => HiveShim.getBooleanWritable(a)
+      case _: BooleanObjectInspector if x.preferWritable() => getBooleanWritable(a)
       case _: BooleanObjectInspector => a.asInstanceOf[java.lang.Boolean]
-      case _: FloatObjectInspector if x.preferWritable() => HiveShim.getFloatWritable(a)
+      case _: FloatObjectInspector if x.preferWritable() => getFloatWritable(a)
       case _: FloatObjectInspector => a.asInstanceOf[java.lang.Float]
-      case _: DoubleObjectInspector if x.preferWritable() => HiveShim.getDoubleWritable(a)
+      case _: DoubleObjectInspector if x.preferWritable() => getDoubleWritable(a)
       case _: DoubleObjectInspector => a.asInstanceOf[java.lang.Double]
-      case _: LongObjectInspector if x.preferWritable() => HiveShim.getLongWritable(a)
+      case _: LongObjectInspector if x.preferWritable() => getLongWritable(a)
       case _: LongObjectInspector => a.asInstanceOf[java.lang.Long]
-      case _: ShortObjectInspector if x.preferWritable() => HiveShim.getShortWritable(a)
+      case _: ShortObjectInspector if x.preferWritable() => getShortWritable(a)
       case _: ShortObjectInspector => a.asInstanceOf[java.lang.Short]
-      case _: ByteObjectInspector if x.preferWritable() => HiveShim.getByteWritable(a)
+      case _: ByteObjectInspector if x.preferWritable() => getByteWritable(a)
       case _: ByteObjectInspector => a.asInstanceOf[java.lang.Byte]
       case _: HiveDecimalObjectInspector if x.preferWritable() =>
-        HiveShim.getDecimalWritable(a.asInstanceOf[Decimal])
+        getDecimalWritable(a.asInstanceOf[Decimal])
       case _: HiveDecimalObjectInspector =>
-        HiveShim.createDecimal(a.asInstanceOf[Decimal].toJavaBigDecimal)
-      case _: BinaryObjectInspector if x.preferWritable() => HiveShim.getBinaryWritable(a)
+        HiveDecimal.create(a.asInstanceOf[Decimal].toJavaBigDecimal)
+      case _: BinaryObjectInspector if x.preferWritable() => getBinaryWritable(a)
       case _: BinaryObjectInspector => a.asInstanceOf[Array[Byte]]
-      case _: DateObjectInspector if x.preferWritable() => HiveShim.getDateWritable(a)
+      case _: DateObjectInspector if x.preferWritable() => getDateWritable(a)
       case _: DateObjectInspector => DateUtils.toJavaDate(a.asInstanceOf[Int])
-      case _: TimestampObjectInspector if x.preferWritable() => HiveShim.getTimestampWritable(a)
+      case _: TimestampObjectInspector if x.preferWritable() => getTimestampWritable(a)
       case _: TimestampObjectInspector => a.asInstanceOf[java.sql.Timestamp]
     }
     case x: SettableStructObjectInspector =>
@@ -574,31 +575,31 @@ private[hive] trait HiveInspectors {
    */
   def toInspector(expr: Expression): ObjectInspector = expr match {
     case Literal(value, StringType) =>
-      HiveShim.getStringWritableConstantObjectInspector(value)
+      getStringWritableConstantObjectInspector(value)
     case Literal(value, IntegerType) =>
-      HiveShim.getIntWritableConstantObjectInspector(value)
+      getIntWritableConstantObjectInspector(value)
     case Literal(value, DoubleType) =>
-      HiveShim.getDoubleWritableConstantObjectInspector(value)
+      getDoubleWritableConstantObjectInspector(value)
     case Literal(value, BooleanType) =>
-      HiveShim.getBooleanWritableConstantObjectInspector(value)
+      getBooleanWritableConstantObjectInspector(value)
     case Literal(value, LongType) =>
-      HiveShim.getLongWritableConstantObjectInspector(value)
+      getLongWritableConstantObjectInspector(value)
     case Literal(value, FloatType) =>
-      HiveShim.getFloatWritableConstantObjectInspector(value)
+      getFloatWritableConstantObjectInspector(value)
     case Literal(value, ShortType) =>
-      HiveShim.getShortWritableConstantObjectInspector(value)
+      getShortWritableConstantObjectInspector(value)
     case Literal(value, ByteType) =>
-      HiveShim.getByteWritableConstantObjectInspector(value)
+      getByteWritableConstantObjectInspector(value)
     case Literal(value, BinaryType) =>
-      HiveShim.getBinaryWritableConstantObjectInspector(value)
+      getBinaryWritableConstantObjectInspector(value)
     case Literal(value, DateType) =>
-      HiveShim.getDateWritableConstantObjectInspector(value)
+      getDateWritableConstantObjectInspector(value)
     case Literal(value, TimestampType) =>
-      HiveShim.getTimestampWritableConstantObjectInspector(value)
+      getTimestampWritableConstantObjectInspector(value)
     case Literal(value, DecimalType()) =>
-      HiveShim.getDecimalWritableConstantObjectInspector(value)
+      getDecimalWritableConstantObjectInspector(value)
     case Literal(_, NullType) =>
-      HiveShim.getPrimitiveNullWritableConstantObjectInspector
+      getPrimitiveNullWritableConstantObjectInspector
     case Literal(value, ArrayType(dt, _)) =>
       val listObjectInspector = toInspector(dt)
       if (value == null) {
@@ -658,8 +659,8 @@ private[hive] trait HiveInspectors {
     case _: JavaFloatObjectInspector => FloatType
     case _: WritableBinaryObjectInspector => BinaryType
     case _: JavaBinaryObjectInspector => BinaryType
-    case w: WritableHiveDecimalObjectInspector => HiveShim.decimalTypeInfoToCatalyst(w)
-    case j: JavaHiveDecimalObjectInspector => HiveShim.decimalTypeInfoToCatalyst(j)
+    case w: WritableHiveDecimalObjectInspector => decimalTypeInfoToCatalyst(w)
+    case j: JavaHiveDecimalObjectInspector => decimalTypeInfoToCatalyst(j)
     case _: WritableDateObjectInspector => DateType
     case _: JavaDateObjectInspector => DateType
     case _: WritableTimestampObjectInspector => TimestampType
@@ -668,10 +669,136 @@ private[hive] trait HiveInspectors {
     case _: JavaVoidObjectInspector => NullType
   }
 
+  private def decimalTypeInfoToCatalyst(inspector: PrimitiveObjectInspector): DecimalType = {
+    val info = inspector.getTypeInfo.asInstanceOf[DecimalTypeInfo]
+    DecimalType(info.precision(), info.scale())
+  }
+
+  private def getStringWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.stringTypeInfo, getStringWritable(value))
+
+  private def getIntWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.intTypeInfo, getIntWritable(value))
+
+  private def getDoubleWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.doubleTypeInfo, getDoubleWritable(value))
+
+  private def getBooleanWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.booleanTypeInfo, getBooleanWritable(value))
+
+  private def getLongWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.longTypeInfo, getLongWritable(value))
+
+  private def getFloatWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.floatTypeInfo, getFloatWritable(value))
+
+  private def getShortWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.shortTypeInfo, getShortWritable(value))
+
+  private def getByteWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.byteTypeInfo, getByteWritable(value))
+
+  private def getBinaryWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.binaryTypeInfo, getBinaryWritable(value))
+
+  private def getDateWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.dateTypeInfo, getDateWritable(value))
+
+  private def getTimestampWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.timestampTypeInfo, getTimestampWritable(value))
+
+  private def getDecimalWritableConstantObjectInspector(value: Any): ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.decimalTypeInfo, getDecimalWritable(value))
+
+  private def getPrimitiveNullWritableConstantObjectInspector: ObjectInspector =
+    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+      TypeInfoFactory.voidTypeInfo, null)
+
+  private def getStringWritable(value: Any): hadoopIo.Text =
+    if (value == null) null else new hadoopIo.Text(value.asInstanceOf[UTF8String].toString)
+
+  private def getIntWritable(value: Any): hadoopIo.IntWritable =
+    if (value == null) null else new hadoopIo.IntWritable(value.asInstanceOf[Int])
+
+  private def getDoubleWritable(value: Any): hiveIo.DoubleWritable =
+    if (value == null) {
+      null
+    } else {
+      new hiveIo.DoubleWritable(value.asInstanceOf[Double])
+    }
+
+  private def getBooleanWritable(value: Any): hadoopIo.BooleanWritable =
+    if (value == null) {
+      null
+    } else {
+      new hadoopIo.BooleanWritable(value.asInstanceOf[Boolean])
+    }
+
+  private def getLongWritable(value: Any): hadoopIo.LongWritable =
+    if (value == null) null else new hadoopIo.LongWritable(value.asInstanceOf[Long])
+
+  private def getFloatWritable(value: Any): hadoopIo.FloatWritable =
+    if (value == null) {
+      null
+    } else {
+      new hadoopIo.FloatWritable(value.asInstanceOf[Float])
+    }
+
+  private def getShortWritable(value: Any): hiveIo.ShortWritable =
+    if (value == null) null else new hiveIo.ShortWritable(value.asInstanceOf[Short])
+
+  private def getByteWritable(value: Any): hiveIo.ByteWritable =
+    if (value == null) null else new hiveIo.ByteWritable(value.asInstanceOf[Byte])
+
+  private def getBinaryWritable(value: Any): hadoopIo.BytesWritable =
+    if (value == null) {
+      null
+    } else {
+      new hadoopIo.BytesWritable(value.asInstanceOf[Array[Byte]])
+    }
+
+  private def getDateWritable(value: Any): hiveIo.DateWritable =
+    if (value == null) null else new hiveIo.DateWritable(value.asInstanceOf[Int])
+
+  private def getTimestampWritable(value: Any): hiveIo.TimestampWritable =
+    if (value == null) {
+      null
+    } else {
+      new hiveIo.TimestampWritable(value.asInstanceOf[java.sql.Timestamp])
+    }
+
+  private def getDecimalWritable(value: Any): hiveIo.HiveDecimalWritable =
+    if (value == null) {
+      null
+    } else {
+      // TODO precise, scale?
+      new hiveIo.HiveDecimalWritable(
+        HiveDecimal.create(value.asInstanceOf[Decimal].toJavaBigDecimal))
+    }
+
   implicit class typeInfoConversions(dt: DataType) {
     import org.apache.hadoop.hive.serde2.typeinfo._
     import TypeInfoFactory._
 
+    private def decimalTypeInfo(decimalType: DecimalType): TypeInfo = decimalType match {
+      case DecimalType.Fixed(precision, scale) => new DecimalTypeInfo(precision, scale)
+      case _ => new DecimalTypeInfo(
+        HiveShim.UNLIMITED_DECIMAL_PRECISION,
+        HiveShim.UNLIMITED_DECIMAL_SCALE)
+    }
+
     def toTypeInfo: TypeInfo = dt match {
       case ArrayType(elemType, _) =>
         getListTypeInfo(elemType.toTypeInfo)
@@ -690,7 +817,7 @@ private[hive] trait HiveInspectors {
       case LongType => longTypeInfo
       case ShortType => shortTypeInfo
       case StringType => stringTypeInfo
-      case d: DecimalType => HiveShim.decimalTypeInfo(d)
+      case d: DecimalType => decimalTypeInfo(d)
       case DateType => dateTypeInfo
       case TimestampType => timestampTypeInfo
       case NullType => voidTypeInfo
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index ca1f49b546bd7..5a4651a887b7c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -19,11 +19,13 @@ package org.apache.spark.sql.hive
 
 import com.google.common.base.Objects
 import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}
+
 import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.common.StatsSetupConst
 import org.apache.hadoop.hive.metastore.Warehouse
 import org.apache.hadoop.hive.metastore.api.FieldSchema
 import org.apache.hadoop.hive.ql.metadata._
-import org.apache.hadoop.hive.serde2.Deserializer
+import org.apache.hadoop.hive.ql.plan.TableDesc
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.analysis.{Catalog, MultiInstanceRelation, OverrideCatalog}
@@ -37,7 +39,6 @@ import org.apache.spark.sql.parquet.ParquetRelation2
 import org.apache.spark.sql.sources.{CreateTableUsingAsSelect, LogicalRelation, Partition => ParquetPartition, PartitionSpec, ResolvedDataSource}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode, sources}
-import org.apache.spark.util.Utils
 
 /* Implicit conversions */
 import scala.collection.JavaConversions._
@@ -670,8 +671,8 @@ private[hive] case class MetastoreRelation
 
   @transient override lazy val statistics: Statistics = Statistics(
     sizeInBytes = {
-      val totalSize = hiveQlTable.getParameters.get(HiveShim.getStatsSetupConstTotalSize)
-      val rawDataSize = hiveQlTable.getParameters.get(HiveShim.getStatsSetupConstRawDataSize)
+      val totalSize = hiveQlTable.getParameters.get(StatsSetupConst.TOTAL_SIZE)
+      val rawDataSize = hiveQlTable.getParameters.get(StatsSetupConst.RAW_DATA_SIZE)
       // TODO: check if this estimate is valid for tables after partition pruning.
       // NOTE: getting `totalSize` directly from params is kind of hacky, but this should be
       // relatively cheap if parameters for the table are populated into the metastore.  An
@@ -697,11 +698,7 @@ private[hive] case class MetastoreRelation
     }
   }
 
-  val tableDesc = HiveShim.getTableDesc(
-    Class.forName(
-      hiveQlTable.getSerializationLib,
-      true,
-      Utils.getContextOrSparkClassLoader).asInstanceOf[Class[Deserializer]],
+  val tableDesc = new TableDesc(
     hiveQlTable.getInputFormatClass,
     // The class of table should be org.apache.hadoop.hive.ql.metadata.Table because
     // getOutputFormatClass will use HiveFileFormatUtils.getOutputFormatSubstitute to
@@ -743,6 +740,11 @@ private[hive] case class MetastoreRelation
 private[hive] object HiveMetastoreTypes {
   def toDataType(metastoreType: String): DataType = DataTypeParser.parse(metastoreType)
 
+  def decimalMetastoreString(decimalType: DecimalType): String = decimalType match {
+    case DecimalType.Fixed(precision, scale) => s"decimal($precision,$scale)"
+    case _ => s"decimal($HiveShim.UNLIMITED_DECIMAL_PRECISION,$HiveShim.UNLIMITED_DECIMAL_SCALE)"
+  }
+
   def toMetastoreType(dt: DataType): String = dt match {
     case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>"
     case StructType(fields) =>
@@ -759,7 +761,7 @@ private[hive] object HiveMetastoreTypes {
     case BinaryType => "binary"
     case BooleanType => "boolean"
     case DateType => "date"
-    case d: DecimalType => HiveShim.decimalMetastoreString(d)
+    case d: DecimalType => decimalMetastoreString(d)
     case TimestampType => "timestamp"
     case NullType => "void"
     case udt: UserDefinedType[_] => toMetastoreType(udt.sqlType)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index a5ca3613c5e00..9544d12c9053c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -19,8 +19,6 @@ package org.apache.spark.sql.hive
 
 import java.sql.Date
 
-import scala.collection.mutable.ArrayBuffer
-
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.serde.serdeConstants
 import org.apache.hadoop.hive.ql.{ErrorMsg, Context}
@@ -39,6 +37,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.trees.CurrentOrigin
 import org.apache.spark.sql.execution.ExplainCommand
 import org.apache.spark.sql.sources.DescribeCommand
+import org.apache.spark.sql.hive.HiveShim._
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema}
 import org.apache.spark.sql.types._
@@ -46,6 +45,7 @@ import org.apache.spark.util.random.RandomSampler
 
 /* Implicit conversions */
 import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
 
 /**
  * Used when we need to start parsing the AST before deciding that we are going to pass the command
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
new file mode 100644
index 0000000000000..fa5409f602444
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import java.io.{InputStream, OutputStream}
+import java.rmi.server.UID
+
+import com.esotericsoftware.kryo.Kryo
+import com.esotericsoftware.kryo.io.{Input, Output}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.ql.exec.{UDF, Utilities}
+import org.apache.hadoop.hive.ql.plan.{FileSinkDesc, TableDesc}
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils
+import org.apache.hadoop.hive.serde2.avro.AvroGenericRecordWritable
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector
+import org.apache.hadoop.io.Writable
+
+import org.apache.spark.Logging
+import org.apache.spark.sql.types.Decimal
+import org.apache.spark.util.Utils
+
+/* Implicit conversions */
+import scala.collection.JavaConversions._
+import scala.reflect.ClassTag
+
+private[hive] object HiveShim {
+  // Precision and scale to pass for unlimited decimals; these are the same as the precision and
+  // scale Hive 0.13 infers for BigDecimals from sources that don't specify them (e.g. UDFs)
+  val UNLIMITED_DECIMAL_PRECISION = 38
+  val UNLIMITED_DECIMAL_SCALE = 18
+
+  /*
+   * This function in hive-0.13 become private, but we have to do this to walkaround hive bug
+   */
+  private def appendReadColumnNames(conf: Configuration, cols: Seq[String]) {
+    val old: String = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "")
+    val result: StringBuilder = new StringBuilder(old)
+    var first: Boolean = old.isEmpty
+
+    for (col <- cols) {
+      if (first) {
+        first = false
+      } else {
+        result.append(',')
+      }
+      result.append(col)
+    }
+    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, result.toString)
+  }
+
+  /*
+   * Cannot use ColumnProjectionUtils.appendReadColumns directly, if ids is null or empty
+   */
+  def appendReadColumns(conf: Configuration, ids: Seq[Integer], names: Seq[String]) {
+    if (ids != null && ids.size > 0) {
+      ColumnProjectionUtils.appendReadColumns(conf, ids)
+    }
+    if (names != null && names.size > 0) {
+      appendReadColumnNames(conf, names)
+    }
+  }
+
+  /*
+   * Bug introduced in hive-0.13. AvroGenericRecordWritable has a member recordReaderID that
+   * is needed to initialize before serialization.
+   */
+  def prepareWritable(w: Writable): Writable = {
+    w match {
+      case w: AvroGenericRecordWritable =>
+        w.setRecordReaderID(new UID())
+      case _ =>
+    }
+    w
+  }
+
+  def toCatalystDecimal(hdoi: HiveDecimalObjectInspector, data: Any): Decimal = {
+    if (hdoi.preferWritable()) {
+      Decimal(hdoi.getPrimitiveWritableObject(data).getHiveDecimal().bigDecimalValue,
+        hdoi.precision(), hdoi.scale())
+    } else {
+      Decimal(hdoi.getPrimitiveJavaObject(data).bigDecimalValue(), hdoi.precision(), hdoi.scale())
+    }
+  }
+
+  /**
+   * This class provides the UDF creation and also the UDF instance serialization and
+   * de-serialization cross process boundary.
+   *
+   * Detail discussion can be found at https://github.com/apache/spark/pull/3640
+   *
+   * @param functionClassName UDF class name
+   */
+  private[hive] case class HiveFunctionWrapper(var functionClassName: String)
+    extends java.io.Externalizable {
+
+    // for Serialization
+    def this() = this(null)
+
+    @transient
+    def deserializeObjectByKryo[T: ClassTag](
+        kryo: Kryo,
+        in: InputStream,
+        clazz: Class[_]): T = {
+      val inp = new Input(in)
+      val t: T = kryo.readObject(inp, clazz).asInstanceOf[T]
+      inp.close()
+      t
+    }
+
+    @transient
+    def serializeObjectByKryo(
+        kryo: Kryo,
+        plan: Object,
+        out: OutputStream) {
+      val output: Output = new Output(out)
+      kryo.writeObject(output, plan)
+      output.close()
+    }
+
+    def deserializePlan[UDFType](is: java.io.InputStream, clazz: Class[_]): UDFType = {
+      deserializeObjectByKryo(Utilities.runtimeSerializationKryo.get(), is, clazz)
+        .asInstanceOf[UDFType]
+    }
+
+    def serializePlan(function: AnyRef, out: java.io.OutputStream): Unit = {
+      serializeObjectByKryo(Utilities.runtimeSerializationKryo.get(), function, out)
+    }
+
+    private var instance: AnyRef = null
+
+    def writeExternal(out: java.io.ObjectOutput) {
+      // output the function name
+      out.writeUTF(functionClassName)
+
+      // Write a flag if instance is null or not
+      out.writeBoolean(instance != null)
+      if (instance != null) {
+        // Some of the UDF are serializable, but some others are not
+        // Hive Utilities can handle both cases
+        val baos = new java.io.ByteArrayOutputStream()
+        serializePlan(instance, baos)
+        val functionInBytes = baos.toByteArray
+
+        // output the function bytes
+        out.writeInt(functionInBytes.length)
+        out.write(functionInBytes, 0, functionInBytes.length)
+      }
+    }
+
+    def readExternal(in: java.io.ObjectInput) {
+      // read the function name
+      functionClassName = in.readUTF()
+
+      if (in.readBoolean()) {
+        // if the instance is not null
+        // read the function in bytes
+        val functionInBytesLength = in.readInt()
+        val functionInBytes = new Array[Byte](functionInBytesLength)
+        in.read(functionInBytes, 0, functionInBytesLength)
+
+        // deserialize the function object via Hive Utilities
+        instance = deserializePlan[AnyRef](new java.io.ByteArrayInputStream(functionInBytes),
+          Utils.getContextOrSparkClassLoader.loadClass(functionClassName))
+      }
+    }
+
+    def createFunction[UDFType <: AnyRef](): UDFType = {
+      if (instance != null) {
+        instance.asInstanceOf[UDFType]
+      } else {
+        val func = Utils.getContextOrSparkClassLoader
+          .loadClass(functionClassName).newInstance.asInstanceOf[UDFType]
+        if (!func.isInstanceOf[UDF]) {
+          // We cache the function if it's no the Simple UDF,
+          // as we always have to create new instance for Simple UDF
+          instance = func
+        }
+        func
+      }
+    }
+  }
+
+  /*
+ * Bug introduced in hive-0.13. FileSinkDesc is serializable, but its member path is not.
+ * Fix it through wrapper.
+ * */
+  implicit def wrapperToFileSinkDesc(w: ShimFileSinkDesc): FileSinkDesc = {
+    var f = new FileSinkDesc(new Path(w.dir), w.tableInfo, w.compressed)
+    f.setCompressCodec(w.compressCodec)
+    f.setCompressType(w.compressType)
+    f.setTableInfo(w.tableInfo)
+    f.setDestTableId(w.destTableId)
+    f
+  }
+
+  /*
+   * Bug introduced in hive-0.13. FileSinkDesc is serializable, but its member path is not.
+   * Fix it through wrapper.
+   */
+  private[hive] class ShimFileSinkDesc(
+      var dir: String,
+      var tableInfo: TableDesc,
+      var compressed: Boolean)
+    extends Serializable with Logging {
+    var compressCodec: String = _
+    var compressType: String = _
+    var destTableId: Int = _
+
+    def setCompressed(compressed: Boolean) {
+      this.compressed = compressed
+    }
+
+    def getDirName(): String = dir
+
+    def setDestTableId(destTableId: Int) {
+      this.destTableId = destTableId
+    }
+
+    def setTableInfo(tableInfo: TableDesc) {
+      this.tableInfo = tableInfo
+    }
+
+    def setCompressCodec(intermediateCompressorCodec: String) {
+      compressCodec = intermediateCompressorCodec
+    }
+
+    def setCompressType(intermediateCompressType: String) {
+      compressType = intermediateCompressType
+    }
+  }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 294fc3bd7d5e9..334bfccc9d200 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -25,14 +25,13 @@ import org.apache.hadoop.hive.ql.exec.Utilities
 import org.apache.hadoop.hive.ql.metadata.{Partition => HivePartition, Table => HiveTable}
 import org.apache.hadoop.hive.ql.plan.{PlanUtils, TableDesc}
 import org.apache.hadoop.hive.serde2.Deserializer
-import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, StructObjectInspector}
 import org.apache.hadoop.hive.serde2.objectinspector.primitive._
+import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, StructObjectInspector}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf}
 
-import org.apache.spark.SerializableWritable
+import org.apache.spark.{Logging, SerializableWritable}
 import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.Logging
 import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, RDD, UnionRDD}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateUtils
@@ -172,7 +171,7 @@ class HadoopTableReader(
               path.toString + tails
             }
 
-            val partPath = HiveShim.getDataLocationPath(partition)
+            val partPath = partition.getDataLocation
             val partNum = Utilities.getPartitionDesc(partition).getPartSpec.size();
             var pathPatternStr = getPathPatternByPath(partNum, partPath)
             if (!pathPatternSet.contains(pathPatternStr)) {
@@ -187,7 +186,7 @@ class HadoopTableReader(
     val hivePartitionRDDs = verifyPartitionPath(partitionToDeserializer)
       .map { case (partition, partDeserializer) =>
       val partDesc = Utilities.getPartitionDesc(partition)
-      val partPath = HiveShim.getDataLocationPath(partition)
+      val partPath = partition.getDataLocation
       val inputPathStr = applyFilterIfNeeded(partPath, filterOpt)
       val ifc = partDesc.getInputFileFormatClass
         .asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]]
@@ -325,7 +324,7 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging {
     val soi = if (rawDeser.getObjectInspector.equals(tableDeser.getObjectInspector)) {
       rawDeser.getObjectInspector.asInstanceOf[StructObjectInspector]
     } else {
-      HiveShim.getConvertedOI(
+      ObjectInspectorConverters.getConvertedOI(
         rawDeser.getObjectInspector,
         tableDeser.getObjectInspector).asInstanceOf[StructObjectInspector]
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index 8613332186f28..eeb472602be3c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -19,27 +19,25 @@ package org.apache.spark.sql.hive.execution
 
 import java.util
 
-import scala.collection.JavaConversions._
-
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hadoop.hive.metastore.MetaStoreUtils
-import org.apache.hadoop.hive.ql.metadata.Hive
 import org.apache.hadoop.hive.ql.plan.TableDesc
 import org.apache.hadoop.hive.ql.{Context, ErrorMsg}
 import org.apache.hadoop.hive.serde2.Serializer
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption
 import org.apache.hadoop.hive.serde2.objectinspector._
-import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf}
+import org.apache.hadoop.mapred.{FileOutputFormat, JobConf}
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
 import org.apache.spark.sql.execution.{UnaryNode, SparkPlan}
+import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive._
-import org.apache.spark.sql.hive.{ ShimFileSinkDesc => FileSinkDesc}
-import org.apache.spark.sql.hive.HiveShim._
 import org.apache.spark.{SerializableWritable, SparkException, TaskContext}
 
+import scala.collection.JavaConversions._
+
 private[hive]
 case class InsertIntoHiveTable(
     table: MetastoreRelation,
@@ -126,7 +124,7 @@ case class InsertIntoHiveTable(
     // instances within the closure, since Serializer is not serializable while TableDesc is.
     val tableDesc = table.tableDesc
     val tableLocation = table.hiveQlTable.getDataLocation
-    val tmpLocation = HiveShim.getExternalTmpPath(hiveContext, tableLocation)
+    val tmpLocation = hiveContext.getExternalTmpPath(tableLocation.toUri)
     val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false)
     val isCompressed = sc.hiveconf.getBoolean(
       ConfVars.COMPRESSRESULT.varname, ConfVars.COMPRESSRESULT.defaultBoolVal)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index 1658bb93b0b79..01f47352b2313 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -37,6 +37,7 @@ import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.hive.HiveShim._
 import org.apache.spark.sql.types._
 
 /* Implicit conversions */
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
index 2bb526b14be34..ee440e304ec19 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
@@ -35,8 +35,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.sql.Row
 import org.apache.spark.{Logging, SerializableWritable, SparkHadoopWriter}
 import org.apache.spark.sql.catalyst.util.DateUtils
-import org.apache.spark.sql.hive.{ShimFileSinkDesc => FileSinkDesc}
-import org.apache.spark.sql.hive.HiveShim._
+import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.types._
 
 /**
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 58e2d1fbfa73e..af586712e3235 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -561,30 +561,28 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
     }
   }
 
-  if (HiveShim.version == "0.13.1") {
-    test("scan a parquet table created through a CTAS statement") {
-      withSQLConf(
-        "spark.sql.hive.convertMetastoreParquet" -> "true",
-        SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") {
-
-        withTempTable("jt") {
-          (1 to 10).map(i => i -> s"str$i").toDF("a", "b").registerTempTable("jt")
-
-          withTable("test_parquet_ctas") {
-            sql(
-              """CREATE TABLE test_parquet_ctas STORED AS PARQUET
-                |AS SELECT tmp.a FROM jt tmp WHERE tmp.a < 5
-              """.stripMargin)
-
-            checkAnswer(
-              sql(s"SELECT a FROM test_parquet_ctas WHERE a > 2 "),
-              Row(3) :: Row(4) :: Nil)
-
-            table("test_parquet_ctas").queryExecution.optimizedPlan match {
-              case LogicalRelation(p: ParquetRelation2) => // OK
-              case _ =>
-                fail(s"test_parquet_ctas should have be converted to ${classOf[ParquetRelation2]}")
-            }
+  test("scan a parquet table created through a CTAS statement") {
+    withSQLConf(
+      "spark.sql.hive.convertMetastoreParquet" -> "true",
+      SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") {
+
+      withTempTable("jt") {
+        (1 to 10).map(i => i -> s"str$i").toDF("a", "b").registerTempTable("jt")
+
+        withTable("test_parquet_ctas") {
+          sql(
+            """CREATE TABLE test_parquet_ctas STORED AS PARQUET
+              |AS SELECT tmp.a FROM jt tmp WHERE tmp.a < 5
+            """.stripMargin)
+
+          checkAnswer(
+            sql(s"SELECT a FROM test_parquet_ctas WHERE a > 2 "),
+            Row(3) :: Row(4) :: Nil)
+
+          table("test_parquet_ctas").queryExecution.optimizedPlan match {
+            case LogicalRelation(p: ParquetRelation2) => // OK
+            case _ =>
+              fail(s"test_parquet_ctas should have be converted to ${classOf[ParquetRelation2]}")
           }
         }
       }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 00a69de9e4262..e16e530555aee 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -79,10 +79,6 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
     sql("INSERT INTO TABLE analyzeTable SELECT * FROM src").collect()
     sql("INSERT INTO TABLE analyzeTable SELECT * FROM src").collect()
 
-    // TODO: How does it works? needs to add it back for other hive version.
-    if (HiveShim.version =="0.12.0") {
-      assert(queryTotalSize("analyzeTable") === conf.defaultSizeInBytes)
-    }
     sql("ANALYZE TABLE analyzeTable COMPUTE STATISTICS noscan")
 
     assert(queryTotalSize("analyzeTable") === BigInt(11624))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 440b7c87b0da2..6d8d99ebc8164 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -874,15 +874,6 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
           |WITH serdeproperties('s1'='9')
         """.stripMargin)
     }
-    // Now only verify 0.12.0, and ignore other versions due to binary compatibility
-    // current TestSerDe.jar is from 0.12.0
-    if (HiveShim.version == "0.12.0") {
-      sql(s"ADD JAR $testJar")
-      sql(
-        """ALTER TABLE alter1 SET SERDE 'org.apache.hadoop.hive.serde2.TestSerDe'
-          |WITH serdeproperties('s1'='9')
-        """.stripMargin)
-    }
     sql("DROP TABLE alter1")
   }
 
@@ -890,15 +881,13 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
     // this is a test case from mapjoin_addjar.q
     val testJar = TestHive.getHiveFile("hive-hcatalog-core-0.13.1.jar").getCanonicalPath
     val testData = TestHive.getHiveFile("data/files/sample.json").getCanonicalPath
-    if (HiveShim.version == "0.13.1") {
-      sql(s"ADD JAR $testJar")
-      sql(
-        """CREATE TABLE t1(a string, b string)
-        |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'""".stripMargin)
-      sql(s"""LOAD DATA LOCAL INPATH "$testData" INTO TABLE t1""")
-      sql("select * from src join t1 on src.key = t1.a")
-      sql("DROP TABLE t1")
-    }
+    sql(s"ADD JAR $testJar")
+    sql(
+      """CREATE TABLE t1(a string, b string)
+      |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'""".stripMargin)
+    sql(s"""LOAD DATA LOCAL INPATH "$testData" INTO TABLE t1""")
+    sql("select * from src join t1 on src.key = t1.a")
+    sql("DROP TABLE t1")
   }
 
   test("ADD FILE command") {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index aba3becb1bce2..40a35674e4cb8 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
-import org.apache.spark.sql.hive.{HiveQLDialect, HiveShim, MetastoreRelation}
+import org.apache.spark.sql.hive.{HiveQLDialect, MetastoreRelation}
 import org.apache.spark.sql.parquet.ParquetRelation2
 import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.types._
@@ -330,35 +330,33 @@ class SQLQuerySuite extends QueryTest {
       "serde_p1=p1", "serde_p2=p2", "tbl_p1=p11", "tbl_p2=p22", "MANAGED_TABLE"
     )
 
-    if (HiveShim.version =="0.13.1") {
-      val origUseParquetDataSource = conf.parquetUseDataSourceApi
-      try {
-        setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
-        sql(
-          """CREATE TABLE ctas5
-            | STORED AS parquet AS
-            |   SELECT key, value
-            |   FROM src
-            |   ORDER BY key, value""".stripMargin).collect()
-
-        checkExistence(sql("DESC EXTENDED ctas5"), true,
-          "name:key", "type:string", "name:value", "ctas5",
-          "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
-          "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
-          "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
-          "MANAGED_TABLE"
-        )
-
-        val default = getConf("spark.sql.hive.convertMetastoreParquet", "true")
-        // use the Hive SerDe for parquet tables
-        sql("set spark.sql.hive.convertMetastoreParquet = false")
-        checkAnswer(
-          sql("SELECT key, value FROM ctas5 ORDER BY key, value"),
-          sql("SELECT key, value FROM src ORDER BY key, value").collect().toSeq)
-        sql(s"set spark.sql.hive.convertMetastoreParquet = $default")
-      } finally {
-        setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, origUseParquetDataSource.toString)
-      }
+    val origUseParquetDataSource = conf.parquetUseDataSourceApi
+    try {
+      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+      sql(
+        """CREATE TABLE ctas5
+          | STORED AS parquet AS
+          |   SELECT key, value
+          |   FROM src
+          |   ORDER BY key, value""".stripMargin).collect()
+
+      checkExistence(sql("DESC EXTENDED ctas5"), true,
+        "name:key", "type:string", "name:value", "ctas5",
+        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
+        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
+        "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
+        "MANAGED_TABLE"
+      )
+
+      val default = getConf("spark.sql.hive.convertMetastoreParquet", "true")
+      // use the Hive SerDe for parquet tables
+      sql("set spark.sql.hive.convertMetastoreParquet = false")
+      checkAnswer(
+        sql("SELECT key, value FROM ctas5 ORDER BY key, value"),
+        sql("SELECT key, value FROM src ORDER BY key, value").collect().toSeq)
+      sql(s"set spark.sql.hive.convertMetastoreParquet = $default")
+    } finally {
+      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, origUseParquetDataSource.toString)
     }
   }
 
diff --git a/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala b/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala
deleted file mode 100644
index dbc5e029e2047..0000000000000
--- a/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive
-
-import java.rmi.server.UID
-import java.util.{Properties, ArrayList => JArrayList}
-import java.io.{OutputStream, InputStream}
-
-import scala.collection.JavaConversions._
-import scala.language.implicitConversions
-import scala.reflect.ClassTag
-
-import com.esotericsoftware.kryo.Kryo
-import com.esotericsoftware.kryo.io.Input
-import com.esotericsoftware.kryo.io.Output
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.hive.common.StatsSetupConst
-import org.apache.hadoop.hive.common.`type`.HiveDecimal
-import org.apache.hadoop.hive.conf.HiveConf
-import org.apache.hadoop.hive.ql.Context
-import org.apache.hadoop.hive.ql.exec.{UDF, Utilities}
-import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table}
-import org.apache.hadoop.hive.ql.plan.{CreateTableDesc, FileSinkDesc, TableDesc}
-import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory
-import org.apache.hadoop.hive.serde.serdeConstants
-import org.apache.hadoop.hive.serde2.avro.AvroGenericRecordWritable
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.{HiveDecimalObjectInspector, PrimitiveObjectInspectorFactory}
-import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorConverters, PrimitiveObjectInspector}
-import org.apache.hadoop.hive.serde2.typeinfo.{DecimalTypeInfo, TypeInfo, TypeInfoFactory}
-import org.apache.hadoop.hive.serde2.{ColumnProjectionUtils, Deserializer, io => hiveIo}
-import org.apache.hadoop.io.{NullWritable, Writable}
-import org.apache.hadoop.mapred.InputFormat
-import org.apache.hadoop.{io => hadoopIo}
-
-import org.apache.spark.Logging
-import org.apache.spark.sql.types.{Decimal, DecimalType, UTF8String}
-import org.apache.spark.util.Utils._
-
-/**
- * This class provides the UDF creation and also the UDF instance serialization and
- * de-serialization cross process boundary.
- * 
- * Detail discussion can be found at https://github.com/apache/spark/pull/3640
- *
- * @param functionClassName UDF class name
- */
-private[hive] case class HiveFunctionWrapper(var functionClassName: String)
-  extends java.io.Externalizable {
-
-  // for Serialization
-  def this() = this(null)
-
-  @transient
-  def deserializeObjectByKryo[T: ClassTag](
-      kryo: Kryo,
-      in: InputStream,
-      clazz: Class[_]): T = {
-    val inp = new Input(in)
-    val t: T = kryo.readObject(inp,clazz).asInstanceOf[T]
-    inp.close()
-    t
-  }
-
-  @transient
-  def serializeObjectByKryo(
-      kryo: Kryo,
-      plan: Object,
-      out: OutputStream ) {
-    val output: Output = new Output(out)
-    kryo.writeObject(output, plan)
-    output.close()
-  }
-
-  def deserializePlan[UDFType](is: java.io.InputStream, clazz: Class[_]): UDFType = {
-    deserializeObjectByKryo(Utilities.runtimeSerializationKryo.get(), is, clazz)
-      .asInstanceOf[UDFType]
-  }
-
-  def serializePlan(function: AnyRef, out: java.io.OutputStream): Unit = {
-    serializeObjectByKryo(Utilities.runtimeSerializationKryo.get(), function, out)
-  }
-
-  private var instance: AnyRef = null
-
-  def writeExternal(out: java.io.ObjectOutput) {
-    // output the function name
-    out.writeUTF(functionClassName)
-
-    // Write a flag if instance is null or not
-    out.writeBoolean(instance != null)
-    if (instance != null) {
-      // Some of the UDF are serializable, but some others are not
-      // Hive Utilities can handle both cases
-      val baos = new java.io.ByteArrayOutputStream()
-      serializePlan(instance, baos)
-      val functionInBytes = baos.toByteArray
-
-      // output the function bytes
-      out.writeInt(functionInBytes.length)
-      out.write(functionInBytes, 0, functionInBytes.length)
-    }
-  }
-
-  def readExternal(in: java.io.ObjectInput) {
-    // read the function name
-    functionClassName = in.readUTF()
-
-    if (in.readBoolean()) {
-      // if the instance is not null
-      // read the function in bytes
-      val functionInBytesLength = in.readInt()
-      val functionInBytes = new Array[Byte](functionInBytesLength)
-      in.read(functionInBytes, 0, functionInBytesLength)
-
-      // deserialize the function object via Hive Utilities
-      instance = deserializePlan[AnyRef](new java.io.ByteArrayInputStream(functionInBytes),
-        getContextOrSparkClassLoader.loadClass(functionClassName))
-    }
-  }
-
-  def createFunction[UDFType <: AnyRef](): UDFType = {
-    if (instance != null) {
-      instance.asInstanceOf[UDFType]
-    } else {
-      val func = getContextOrSparkClassLoader
-                   .loadClass(functionClassName).newInstance.asInstanceOf[UDFType]
-      if (!func.isInstanceOf[UDF]) {
-        // We cache the function if it's no the Simple UDF,
-        // as we always have to create new instance for Simple UDF
-        instance = func
-      }
-      func
-    }
-  }
-}
-
-/**
- * A compatibility layer for interacting with Hive version 0.13.1.
- */
-private[hive] object HiveShim {
-  val version = "0.13.1"
-
-  def getTableDesc(
-    serdeClass: Class[_ <: Deserializer],
-    inputFormatClass: Class[_ <: InputFormat[_, _]],
-    outputFormatClass: Class[_],
-    properties: Properties) = {
-    new TableDesc(inputFormatClass, outputFormatClass, properties)
-  }
-
-
-  def getStringWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.stringTypeInfo, getStringWritable(value))
-
-  def getIntWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.intTypeInfo, getIntWritable(value))
-
-  def getDoubleWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.doubleTypeInfo, getDoubleWritable(value))
-
-  def getBooleanWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.booleanTypeInfo, getBooleanWritable(value))
-
-  def getLongWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.longTypeInfo, getLongWritable(value))
-
-  def getFloatWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.floatTypeInfo, getFloatWritable(value))
-
-  def getShortWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.shortTypeInfo, getShortWritable(value))
-
-  def getByteWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.byteTypeInfo, getByteWritable(value))
-
-  def getBinaryWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.binaryTypeInfo, getBinaryWritable(value))
-
-  def getDateWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.dateTypeInfo, getDateWritable(value))
-
-  def getTimestampWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.timestampTypeInfo, getTimestampWritable(value))
-
-  def getDecimalWritableConstantObjectInspector(value: Any): ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.decimalTypeInfo, getDecimalWritable(value))
-
-  def getPrimitiveNullWritableConstantObjectInspector: ObjectInspector =
-    PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-      TypeInfoFactory.voidTypeInfo, null)
-
-  def getStringWritable(value: Any): hadoopIo.Text =
-    if (value == null) null else new hadoopIo.Text(value.asInstanceOf[UTF8String].toString)
-
-  def getIntWritable(value: Any): hadoopIo.IntWritable =
-    if (value == null) null else new hadoopIo.IntWritable(value.asInstanceOf[Int])
-
-  def getDoubleWritable(value: Any): hiveIo.DoubleWritable =
-    if (value == null) {
-      null
-    } else {
-      new hiveIo.DoubleWritable(value.asInstanceOf[Double])
-    }
-
-  def getBooleanWritable(value: Any): hadoopIo.BooleanWritable =
-    if (value == null) {
-      null
-    } else {
-      new hadoopIo.BooleanWritable(value.asInstanceOf[Boolean])
-    }
-
-  def getLongWritable(value: Any): hadoopIo.LongWritable =
-    if (value == null) null else new hadoopIo.LongWritable(value.asInstanceOf[Long])
-
-  def getFloatWritable(value: Any): hadoopIo.FloatWritable =
-    if (value == null) {
-      null
-    } else {
-      new hadoopIo.FloatWritable(value.asInstanceOf[Float])
-    }
-
-  def getShortWritable(value: Any): hiveIo.ShortWritable =
-    if (value == null) null else new hiveIo.ShortWritable(value.asInstanceOf[Short])
-
-  def getByteWritable(value: Any): hiveIo.ByteWritable =
-    if (value == null) null else new hiveIo.ByteWritable(value.asInstanceOf[Byte])
-
-  def getBinaryWritable(value: Any): hadoopIo.BytesWritable =
-    if (value == null) {
-      null
-    } else {
-      new hadoopIo.BytesWritable(value.asInstanceOf[Array[Byte]])
-    }
-
-  def getDateWritable(value: Any): hiveIo.DateWritable =
-    if (value == null) null else new hiveIo.DateWritable(value.asInstanceOf[Int])
-
-  def getTimestampWritable(value: Any): hiveIo.TimestampWritable =
-    if (value == null) {
-      null
-    } else {
-      new hiveIo.TimestampWritable(value.asInstanceOf[java.sql.Timestamp])
-    }
-
-  def getDecimalWritable(value: Any): hiveIo.HiveDecimalWritable =
-    if (value == null) {
-      null
-    } else {
-      // TODO precise, scale?
-      new hiveIo.HiveDecimalWritable(
-        HiveShim.createDecimal(value.asInstanceOf[Decimal].toJavaBigDecimal))
-    }
-
-  def getPrimitiveNullWritable: NullWritable = NullWritable.get()
-
-  def createDriverResultsArray = new JArrayList[Object]
-
-  def processResults(results: JArrayList[Object]) = {
-    results.map { r =>
-      r match {
-        case s: String => s
-        case a: Array[Object] => a(0).asInstanceOf[String]
-      }
-    }
-  }
-
-  def getStatsSetupConstTotalSize = StatsSetupConst.TOTAL_SIZE
-
-  def getStatsSetupConstRawDataSize = StatsSetupConst.RAW_DATA_SIZE
-
-  def createDefaultDBIfNeeded(context: HiveContext) = {
-    context.runSqlHive("CREATE DATABASE default")
-    context.runSqlHive("USE default")
-  }
-
-  def getCommandProcessor(cmd: Array[String], conf: HiveConf) = {
-    CommandProcessorFactory.get(cmd, conf)
-  }
-
-  def createDecimal(bd: java.math.BigDecimal): HiveDecimal = {
-    HiveDecimal.create(bd)
-  }
-
-  /*
-   * This function in hive-0.13 become private, but we have to do this to walkaround hive bug
-   */
-  private def appendReadColumnNames(conf: Configuration, cols: Seq[String]) {
-    val old: String = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "")
-    val result: StringBuilder = new StringBuilder(old)
-    var first: Boolean = old.isEmpty
-
-    for (col <- cols) {
-      if (first) {
-        first = false
-      } else {
-        result.append(',')
-      }
-      result.append(col)
-    }
-    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, result.toString)
-  }
-
-  /*
-   * Cannot use ColumnProjectionUtils.appendReadColumns directly, if ids is null or empty
-   */
-  def appendReadColumns(conf: Configuration, ids: Seq[Integer], names: Seq[String]) {
-    if (ids != null && ids.size > 0) {
-      ColumnProjectionUtils.appendReadColumns(conf, ids)
-    }
-    if (names != null && names.size > 0) {
-      appendReadColumnNames(conf, names)
-    }
-  }
-
-  def getExternalTmpPath(context: Context, path: Path) = {
-    context.getExternalTmpPath(path.toUri)
-  }
-
-  def getDataLocationPath(p: Partition) = p.getDataLocation
-
-  def getAllPartitionsOf(client: Hive, tbl: Table) =  client.getAllPartitionsOf(tbl)
-
-  def compatibilityBlackList = Seq()
-
-  def setLocation(tbl: Table, crtTbl: CreateTableDesc): Unit = {
-    tbl.setDataLocation(new Path(crtTbl.getLocation()))
-  }
-
-  /*
-   * Bug introdiced in hive-0.13. FileSinkDesc is serializable, but its member path is not.
-   * Fix it through wrapper.
-   * */
-  implicit def wrapperToFileSinkDesc(w: ShimFileSinkDesc): FileSinkDesc = {
-    var f = new FileSinkDesc(new Path(w.dir), w.tableInfo, w.compressed)
-    f.setCompressCodec(w.compressCodec)
-    f.setCompressType(w.compressType)
-    f.setTableInfo(w.tableInfo)
-    f.setDestTableId(w.destTableId)
-    f
-  }
-
-  // Precision and scale to pass for unlimited decimals; these are the same as the precision and
-  // scale Hive 0.13 infers for BigDecimals from sources that don't specify them (e.g. UDFs)
-  private val UNLIMITED_DECIMAL_PRECISION = 38
-  private val UNLIMITED_DECIMAL_SCALE = 18
-
-  def decimalMetastoreString(decimalType: DecimalType): String = decimalType match {
-    case DecimalType.Fixed(precision, scale) => s"decimal($precision,$scale)"
-    case _ => s"decimal($UNLIMITED_DECIMAL_PRECISION,$UNLIMITED_DECIMAL_SCALE)"
-  }
-
-  def decimalTypeInfo(decimalType: DecimalType): TypeInfo = decimalType match {
-    case DecimalType.Fixed(precision, scale) => new DecimalTypeInfo(precision, scale)
-    case _ => new DecimalTypeInfo(UNLIMITED_DECIMAL_PRECISION, UNLIMITED_DECIMAL_SCALE)
-  }
-
-  def decimalTypeInfoToCatalyst(inspector: PrimitiveObjectInspector): DecimalType = {
-    val info = inspector.getTypeInfo.asInstanceOf[DecimalTypeInfo]
-    DecimalType(info.precision(), info.scale())
-  }
-
-  def toCatalystDecimal(hdoi: HiveDecimalObjectInspector, data: Any): Decimal = {
-    if (hdoi.preferWritable()) {
-      Decimal(hdoi.getPrimitiveWritableObject(data).getHiveDecimal().bigDecimalValue,
-        hdoi.precision(), hdoi.scale())
-    } else {
-      Decimal(hdoi.getPrimitiveJavaObject(data).bigDecimalValue(), hdoi.precision(), hdoi.scale())
-    }
-  }
-
-  def getConvertedOI(inputOI: ObjectInspector, outputOI: ObjectInspector): ObjectInspector = {
-    ObjectInspectorConverters.getConvertedOI(inputOI, outputOI)
-  }
-
-  /*
-   * Bug introduced in hive-0.13. AvroGenericRecordWritable has a member recordReaderID that
-   * is needed to initialize before serialization.
-   */
-  def prepareWritable(w: Writable): Writable = {
-    w match {
-      case w: AvroGenericRecordWritable =>
-        w.setRecordReaderID(new UID())
-      case _ =>
-    }
-    w
-  }
-
-  def setTblNullFormat(crtTbl: CreateTableDesc, tbl: Table) = {
-    if (crtTbl != null && crtTbl.getNullFormat() != null) {
-      tbl.setSerdeParam(serdeConstants.SERIALIZATION_NULL_FORMAT, crtTbl.getNullFormat())
-    }
-  }
-}
-
-/*
- * Bug introduced in hive-0.13. FileSinkDesc is serilizable, but its member path is not.
- * Fix it through wrapper.
- */
-private[hive] class ShimFileSinkDesc(
-    var dir: String,
-    var tableInfo: TableDesc,
-    var compressed: Boolean)
-  extends Serializable with Logging {
-  var compressCodec: String = _
-  var compressType: String = _
-  var destTableId: Int = _
-
-  def setCompressed(compressed: Boolean) {
-    this.compressed = compressed
-  }
-
-  def getDirName = dir
-
-  def setDestTableId(destTableId: Int) {
-    this.destTableId = destTableId
-  }
-
-  def setTableInfo(tableInfo: TableDesc) {
-    this.tableInfo = tableInfo
-  }
-
-  def setCompressCodec(intermediateCompressorCodec: String) {
-    compressCodec = intermediateCompressorCodec
-  }
-
-  def setCompressType(intermediateCompressType: String) {
-    compressType = intermediateCompressType
-  }
-}

From 65938422718383d17f084e577763e2c671726baa Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 4 Jun 2015 13:44:47 -0700
Subject: [PATCH 364/525] Fixed style issues for [SPARK-6909][SQL] Remove Hive
 Shim code.

---
 .../sql/hive/thriftserver/HiveThriftServer2.scala  |  5 +++--
 .../hive/thriftserver/SparkSQLSessionManager.scala | 14 ++++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
index 5b391d3dce882..c9da25253e13f 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark.sql.hive.thriftserver
 
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.commons.logging.LogFactory
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
@@ -32,8 +35,6 @@ import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab
 import org.apache.spark.util.Utils
 import org.apache.spark.{Logging, SparkContext}
 
-import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
 
 /**
  * The main entry point for the Spark SQL port of HiveServer2.  Starts up a `SparkSQLContext` and a
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala
index 357b27f7401a3..2d5ee68002286 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala
@@ -30,6 +30,7 @@ import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
 import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager
 
+
 private[hive] class SparkSQLSessionManager(hiveContext: HiveContext)
   extends SessionManager
   with ReflectedCompositeService {
@@ -50,12 +51,13 @@ private[hive] class SparkSQLSessionManager(hiveContext: HiveContext)
     initCompositeService(hiveConf)
   }
 
-  override def openSession(protocol: TProtocolVersion,
-                           username: String,
-                           passwd: String,
-                           sessionConf: java.util.Map[String, String],
-                           withImpersonation: Boolean,
-                           delegationToken: String): SessionHandle = {
+  override def openSession(
+      protocol: TProtocolVersion,
+      username: String,
+      passwd: String,
+      sessionConf: java.util.Map[String, String],
+      withImpersonation: Boolean,
+      delegationToken: String): SessionHandle = {
     hiveContext.openSession()
     val sessionHandle = super.openSession(
       protocol, username, passwd, sessionConf, withImpersonation, delegationToken)

From 2bcdf8c239d2ba79f64fb8878da83d4c2ec28b30 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 4 Jun 2015 13:52:53 -0700
Subject: [PATCH 365/525] [SPARK-7440][SQL] Remove physical Distinct operator
 in favor of Aggregate

This patch replaces Distinct with Aggregate in the optimizer, so Distinct will become
more efficient over time as we optimize Aggregate (via Tungsten).

Author: Reynold Xin <rxin@databricks.com>

Closes #6637 from rxin/replace-distinct and squashes the following commits:

b3cc50e [Reynold Xin] Mima excludes.
93d6117 [Reynold Xin] Code review feedback.
87e4741 [Reynold Xin] [SPARK-7440][SQL] Remove physical Distinct operator in favor of Aggregate.
---
 project/MimaExcludes.scala                    |  4 +-
 .../sql/catalyst/optimizer/Optimizer.scala    | 14 +++++++
 .../plans/logical/basicOperators.scala        |  3 ++
 .../ReplaceDistinctWithAggregateSuite.scala   | 42 +++++++++++++++++++
 .../org/apache/spark/sql/DataFrame.scala      |  2 +-
 .../spark/sql/execution/SparkStrategies.scala |  4 +-
 .../spark/sql/execution/basicOperators.scala  | 31 --------------
 7 files changed, 65 insertions(+), 35 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceDistinctWithAggregateSuite.scala

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 34371c9659423..73e4bfd78e577 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -46,7 +46,9 @@ object MimaExcludes {
               "org.apache.spark.api.java.JavaRDDLike.partitioner"),
             // Mima false positive (was a private[spark] class)
             ProblemFilters.exclude[MissingClassProblem](
-              "org.apache.spark.util.collection.PairIterator")
+              "org.apache.spark.util.collection.PairIterator"),
+            // SQL execution is considered private.
+            excludePackage("org.apache.spark.sql.execution")
           )
         case v if v.startsWith("1.4") =>
           Seq(
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 5c6379b8d44b0..0a17b10c521e5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -36,6 +36,8 @@ object DefaultOptimizer extends Optimizer {
     // SubQueries are only needed for analysis and can be removed before execution.
     Batch("Remove SubQueries", FixedPoint(100),
       EliminateSubQueries) ::
+    Batch("Distinct", FixedPoint(100),
+      ReplaceDistinctWithAggregate) ::
     Batch("Operator Reordering", FixedPoint(100),
       UnionPushdown,
       CombineFilters,
@@ -696,3 +698,15 @@ object ConvertToLocalRelation extends Rule[LogicalPlan] {
       LocalRelation(projectList.map(_.toAttribute), data.map(projection))
   }
 }
+
+/**
+ * Replaces logical [[Distinct]] operator with an [[Aggregate]] operator.
+ * {{{
+ *   SELECT DISTINCT f1, f2 FROM t  ==>  SELECT f1, f2 FROM t GROUP BY f1, f2
+ * }}}
+ */
+object ReplaceDistinctWithAggregate extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case Distinct(child) => Aggregate(child.output, child.output, child)
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 33a9e55a47dee..e77e5c27b687a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -339,6 +339,9 @@ case class Sample(
   override def output: Seq[Attribute] = child.output
 }
 
+/**
+ * Returns a new logical plan that dedups input rows.
+ */
 case class Distinct(child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceDistinctWithAggregateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceDistinctWithAggregateSuite.scala
new file mode 100644
index 0000000000000..df29a62ff0e15
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceDistinctWithAggregateSuite.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Distinct, LocalRelation, LogicalPlan}
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+
+class ReplaceDistinctWithAggregateSuite extends PlanTest {
+
+  object Optimize extends RuleExecutor[LogicalPlan] {
+    val batches = Batch("ProjectCollapsing", Once, ReplaceDistinctWithAggregate) :: Nil
+  }
+
+  test("replace distinct with aggregate") {
+    val input = LocalRelation('a.int, 'b.int)
+
+    val query = Distinct(input)
+    val optimized = Optimize.execute(query.analyze)
+
+    val correctAnswer = Aggregate(input.output, input.output, input)
+
+    comparePlans(optimized, correctAnswer)
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index d1a54ada7b191..4a224153e1a37 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1311,7 +1311,7 @@ class DataFrame private[sql](
    * @group dfops
    * @since 1.3.0
    */
-  override def distinct: DataFrame = Distinct(logicalPlan)
+  override def distinct: DataFrame = dropDuplicates()
 
   /**
    * @group basic
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index d0a1ad00560d3..7a1331a39151a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -284,8 +284,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case r: RunnableCommand => ExecutedCommand(r) :: Nil
 
       case logical.Distinct(child) =>
-        execution.Distinct(partial = false,
-          execution.Distinct(partial = true, planLater(child))) :: Nil
+        throw new IllegalStateException(
+          "logical distinct operator should have been replaced by aggregate in the optimizer")
       case logical.Repartition(numPartitions, shuffle, child) =>
         execution.Repartition(numPartitions, shuffle, planLater(child)) :: Nil
       case logical.SortPartitions(sortExprs, child) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index a30ade86441ca..fb42072f9d5a7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -230,37 +230,6 @@ case class ExternalSort(
   override def outputOrdering: Seq[SortOrder] = sortOrder
 }
 
-/**
- * :: DeveloperApi ::
- * Computes the set of distinct input rows using a HashSet.
- * @param partial when true the distinct operation is performed partially, per partition, without
- *                shuffling the data.
- * @param child the input query plan.
- */
-@DeveloperApi
-case class Distinct(partial: Boolean, child: SparkPlan) extends UnaryNode {
-  override def output: Seq[Attribute] = child.output
-
-  override def requiredChildDistribution: Seq[Distribution] =
-    if (partial) UnspecifiedDistribution :: Nil else ClusteredDistribution(child.output) :: Nil
-
-  protected override def doExecute(): RDD[Row] = {
-    child.execute().mapPartitions { iter =>
-      val hashSet = new scala.collection.mutable.HashSet[Row]()
-
-      var currentRow: Row = null
-      while (iter.hasNext) {
-        currentRow = iter.next()
-        if (!hashSet.contains(currentRow)) {
-          hashSet.add(currentRow.copy())
-        }
-      }
-
-      hashSet.iterator
-    }
-  }
-}
-
 /**
  * :: DeveloperApi ::
  * Return a new RDD that has exactly `numPartitions` partitions.

From 63bc0c4430680cce230dd7a10d34da0492351446 Mon Sep 17 00:00:00 2001
From: Carson Wang <carson.wang@intel.com>
Date: Thu, 4 Jun 2015 16:24:50 -0700
Subject: [PATCH 366/525] [SPARK-8098] [WEBUI] Show correct length of bytes on
 log page

The log page should only show desired length of bytes. Currently it shows bytes from the startIndex to the end of the file. The "Next" button on the page is always disabled.

Author: Carson Wang <carson.wang@intel.com>

Closes #6640 from carsonwang/logpage and squashes the following commits:

58cb3fd [Carson Wang] Show correct length of bytes on log page
---
 .../main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
index 53f8f9a46cf8d..5a1d06eb87db9 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
@@ -159,7 +159,7 @@ private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with
           offset
         }
       }
-      val endIndex = math.min(startIndex + totalLength, totalLength)
+      val endIndex = math.min(startIndex + byteLength, totalLength)
       logDebug(s"Getting log from $startIndex to $endIndex")
       val logText = Utils.offsetBytes(files, startIndex, endIndex)
       logDebug(s"Got log of length ${logText.length} bytes")

From 74dc2a90bcb05b64c3e7efc02d1451b0cbc2adba Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 4 Jun 2015 17:33:24 -0700
Subject: [PATCH 367/525] [SPARK-8106] [SQL] Set derby.system.durability=test
 to speed up Hive compatibility tests

Derby has a `derby.system.durability` configuration property that can be used to disable I/O synchronization calls for writes. This sacrifices durability but can result in large performance gains, which is appropriate for tests.

We should enable this in our test system properties in order to speed up the Hive compatibility tests. I saw 2-3x speedups locally with this change.

See https://db.apache.org/derby/docs/10.8/ref/rrefproperdurability.html for more documentation of this property.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6651 from JoshRosen/hive-compat-suite-speedup and squashes the following commits:

b7a08a2 [Josh Rosen] Set derby.system.durability=test in our unit tests.
---
 pom.xml                  | 2 ++
 project/SparkBuild.scala | 1 +
 2 files changed, 3 insertions(+)

diff --git a/pom.xml b/pom.xml
index abb9b55400340..e28d4b9fc2b17 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1254,6 +1254,7 @@
               <JAVA_HOME>${test.java.home}</JAVA_HOME>
             </environmentVariables>
             <systemProperties>
+              <derby.system.durability>test</derby.system.durability>
               <java.awt.headless>true</java.awt.headless>
               <spark.test.home>${spark.test.home}</spark.test.home>
               <spark.testing>1</spark.testing>
@@ -1286,6 +1287,7 @@
               <JAVA_HOME>${test.java.home}</JAVA_HOME>
             </environmentVariables>
             <systemProperties>
+              <derby.system.durability>test</derby.system.durability>
               <java.awt.headless>true</java.awt.headless>
               <spark.test.home>${spark.test.home}</spark.test.home>
               <spark.testing>1</spark.testing>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index f65031fe25ac2..ef3a175bac209 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -504,6 +504,7 @@ object TestSettings {
     javaOptions in Test += "-Dspark.driver.allowMultipleContexts=true",
     javaOptions in Test += "-Dspark.unsafe.exceptionOnMemoryLeak=true",
     javaOptions in Test += "-Dsun.io.serialization.extendedDebugInfo=true",
+    javaOptions in Test += "-Dderby.system.durability=test",
     javaOptions in Test ++= System.getProperties.filter(_._1 startsWith "spark")
       .map { case (k,v) => s"-D$k=$v" }.toSeq,
     javaOptions in Test += "-ea",

From 8f16b94afb39e1641c02d4e0be18d34ef7c211cc Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 4 Jun 2015 22:15:58 -0700
Subject: [PATCH 368/525] [SPARK-8114][SQL] Remove some wildcard import on
 TestSQLContext._

I kept some of the sql import there to avoid changing too many lines.

Author: Reynold Xin <rxin@databricks.com>

Closes #6661 from rxin/remove-wildcard-import-sqlcontext and squashes the following commits:

c265347 [Reynold Xin] Fixed ListTablesSuite failure.
de9d491 [Reynold Xin] Fixed tests.
73b5365 [Reynold Xin] Mima.
8f6b642 [Reynold Xin] Fixed style violation.
443f6e8 [Reynold Xin] [SPARK-8113][SQL] Remove some wildcard import on TestSQLContext._
---
 .../sql/catalyst/analysis/Analyzer.scala      |  12 +-
 .../apache/spark/sql/CachedTableSuite.scala   | 160 +++++++++---------
 .../spark/sql/ColumnExpressionSuite.scala     |  15 +-
 .../spark/sql/DataFrameAggregateSuite.scala   |   9 +-
 .../spark/sql/DataFrameFunctionsSuite.scala   |   4 +-
 .../spark/sql/DataFrameImplicitsSuite.scala   |  15 +-
 .../apache/spark/sql/DataFrameJoinSuite.scala |   9 +-
 .../spark/sql/DataFrameNaFunctionsSuite.scala |   5 +-
 .../apache/spark/sql/DataFrameStatSuite.scala |   8 +-
 .../org/apache/spark/sql/DataFrameSuite.scala |  68 ++++----
 .../org/apache/spark/sql/JoinSuite.scala      |  65 +++----
 .../apache/spark/sql/ListTablesSuite.scala    |  35 ++--
 .../spark/sql/MathExpressionsSuite.scala      |  44 +++--
 .../scala/org/apache/spark/sql/RowSuite.scala |   7 +-
 .../org/apache/spark/sql/SQLConfSuite.scala   |  67 ++++----
 .../apache/spark/sql/SQLContextSuite.scala    |  16 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 125 +++++++-------
 .../sql/ScalaReflectionRelationSuite.scala    |  31 ++--
 .../apache/spark/sql/SerializationSuite.scala |   5 +-
 .../scala/org/apache/spark/sql/UDFSuite.scala |  28 ++-
 .../spark/sql/UserDefinedTypeSuite.scala      |  23 ++-
 21 files changed, 373 insertions(+), 378 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index bc17169f35a46..5883d938b676d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -235,9 +235,8 @@ class Analyzer(
   }
 
   /**
-   * Replaces [[UnresolvedAttribute]]s with concrete
-   * [[catalyst.expressions.AttributeReference AttributeReferences]] from a logical plan node's
-   * children.
+   * Replaces [[UnresolvedAttribute]]s with concrete [[AttributeReference]]s from
+   * a logical plan node's children.
    */
   object ResolveReferences extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
@@ -455,7 +454,7 @@ class Analyzer(
   }
 
   /**
-   * Replaces [[UnresolvedFunction]]s with concrete [[catalyst.expressions.Expression Expressions]].
+   * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s.
    */
   object ResolveFunctions extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
@@ -846,9 +845,8 @@ class Analyzer(
 }
 
 /**
- * Removes [[catalyst.plans.logical.Subquery Subquery]] operators from the plan.  Subqueries are
- * only required to provide scoping information for attributes and can be removed once analysis is
- * complete.
+ * Removes [[Subquery]] operators from the plan. Subqueries are only required to provide
+ * scoping information for attributes and can be removed once analysis is complete.
  */
 object EliminateSubQueries extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
index 0772e5e187425..72e60d9aa75cb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
@@ -25,8 +25,6 @@ import org.scalatest.concurrent.Eventually._
 import org.apache.spark.Accumulators
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.columnar._
-import org.apache.spark.sql.test.TestSQLContext._
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.storage.{RDDBlockId, StorageLevel}
 
 case class BigData(s: String)
@@ -34,8 +32,12 @@ case class BigData(s: String)
 class CachedTableSuite extends QueryTest {
   TestData // Load test tables.
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+  import ctx.sql
+
   def rddIdOf(tableName: String): Int = {
-    val executedPlan = table(tableName).queryExecution.executedPlan
+    val executedPlan = ctx.table(tableName).queryExecution.executedPlan
     executedPlan.collect {
       case InMemoryColumnarTableScan(_, _, relation) =>
         relation.cachedColumnBuffers.id
@@ -45,47 +47,47 @@ class CachedTableSuite extends QueryTest {
   }
 
   def isMaterialized(rddId: Int): Boolean = {
-    sparkContext.env.blockManager.get(RDDBlockId(rddId, 0)).nonEmpty
+    ctx.sparkContext.env.blockManager.get(RDDBlockId(rddId, 0)).nonEmpty
   }
 
   test("cache temp table") {
     testData.select('key).registerTempTable("tempTable")
     assertCached(sql("SELECT COUNT(*) FROM tempTable"), 0)
-    cacheTable("tempTable")
+    ctx.cacheTable("tempTable")
     assertCached(sql("SELECT COUNT(*) FROM tempTable"))
-    uncacheTable("tempTable")
+    ctx.uncacheTable("tempTable")
   }
 
   test("unpersist an uncached table will not raise exception") {
-    assert(None == cacheManager.lookupCachedData(testData))
+    assert(None == ctx.cacheManager.lookupCachedData(testData))
     testData.unpersist(blocking = true)
-    assert(None == cacheManager.lookupCachedData(testData))
+    assert(None == ctx.cacheManager.lookupCachedData(testData))
     testData.unpersist(blocking = false)
-    assert(None == cacheManager.lookupCachedData(testData))
+    assert(None == ctx.cacheManager.lookupCachedData(testData))
     testData.persist()
-    assert(None != cacheManager.lookupCachedData(testData))
+    assert(None != ctx.cacheManager.lookupCachedData(testData))
     testData.unpersist(blocking = true)
-    assert(None == cacheManager.lookupCachedData(testData))
+    assert(None == ctx.cacheManager.lookupCachedData(testData))
     testData.unpersist(blocking = false)
-    assert(None == cacheManager.lookupCachedData(testData))
+    assert(None == ctx.cacheManager.lookupCachedData(testData))
   }
 
   test("cache table as select") {
     sql("CACHE TABLE tempTable AS SELECT key FROM testData")
     assertCached(sql("SELECT COUNT(*) FROM tempTable"))
-    uncacheTable("tempTable")
+    ctx.uncacheTable("tempTable")
   }
 
   test("uncaching temp table") {
     testData.select('key).registerTempTable("tempTable1")
     testData.select('key).registerTempTable("tempTable2")
-    cacheTable("tempTable1")
+    ctx.cacheTable("tempTable1")
 
     assertCached(sql("SELECT COUNT(*) FROM tempTable1"))
     assertCached(sql("SELECT COUNT(*) FROM tempTable2"))
 
     // Is this valid?
-    uncacheTable("tempTable2")
+    ctx.uncacheTable("tempTable2")
 
     // Should this be cached?
     assertCached(sql("SELECT COUNT(*) FROM tempTable1"), 0)
@@ -93,103 +95,103 @@ class CachedTableSuite extends QueryTest {
 
   test("too big for memory") {
     val data = "*" * 10000
-    sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).toDF()
+    ctx.sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).toDF()
       .registerTempTable("bigData")
-    table("bigData").persist(StorageLevel.MEMORY_AND_DISK)
-    assert(table("bigData").count() === 200000L)
-    table("bigData").unpersist(blocking = true)
+    ctx.table("bigData").persist(StorageLevel.MEMORY_AND_DISK)
+    assert(ctx.table("bigData").count() === 200000L)
+    ctx.table("bigData").unpersist(blocking = true)
   }
 
   test("calling .cache() should use in-memory columnar caching") {
-    table("testData").cache()
-    assertCached(table("testData"))
-    table("testData").unpersist(blocking = true)
+    ctx.table("testData").cache()
+    assertCached(ctx.table("testData"))
+    ctx.table("testData").unpersist(blocking = true)
   }
 
   test("calling .unpersist() should drop in-memory columnar cache") {
-    table("testData").cache()
-    table("testData").count()
-    table("testData").unpersist(blocking = true)
-    assertCached(table("testData"), 0)
+    ctx.table("testData").cache()
+    ctx.table("testData").count()
+    ctx.table("testData").unpersist(blocking = true)
+    assertCached(ctx.table("testData"), 0)
   }
 
   test("isCached") {
-    cacheTable("testData")
+    ctx.cacheTable("testData")
 
-    assertCached(table("testData"))
-    assert(table("testData").queryExecution.withCachedData match {
+    assertCached(ctx.table("testData"))
+    assert(ctx.table("testData").queryExecution.withCachedData match {
       case _: InMemoryRelation => true
       case _ => false
     })
 
-    uncacheTable("testData")
-    assert(!isCached("testData"))
-    assert(table("testData").queryExecution.withCachedData match {
+    ctx.uncacheTable("testData")
+    assert(!ctx.isCached("testData"))
+    assert(ctx.table("testData").queryExecution.withCachedData match {
       case _: InMemoryRelation => false
       case _ => true
     })
   }
 
   test("SPARK-1669: cacheTable should be idempotent") {
-    assume(!table("testData").logicalPlan.isInstanceOf[InMemoryRelation])
+    assume(!ctx.table("testData").logicalPlan.isInstanceOf[InMemoryRelation])
 
-    cacheTable("testData")
-    assertCached(table("testData"))
+    ctx.cacheTable("testData")
+    assertCached(ctx.table("testData"))
 
     assertResult(1, "InMemoryRelation not found, testData should have been cached") {
-      table("testData").queryExecution.withCachedData.collect {
+      ctx.table("testData").queryExecution.withCachedData.collect {
         case r: InMemoryRelation => r
       }.size
     }
 
-    cacheTable("testData")
+    ctx.cacheTable("testData")
     assertResult(0, "Double InMemoryRelations found, cacheTable() is not idempotent") {
-      table("testData").queryExecution.withCachedData.collect {
+      ctx.table("testData").queryExecution.withCachedData.collect {
         case r @ InMemoryRelation(_, _, _, _, _: InMemoryColumnarTableScan, _) => r
       }.size
     }
 
-    uncacheTable("testData")
+    ctx.uncacheTable("testData")
   }
 
   test("read from cached table and uncache") {
-    cacheTable("testData")
-    checkAnswer(table("testData"), testData.collect().toSeq)
-    assertCached(table("testData"))
+    ctx.cacheTable("testData")
+    checkAnswer(ctx.table("testData"), testData.collect().toSeq)
+    assertCached(ctx.table("testData"))
 
-    uncacheTable("testData")
-    checkAnswer(table("testData"), testData.collect().toSeq)
-    assertCached(table("testData"), 0)
+    ctx.uncacheTable("testData")
+    checkAnswer(ctx.table("testData"), testData.collect().toSeq)
+    assertCached(ctx.table("testData"), 0)
   }
 
   test("correct error on uncache of non-cached table") {
     intercept[IllegalArgumentException] {
-      uncacheTable("testData")
+      ctx.uncacheTable("testData")
     }
   }
 
   test("SELECT star from cached table") {
     sql("SELECT * FROM testData").registerTempTable("selectStar")
-    cacheTable("selectStar")
+    ctx.cacheTable("selectStar")
     checkAnswer(
       sql("SELECT * FROM selectStar WHERE key = 1"),
       Seq(Row(1, "1")))
-    uncacheTable("selectStar")
+    ctx.uncacheTable("selectStar")
   }
 
   test("Self-join cached") {
     val unCachedAnswer =
       sql("SELECT * FROM testData a JOIN testData b ON a.key = b.key").collect()
-    cacheTable("testData")
+    ctx.cacheTable("testData")
     checkAnswer(
       sql("SELECT * FROM testData a JOIN testData b ON a.key = b.key"),
       unCachedAnswer.toSeq)
-    uncacheTable("testData")
+    ctx.uncacheTable("testData")
   }
 
   test("'CACHE TABLE' and 'UNCACHE TABLE' SQL statement") {
     sql("CACHE TABLE testData")
-    assertCached(table("testData"))
+    assertCached(ctx.table("testData"))
 
     val rddId = rddIdOf("testData")
     assert(
@@ -197,7 +199,7 @@ class CachedTableSuite extends QueryTest {
       "Eagerly cached in-memory table should have already been materialized")
 
     sql("UNCACHE TABLE testData")
-    assert(!isCached("testData"), "Table 'testData' should not be cached")
+    assert(!ctx.isCached("testData"), "Table 'testData' should not be cached")
 
     eventually(timeout(10 seconds)) {
       assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted")
@@ -206,14 +208,14 @@ class CachedTableSuite extends QueryTest {
 
   test("CACHE TABLE tableName AS SELECT * FROM anotherTable") {
     sql("CACHE TABLE testCacheTable AS SELECT * FROM testData")
-    assertCached(table("testCacheTable"))
+    assertCached(ctx.table("testCacheTable"))
 
     val rddId = rddIdOf("testCacheTable")
     assert(
       isMaterialized(rddId),
       "Eagerly cached in-memory table should have already been materialized")
 
-    uncacheTable("testCacheTable")
+    ctx.uncacheTable("testCacheTable")
     eventually(timeout(10 seconds)) {
       assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted")
     }
@@ -221,14 +223,14 @@ class CachedTableSuite extends QueryTest {
 
   test("CACHE TABLE tableName AS SELECT ...") {
     sql("CACHE TABLE testCacheTable AS SELECT key FROM testData LIMIT 10")
-    assertCached(table("testCacheTable"))
+    assertCached(ctx.table("testCacheTable"))
 
     val rddId = rddIdOf("testCacheTable")
     assert(
       isMaterialized(rddId),
       "Eagerly cached in-memory table should have already been materialized")
 
-    uncacheTable("testCacheTable")
+    ctx.uncacheTable("testCacheTable")
     eventually(timeout(10 seconds)) {
       assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted")
     }
@@ -236,7 +238,7 @@ class CachedTableSuite extends QueryTest {
 
   test("CACHE LAZY TABLE tableName") {
     sql("CACHE LAZY TABLE testData")
-    assertCached(table("testData"))
+    assertCached(ctx.table("testData"))
 
     val rddId = rddIdOf("testData")
     assert(
@@ -248,7 +250,7 @@ class CachedTableSuite extends QueryTest {
       isMaterialized(rddId),
       "Lazily cached in-memory table should have been materialized")
 
-    uncacheTable("testData")
+    ctx.uncacheTable("testData")
     eventually(timeout(10 seconds)) {
       assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted")
     }
@@ -256,7 +258,7 @@ class CachedTableSuite extends QueryTest {
 
   test("InMemoryRelation statistics") {
     sql("CACHE TABLE testData")
-    table("testData").queryExecution.withCachedData.collect {
+    ctx.table("testData").queryExecution.withCachedData.collect {
       case cached: InMemoryRelation =>
         val actualSizeInBytes = (1 to 100).map(i => INT.defaultSize + i.toString.length + 4).sum
         assert(cached.statistics.sizeInBytes === actualSizeInBytes)
@@ -265,38 +267,38 @@ class CachedTableSuite extends QueryTest {
 
   test("Drops temporary table") {
     testData.select('key).registerTempTable("t1")
-    table("t1")
-    dropTempTable("t1")
-    assert(intercept[RuntimeException](table("t1")).getMessage.startsWith("Table Not Found"))
+    ctx.table("t1")
+    ctx.dropTempTable("t1")
+    assert(intercept[RuntimeException](ctx.table("t1")).getMessage.startsWith("Table Not Found"))
   }
 
   test("Drops cached temporary table") {
     testData.select('key).registerTempTable("t1")
     testData.select('key).registerTempTable("t2")
-    cacheTable("t1")
+    ctx.cacheTable("t1")
 
-    assert(isCached("t1"))
-    assert(isCached("t2"))
+    assert(ctx.isCached("t1"))
+    assert(ctx.isCached("t2"))
 
-    dropTempTable("t1")
-    assert(intercept[RuntimeException](table("t1")).getMessage.startsWith("Table Not Found"))
-    assert(!isCached("t2"))
+    ctx.dropTempTable("t1")
+    assert(intercept[RuntimeException](ctx.table("t1")).getMessage.startsWith("Table Not Found"))
+    assert(!ctx.isCached("t2"))
   }
 
   test("Clear all cache") {
     sql("SELECT key FROM testData LIMIT 10").registerTempTable("t1")
     sql("SELECT key FROM testData LIMIT 5").registerTempTable("t2")
-    cacheTable("t1")
-    cacheTable("t2")
-    clearCache()
-    assert(cacheManager.isEmpty)
+    ctx.cacheTable("t1")
+    ctx.cacheTable("t2")
+    ctx.clearCache()
+    assert(ctx.cacheManager.isEmpty)
 
     sql("SELECT key FROM testData LIMIT 10").registerTempTable("t1")
     sql("SELECT key FROM testData LIMIT 5").registerTempTable("t2")
-    cacheTable("t1")
-    cacheTable("t2")
+    ctx.cacheTable("t1")
+    ctx.cacheTable("t2")
     sql("Clear CACHE")
-    assert(cacheManager.isEmpty)
+    assert(ctx.cacheManager.isEmpty)
   }
 
   test("Clear accumulators when uncacheTable to prevent memory leaking") {
@@ -305,8 +307,8 @@ class CachedTableSuite extends QueryTest {
 
     Accumulators.synchronized {
       val accsSize = Accumulators.originals.size
-      cacheTable("t1")
-      cacheTable("t2")
+      ctx.cacheTable("t1")
+      ctx.cacheTable("t2")
       assert((accsSize + 2) == Accumulators.originals.size)
     }
 
@@ -317,8 +319,8 @@ class CachedTableSuite extends QueryTest {
 
     Accumulators.synchronized {
       val accsSize = Accumulators.originals.size
-      uncacheTable("t1")
-      uncacheTable("t2")
+      ctx.uncacheTable("t1")
+      ctx.uncacheTable("t2")
       assert((accsSize - 2) == Accumulators.originals.size)
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index bfba379d9a518..4f5484f1368d1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -21,13 +21,14 @@ import org.scalatest.Matchers._
 
 import org.apache.spark.sql.execution.Project
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types._
 
 class ColumnExpressionSuite extends QueryTest {
   import org.apache.spark.sql.TestData._
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   test("alias") {
     val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
     assert(df.select(df("a").as("b")).columns.head === "b")
@@ -213,7 +214,7 @@ class ColumnExpressionSuite extends QueryTest {
   }
 
   test("!==") {
-    val nullData = TestSQLContext.createDataFrame(TestSQLContext.sparkContext.parallelize(
+    val nullData = ctx.createDataFrame(ctx.sparkContext.parallelize(
       Row(1, 1) ::
       Row(1, 2) ::
       Row(1, null) ::
@@ -274,7 +275,7 @@ class ColumnExpressionSuite extends QueryTest {
   }
 
   test("between") {
-    val testData = TestSQLContext.sparkContext.parallelize(
+    val testData = ctx.sparkContext.parallelize(
       (0, 1, 2) ::
       (1, 2, 3) ::
       (2, 1, 0) ::
@@ -287,7 +288,7 @@ class ColumnExpressionSuite extends QueryTest {
     checkAnswer(testData.filter($"a".between($"b", $"c")), expectAnswer)
   }
 
-  val booleanData = TestSQLContext.createDataFrame(TestSQLContext.sparkContext.parallelize(
+  val booleanData = ctx.createDataFrame(ctx.sparkContext.parallelize(
     Row(false, false) ::
       Row(false, true) ::
       Row(true, false) ::
@@ -413,7 +414,7 @@ class ColumnExpressionSuite extends QueryTest {
 
   test("monotonicallyIncreasingId") {
     // Make sure we have 2 partitions, each with 2 records.
-    val df = TestSQLContext.sparkContext.parallelize(1 to 2, 2).mapPartitions { iter =>
+    val df = ctx.sparkContext.parallelize(1 to 2, 2).mapPartitions { iter =>
       Iterator(Tuple1(1), Tuple1(2))
     }.toDF("a")
     checkAnswer(
@@ -423,7 +424,7 @@ class ColumnExpressionSuite extends QueryTest {
   }
 
   test("sparkPartitionId") {
-    val df = TestSQLContext.sparkContext.parallelize(1 to 1, 1).map(i => (i, i)).toDF("a", "b")
+    val df = ctx.sparkContext.parallelize(1 to 1, 1).map(i => (i, i)).toDF("a", "b")
     checkAnswer(
       df.select(sparkPartitionId()),
       Row(0)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 232f05c00918f..790b405c72697 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -19,13 +19,14 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types.DecimalType
 
 
 class DataFrameAggregateSuite extends QueryTest {
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   test("groupBy") {
     checkAnswer(
       testData2.groupBy("a").agg(sum($"b")),
@@ -67,12 +68,12 @@ class DataFrameAggregateSuite extends QueryTest {
       Seq(Row(1, 3), Row(2, 3), Row(3, 3))
     )
 
-    TestSQLContext.conf.setConf("spark.sql.retainGroupColumns", "false")
+    ctx.conf.setConf("spark.sql.retainGroupColumns", "false")
     checkAnswer(
       testData2.groupBy("a").agg(sum($"b")),
       Seq(Row(3), Row(3), Row(3))
     )
-    TestSQLContext.conf.setConf("spark.sql.retainGroupColumns", "true")
+    ctx.conf.setConf("spark.sql.retainGroupColumns", "true")
   }
 
   test("agg without groups") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index b1e0faa310b68..53c2befb73702 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types._
 
 /**
@@ -27,6 +26,9 @@ import org.apache.spark.sql.types._
  */
 class DataFrameFunctionsSuite extends QueryTest {
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   test("array with column name") {
     val df = Seq((0, 1)).toDF("a", "b")
     val row = df.select(array("a", "b")).first()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala
index 2d2367d6e7292..fbb30706a4943 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.sql
 
-import org.apache.spark.sql.test.TestSQLContext.{sparkContext => sc}
-import org.apache.spark.sql.test.TestSQLContext.implicits._
-
-
 class DataFrameImplicitsSuite extends QueryTest {
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   test("RDD of tuples") {
     checkAnswer(
-      sc.parallelize(1 to 10).map(i => (i, i.toString)).toDF("intCol", "strCol"),
+      ctx.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("intCol", "strCol"),
       (1 to 10).map(i => Row(i, i.toString)))
   }
 
@@ -37,19 +36,19 @@ class DataFrameImplicitsSuite extends QueryTest {
 
   test("RDD[Int]") {
     checkAnswer(
-      sc.parallelize(1 to 10).toDF("intCol"),
+      ctx.sparkContext.parallelize(1 to 10).toDF("intCol"),
       (1 to 10).map(i => Row(i)))
   }
 
   test("RDD[Long]") {
     checkAnswer(
-      sc.parallelize(1L to 10L).toDF("longCol"),
+      ctx.sparkContext.parallelize(1L to 10L).toDF("longCol"),
       (1L to 10L).map(i => Row(i)))
   }
 
   test("RDD[String]") {
     checkAnswer(
-      sc.parallelize(1 to 10).map(_.toString).toDF("stringCol"),
+      ctx.sparkContext.parallelize(1 to 10).map(_.toString).toDF("stringCol"),
       (1 to 10).map(i => Row(i.toString)))
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
index 787f3f175fea2..051d13e9a544f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
@@ -19,12 +19,12 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.test.TestSQLContext._
-import org.apache.spark.sql.test.TestSQLContext.implicits._
-
 
 class DataFrameJoinSuite extends QueryTest {
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   test("join - join using") {
     val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str")
     val df2 = Seq(1, 2, 3).map(i => (i, (i + 1).toString)).toDF("int", "str")
@@ -49,7 +49,8 @@ class DataFrameJoinSuite extends QueryTest {
 
     checkAnswer(
       df1.join(df2, $"df1.key" === $"df2.key"),
-      sql("SELECT a.key, b.key FROM testData a JOIN testData b ON a.key = b.key").collect().toSeq)
+      ctx.sql("SELECT a.key, b.key FROM testData a JOIN testData b ON a.key = b.key")
+        .collect().toSeq)
   }
 
   test("join - using aliases after self join") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala
index 41b4f02e6a294..495701d4f616c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala
@@ -19,11 +19,12 @@ package org.apache.spark.sql
 
 import scala.collection.JavaConversions._
 
-import org.apache.spark.sql.test.TestSQLContext.implicits._
-
 
 class DataFrameNaFunctionsSuite extends QueryTest {
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   def createDF(): DataFrame = {
     Seq[(String, java.lang.Integer, java.lang.Double)](
       ("Bob", 16, 176.5),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index 438f479459dfe..0d3ff899dad72 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -20,13 +20,13 @@ package org.apache.spark.sql
 import org.scalatest.Matchers._
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 
 class DataFrameStatSuite extends SparkFunSuite  {
 
-  val sqlCtx = TestSQLContext
-  def toLetter(i: Int): String = (i + 97).toChar.toString
+  private val sqlCtx = org.apache.spark.sql.test.TestSQLContext
+  import sqlCtx.implicits._
+
+  private def toLetter(i: Int): String = (i + 97).toChar.toString
 
   test("pearson correlation") {
     val df = Seq.tabulate(10)(i => (i, 2 * i, i * -1.0)).toDF("a", "b", "c")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 8e81dacb8660f..bb8621abe64ad 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -21,17 +21,19 @@ import scala.language.postfixOps
 
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.test.{ExamplePointUDT, ExamplePoint, TestSQLContext}
-import org.apache.spark.sql.test.TestSQLContext.implicits._
+import org.apache.spark.sql.test.{ExamplePointUDT, ExamplePoint}
 
 
 class DataFrameSuite extends QueryTest {
   import org.apache.spark.sql.TestData._
 
+  lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   test("analysis error should be eagerly reported") {
-    val oldSetting = TestSQLContext.conf.dataFrameEagerAnalysis
+    val oldSetting = ctx.conf.dataFrameEagerAnalysis
     // Eager analysis.
-    TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "true")
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "true")
 
     intercept[Exception] { testData.select('nonExistentName) }
     intercept[Exception] {
@@ -45,11 +47,11 @@ class DataFrameSuite extends QueryTest {
     }
 
     // No more eager analysis once the flag is turned off
-    TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "false")
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "false")
     testData.select('nonExistentName)
 
     // Set the flag back to original value before this test.
-    TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString)
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString)
   }
 
   test("dataframe toString") {
@@ -67,12 +69,12 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("invalid plan toString, debug mode") {
-    val oldSetting = TestSQLContext.conf.dataFrameEagerAnalysis
-    TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "true")
+    val oldSetting = ctx.conf.dataFrameEagerAnalysis
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "true")
 
     // Turn on debug mode so we can see invalid query plans.
     import org.apache.spark.sql.execution.debug._
-    TestSQLContext.debug()
+    ctx.debug()
 
     val badPlan = testData.select('badColumn)
 
@@ -81,7 +83,7 @@ class DataFrameSuite extends QueryTest {
         badPlan.toString)
 
     // Set the flag back to original value before this test.
-    TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString)
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString)
   }
 
   test("access complex data") {
@@ -97,8 +99,8 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("empty data frame") {
-    assert(TestSQLContext.emptyDataFrame.columns.toSeq === Seq.empty[String])
-    assert(TestSQLContext.emptyDataFrame.count() === 0)
+    assert(ctx.emptyDataFrame.columns.toSeq === Seq.empty[String])
+    assert(ctx.emptyDataFrame.count() === 0)
   }
 
   test("head and take") {
@@ -311,7 +313,7 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("replace column using withColumn") {
-    val df2 = TestSQLContext.sparkContext.parallelize(Array(1, 2, 3)).toDF("x")
+    val df2 = ctx.sparkContext.parallelize(Array(1, 2, 3)).toDF("x")
     val df3 = df2.withColumn("x", df2("x") + 1)
     checkAnswer(
       df3.select("x"),
@@ -392,7 +394,7 @@ class DataFrameSuite extends QueryTest {
 
   test("randomSplit") {
     val n = 600
-    val data = TestSQLContext.sparkContext.parallelize(1 to n, 2).toDF("id")
+    val data = ctx.sparkContext.parallelize(1 to n, 2).toDF("id")
     for (seed <- 1 to 5) {
       val splits = data.randomSplit(Array[Double](1, 2, 3), seed)
       assert(splits.length == 3, "wrong number of splits")
@@ -487,21 +489,21 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
-    val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
+    val rowRDD = ctx.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
     val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))
-    val df = TestSQLContext.createDataFrame(rowRDD, schema)
+    val df = ctx.createDataFrame(rowRDD, schema)
     df.rdd.collect()
   }
 
   test("SPARK-6899") {
-    val originalValue = TestSQLContext.conf.codegenEnabled
-    TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, "true")
+    val originalValue = ctx.conf.codegenEnabled
+    ctx.setConf(SQLConf.CODEGEN_ENABLED, "true")
     try{
       checkAnswer(
         decimalData.agg(avg('a)),
         Row(new java.math.BigDecimal(2.0)))
     } finally {
-      TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
+      ctx.setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
     }
   }
 
@@ -513,14 +515,14 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("SPARK-7551: support backticks for DataFrame attribute resolution") {
-    val df = TestSQLContext.read.json(TestSQLContext.sparkContext.makeRDD(
+    val df = ctx.read.json(ctx.sparkContext.makeRDD(
       """{"a.b": {"c": {"d..e": {"f": 1}}}}""" :: Nil))
     checkAnswer(
       df.select(df("`a.b`.c.`d..e`.`f`")),
       Row(1)
     )
 
-    val df2 = TestSQLContext.read.json(TestSQLContext.sparkContext.makeRDD(
+    val df2 = ctx.read.json(ctx.sparkContext.makeRDD(
       """{"a  b": {"c": {"d  e": {"f": 1}}}}""" :: Nil))
     checkAnswer(
       df2.select(df2("`a  b`.c.d  e.f")),
@@ -540,7 +542,7 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("SPARK-7324 dropDuplicates") {
-    val testData = TestSQLContext.sparkContext.parallelize(
+    val testData = ctx.sparkContext.parallelize(
       (2, 1, 2) :: (1, 1, 1) ::
       (1, 2, 1) :: (2, 1, 2) ::
       (2, 2, 2) :: (2, 2, 1) ::
@@ -588,49 +590,49 @@ class DataFrameSuite extends QueryTest {
 
   test("SPARK-7150 range api") {
     // numSlice is greater than length
-    val res1 = TestSQLContext.range(0, 10, 1, 15).select("id")
+    val res1 = ctx.range(0, 10, 1, 15).select("id")
     assert(res1.count == 10)
     assert(res1.agg(sum("id")).as("sumid").collect() === Seq(Row(45)))
 
-    val res2 = TestSQLContext.range(3, 15, 3, 2).select("id")
+    val res2 = ctx.range(3, 15, 3, 2).select("id")
     assert(res2.count == 4)
     assert(res2.agg(sum("id")).as("sumid").collect() === Seq(Row(30)))
 
-    val res3 = TestSQLContext.range(1, -2).select("id")
+    val res3 = ctx.range(1, -2).select("id")
     assert(res3.count == 0)
 
     // start is positive, end is negative, step is negative
-    val res4 = TestSQLContext.range(1, -2, -2, 6).select("id")
+    val res4 = ctx.range(1, -2, -2, 6).select("id")
     assert(res4.count == 2)
     assert(res4.agg(sum("id")).as("sumid").collect() === Seq(Row(0)))
 
     // start, end, step are negative
-    val res5 = TestSQLContext.range(-3, -8, -2, 1).select("id")
+    val res5 = ctx.range(-3, -8, -2, 1).select("id")
     assert(res5.count == 3)
     assert(res5.agg(sum("id")).as("sumid").collect() === Seq(Row(-15)))
 
     // start, end are negative, step is positive
-    val res6 = TestSQLContext.range(-8, -4, 2, 1).select("id")
+    val res6 = ctx.range(-8, -4, 2, 1).select("id")
     assert(res6.count == 2)
     assert(res6.agg(sum("id")).as("sumid").collect() === Seq(Row(-14)))
 
-    val res7 = TestSQLContext.range(-10, -9, -20, 1).select("id")
+    val res7 = ctx.range(-10, -9, -20, 1).select("id")
     assert(res7.count == 0)
 
-    val res8 = TestSQLContext.range(Long.MinValue, Long.MaxValue, Long.MaxValue, 100).select("id")
+    val res8 = ctx.range(Long.MinValue, Long.MaxValue, Long.MaxValue, 100).select("id")
     assert(res8.count == 3)
     assert(res8.agg(sum("id")).as("sumid").collect() === Seq(Row(-3)))
 
-    val res9 = TestSQLContext.range(Long.MaxValue, Long.MinValue, Long.MinValue, 100).select("id")
+    val res9 = ctx.range(Long.MaxValue, Long.MinValue, Long.MinValue, 100).select("id")
     assert(res9.count == 2)
     assert(res9.agg(sum("id")).as("sumid").collect() === Seq(Row(Long.MaxValue - 1)))
 
     // only end provided as argument
-    val res10 = TestSQLContext.range(10).select("id")
+    val res10 = ctx.range(10).select("id")
     assert(res10.count == 10)
     assert(res10.agg(sum("id")).as("sumid").collect() === Seq(Row(45)))
 
-    val res11 = TestSQLContext.range(-1).select("id")
+    val res11 = ctx.range(-1).select("id")
     assert(res11.count == 0)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index 407c789657834..ffd26c4f5a7c2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -20,27 +20,28 @@ package org.apache.spark.sql
 import org.scalatest.BeforeAndAfterEach
 
 import org.apache.spark.sql.TestData._
-import org.apache.spark.sql.functions._
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.execution.joins._
-import org.apache.spark.sql.test.TestSQLContext._
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 
 
 class JoinSuite extends QueryTest with BeforeAndAfterEach {
   // Ensures tables are loaded.
   TestData
 
+  lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+  import ctx.logicalPlanToSparkQuery
+
   test("equi-join is hash-join") {
     val x = testData2.as("x")
     val y = testData2.as("y")
     val join = x.join(y, $"x.a" === $"y.a", "inner").queryExecution.optimizedPlan
-    val planned = planner.HashJoin(join)
+    val planned = ctx.planner.HashJoin(join)
     assert(planned.size === 1)
   }
 
   def assertJoin(sqlString: String, c: Class[_]): Any = {
-    val df = sql(sqlString)
+    val df = ctx.sql(sqlString)
     val physical = df.queryExecution.sparkPlan
     val operators = physical.collect {
       case j: ShuffledHashJoin => j
@@ -61,9 +62,9 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
   }
 
   test("join operator selection") {
-    cacheManager.clearCache()
+    ctx.cacheManager.clearCache()
 
-    val SORTMERGEJOIN_ENABLED: Boolean = conf.sortMergeJoinEnabled
+    val SORTMERGEJOIN_ENABLED: Boolean = ctx.conf.sortMergeJoinEnabled
     Seq(
       ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash]),
       ("SELECT * FROM testData LEFT SEMI JOIN testData2", classOf[LeftSemiJoinBNL]),
@@ -94,22 +95,22 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         classOf[BroadcastNestedLoopJoin])
     ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
     try {
-      conf.setConf("spark.sql.planner.sortMergeJoin", "true")
+      ctx.conf.setConf("spark.sql.planner.sortMergeJoin", "true")
       Seq(
         ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[SortMergeJoin]),
         ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[SortMergeJoin]),
         ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[SortMergeJoin])
       ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
     } finally {
-      conf.setConf("spark.sql.planner.sortMergeJoin", SORTMERGEJOIN_ENABLED.toString)
+      ctx.conf.setConf("spark.sql.planner.sortMergeJoin", SORTMERGEJOIN_ENABLED.toString)
     }
   }
 
   test("broadcasted hash join operator selection") {
-    cacheManager.clearCache()
-    sql("CACHE TABLE testData")
+    ctx.cacheManager.clearCache()
+    ctx.sql("CACHE TABLE testData")
 
-    val SORTMERGEJOIN_ENABLED: Boolean = conf.sortMergeJoinEnabled
+    val SORTMERGEJOIN_ENABLED: Boolean = ctx.conf.sortMergeJoinEnabled
     Seq(
       ("SELECT * FROM testData join testData2 ON key = a", classOf[BroadcastHashJoin]),
       ("SELECT * FROM testData join testData2 ON key = a and key = 2", classOf[BroadcastHashJoin]),
@@ -117,7 +118,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         classOf[BroadcastHashJoin])
     ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
     try {
-      conf.setConf("spark.sql.planner.sortMergeJoin", "true")
+      ctx.conf.setConf("spark.sql.planner.sortMergeJoin", "true")
       Seq(
         ("SELECT * FROM testData join testData2 ON key = a", classOf[BroadcastHashJoin]),
         ("SELECT * FROM testData join testData2 ON key = a and key = 2",
@@ -126,17 +127,17 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
           classOf[BroadcastHashJoin])
       ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
     } finally {
-      conf.setConf("spark.sql.planner.sortMergeJoin", SORTMERGEJOIN_ENABLED.toString)
+      ctx.conf.setConf("spark.sql.planner.sortMergeJoin", SORTMERGEJOIN_ENABLED.toString)
     }
 
-    sql("UNCACHE TABLE testData")
+    ctx.sql("UNCACHE TABLE testData")
   }
 
   test("multiple-key equi-join is hash-join") {
     val x = testData2.as("x")
     val y = testData2.as("y")
     val join = x.join(y, ($"x.a" === $"y.a") && ($"x.b" === $"y.b")).queryExecution.optimizedPlan
-    val planned = planner.HashJoin(join)
+    val planned = ctx.planner.HashJoin(join)
     assert(planned.size === 1)
   }
 
@@ -241,7 +242,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
     // Make sure we are choosing left.outputPartitioning as the
     // outputPartitioning for the outer join operator.
     checkAnswer(
-      sql(
+      ctx.sql(
         """
           |SELECT l.N, count(*)
           |FROM upperCaseData l LEFT OUTER JOIN allNulls r ON (l.N = r.a)
@@ -255,7 +256,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         Row(6, 1) :: Nil)
 
     checkAnswer(
-      sql(
+      ctx.sql(
         """
           |SELECT r.a, count(*)
           |FROM upperCaseData l LEFT OUTER JOIN allNulls r ON (l.N = r.a)
@@ -301,7 +302,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
     // Make sure we are choosing right.outputPartitioning as the
     // outputPartitioning for the outer join operator.
     checkAnswer(
-      sql(
+      ctx.sql(
         """
           |SELECT l.a, count(*)
           |FROM allNulls l RIGHT OUTER JOIN upperCaseData r ON (l.a = r.N)
@@ -310,7 +311,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       Row(null, 6))
 
     checkAnswer(
-      sql(
+      ctx.sql(
         """
           |SELECT r.N, count(*)
           |FROM allNulls l RIGHT OUTER JOIN upperCaseData r ON (l.a = r.N)
@@ -362,7 +363,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
 
     // Make sure we are UnknownPartitioning as the outputPartitioning for the outer join operator.
     checkAnswer(
-      sql(
+      ctx.sql(
         """
           |SELECT l.a, count(*)
           |FROM allNulls l FULL OUTER JOIN upperCaseData r ON (l.a = r.N)
@@ -371,7 +372,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       Row(null, 10))
 
     checkAnswer(
-      sql(
+      ctx.sql(
         """
           |SELECT r.N, count(*)
           |FROM allNulls l FULL OUTER JOIN upperCaseData r ON (l.a = r.N)
@@ -386,7 +387,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         Row(null, 4) :: Nil)
 
     checkAnswer(
-      sql(
+      ctx.sql(
         """
           |SELECT l.N, count(*)
           |FROM upperCaseData l FULL OUTER JOIN allNulls r ON (l.N = r.a)
@@ -401,7 +402,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         Row(null, 4) :: Nil)
 
     checkAnswer(
-      sql(
+      ctx.sql(
         """
           |SELECT r.a, count(*)
           |FROM upperCaseData l FULL OUTER JOIN allNulls r ON (l.N = r.a)
@@ -411,11 +412,11 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
   }
 
   test("broadcasted left semi join operator selection") {
-    cacheManager.clearCache()
-    sql("CACHE TABLE testData")
-    val tmp = conf.autoBroadcastJoinThreshold
+    ctx.cacheManager.clearCache()
+    ctx.sql("CACHE TABLE testData")
+    val tmp = ctx.conf.autoBroadcastJoinThreshold
 
-    sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=1000000000")
+    ctx.sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=1000000000")
     Seq(
       ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a",
         classOf[BroadcastLeftSemiJoinHash])
@@ -423,7 +424,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       case (query, joinClass) => assertJoin(query, joinClass)
     }
 
-    sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1")
+    ctx.sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1")
 
     Seq(
       ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash])
@@ -431,12 +432,12 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       case (query, joinClass) => assertJoin(query, joinClass)
     }
 
-    setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, tmp.toString)
-    sql("UNCACHE TABLE testData")
+    ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, tmp.toString)
+    ctx.sql("UNCACHE TABLE testData")
   }
 
   test("left semi join") {
-    val df = sql("SELECT * FROM testData2 LEFT SEMI JOIN testData ON key = a")
+    val df = ctx.sql("SELECT * FROM testData2 LEFT SEMI JOIN testData ON key = a")
     checkAnswer(df,
       Row(1, 1) ::
         Row(1, 2) ::
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala
index 3ce97c3fffdb4..2089660c52bf7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala
@@ -19,49 +19,47 @@ package org.apache.spark.sql
 
 import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext._
 import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType}
 
 class ListTablesSuite extends QueryTest with BeforeAndAfter {
 
-  import org.apache.spark.sql.test.TestSQLContext.implicits._
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
 
-  val df =
-    sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")
+  private lazy val df = (1 to 10).map(i => (i, s"str$i")).toDF("key", "value")
 
   before {
     df.registerTempTable("ListTablesSuiteTable")
   }
 
   after {
-    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
+    ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable"))
   }
 
   test("get all tables") {
     checkAnswer(
-      tables().filter("tableName = 'ListTablesSuiteTable'"),
+      ctx.tables().filter("tableName = 'ListTablesSuiteTable'"),
       Row("ListTablesSuiteTable", true))
 
     checkAnswer(
-      sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"),
+      ctx.sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"),
       Row("ListTablesSuiteTable", true))
 
-    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
-    assert(tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
+    ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable"))
+    assert(ctx.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
   }
 
   test("getting all Tables with a database name has no impact on returned table names") {
     checkAnswer(
-      tables("DB").filter("tableName = 'ListTablesSuiteTable'"),
+      ctx.tables("DB").filter("tableName = 'ListTablesSuiteTable'"),
       Row("ListTablesSuiteTable", true))
 
     checkAnswer(
-      sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"),
+      ctx.sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"),
       Row("ListTablesSuiteTable", true))
 
-    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
-    assert(tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
+    ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable"))
+    assert(ctx.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
   }
 
   test("query the returned DataFrame of tables") {
@@ -69,19 +67,20 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter {
       StructField("tableName", StringType, false) ::
       StructField("isTemporary", BooleanType, false) :: Nil)
 
-    Seq(tables(), sql("SHOW TABLes")).foreach {
+    Seq(ctx.tables(), ctx.sql("SHOW TABLes")).foreach {
       case tableDF =>
         assert(expectedSchema === tableDF.schema)
 
         tableDF.registerTempTable("tables")
         checkAnswer(
-          sql("SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"),
+          ctx.sql(
+            "SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"),
           Row(true, "ListTablesSuiteTable")
         )
         checkAnswer(
-          tables().filter("tableName = 'tables'").select("tableName", "isTemporary"),
+          ctx.tables().filter("tableName = 'tables'").select("tableName", "isTemporary"),
           Row("tables", true))
-        dropTempTable("tables")
+        ctx.dropTempTable("tables")
     }
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index dd68965444f5d..0a38af2b4c889 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -17,36 +17,29 @@
 
 package org.apache.spark.sql
 
-import java.lang.{Double => JavaDouble}
-
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext.implicits._
-
-private[this] object MathExpressionsTestData {
-
-  case class DoubleData(a: JavaDouble, b: JavaDouble)
-  val doubleData = TestSQLContext.sparkContext.parallelize(
-    (1 to 10).map(i => DoubleData(i * 0.2 - 1, i * -0.2 + 1))).toDF()
-
-  val nnDoubleData = TestSQLContext.sparkContext.parallelize(
-    (1 to 10).map(i => DoubleData(i * 0.1, i * -0.1))).toDF()
-
-  case class NullDoubles(a: JavaDouble)
-  val nullDoubles =
-    TestSQLContext.sparkContext.parallelize(
-      NullDoubles(1.0) ::
-        NullDoubles(2.0) ::
-        NullDoubles(3.0) ::
-        NullDoubles(null) :: Nil
-    ).toDF()
+
+
+private object MathExpressionsTestData {
+  case class DoubleData(a: java.lang.Double, b: java.lang.Double)
+  case class NullDoubles(a: java.lang.Double)
 }
 
 class MathExpressionsSuite extends QueryTest {
 
   import MathExpressionsTestData._
 
-  def testOneToOneMathFunction[@specialized(Int, Long, Float, Double) T](
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
+  private lazy val doubleData = (1 to 10).map(i => DoubleData(i * 0.2 - 1, i * -0.2 + 1)).toDF()
+
+  private lazy val nnDoubleData = (1 to 10).map(i => DoubleData(i * 0.1, i * -0.1)).toDF()
+
+  private lazy val nullDoubles =
+    Seq(NullDoubles(1.0), NullDoubles(2.0), NullDoubles(3.0), NullDoubles(null)).toDF()
+
+  private def testOneToOneMathFunction[@specialized(Int, Long, Float, Double) T](
       c: Column => Column,
       f: T => T): Unit = {
     checkAnswer(
@@ -65,7 +58,8 @@ class MathExpressionsSuite extends QueryTest {
     )
   }
 
-  def testOneToOneNonNegativeMathFunction(c: Column => Column, f: Double => Double): Unit = {
+  private def testOneToOneNonNegativeMathFunction(c: Column => Column, f: Double => Double): Unit =
+  {
     checkAnswer(
       nnDoubleData.select(c('a)),
       (1 to 10).map(n => Row(f(n * 0.1)))
@@ -89,7 +83,7 @@ class MathExpressionsSuite extends QueryTest {
     )
   }
 
-  def testTwoToOneMathFunction(
+  private def testTwoToOneMathFunction(
       c: (Column, Column) => Column,
       d: (Column, Double) => Column,
       f: (Double, Double) => Double): Unit = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index 513ac915dcb2a..d84b57af9c882 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -21,12 +21,13 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.execution.SparkSqlSerializer
 
 import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, SpecificMutableRow}
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types._
 
 class RowSuite extends SparkFunSuite {
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   test("create row") {
     val expected = new GenericMutableRow(4)
     expected.update(0, 2147483647)
@@ -56,7 +57,7 @@ class RowSuite extends SparkFunSuite {
 
   test("serialize w/ kryo") {
     val row = Seq((1, Seq(1), Map(1 -> 1), BigDecimal(1))).toDF().first()
-    val serializer = new SparkSqlSerializer(TestSQLContext.sparkContext.getConf)
+    val serializer = new SparkSqlSerializer(ctx.sparkContext.getConf)
     val instance = serializer.newInstance()
     val ser = instance.serialize(row)
     val de = instance.deserialize(ser).asInstanceOf[Row]
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
index 3a5f071e2f7cb..76d0dd1744a41 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
@@ -17,67 +17,64 @@
 
 package org.apache.spark.sql
 
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.test._
-
-/* Implicits */
-import TestSQLContext._
 
 class SQLConfSuite extends QueryTest {
 
-  val testKey = "test.key.0"
-  val testVal = "test.val.0"
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+
+  private val testKey = "test.key.0"
+  private val testVal = "test.val.0"
 
   test("propagate from spark conf") {
     // We create a new context here to avoid order dependence with other tests that might call
     // clear().
-    val newContext = new SQLContext(TestSQLContext.sparkContext)
-    assert(newContext.getConf("spark.sql.testkey", "false") == "true")
+    val newContext = new SQLContext(ctx.sparkContext)
+    assert(newContext.getConf("spark.sql.testkey", "false") === "true")
   }
 
   test("programmatic ways of basic setting and getting") {
-    conf.clear()
-    assert(getAllConfs.size === 0)
+    ctx.conf.clear()
+    assert(ctx.getAllConfs.size === 0)
 
-    setConf(testKey, testVal)
-    assert(getConf(testKey) == testVal)
-    assert(getConf(testKey, testVal + "_") == testVal)
-    assert(getAllConfs.contains(testKey))
+    ctx.setConf(testKey, testVal)
+    assert(ctx.getConf(testKey) === testVal)
+    assert(ctx.getConf(testKey, testVal + "_") === testVal)
+    assert(ctx.getAllConfs.contains(testKey))
 
     // Tests SQLConf as accessed from a SQLContext is mutable after
     // the latter is initialized, unlike SparkConf inside a SparkContext.
-    assert(TestSQLContext.getConf(testKey) == testVal)
-    assert(TestSQLContext.getConf(testKey, testVal + "_") == testVal)
-    assert(TestSQLContext.getAllConfs.contains(testKey))
+    assert(ctx.getConf(testKey) == testVal)
+    assert(ctx.getConf(testKey, testVal + "_") === testVal)
+    assert(ctx.getAllConfs.contains(testKey))
 
-    conf.clear()
+    ctx.conf.clear()
   }
 
   test("parse SQL set commands") {
-    conf.clear()
-    sql(s"set $testKey=$testVal")
-    assert(getConf(testKey, testVal + "_") == testVal)
-    assert(TestSQLContext.getConf(testKey, testVal + "_") == testVal)
+    ctx.conf.clear()
+    ctx.sql(s"set $testKey=$testVal")
+    assert(ctx.getConf(testKey, testVal + "_") === testVal)
+    assert(ctx.getConf(testKey, testVal + "_") === testVal)
 
-    sql("set some.property=20")
-    assert(getConf("some.property", "0") == "20")
-    sql("set some.property = 40")
-    assert(getConf("some.property", "0") == "40")
+    ctx.sql("set some.property=20")
+    assert(ctx.getConf("some.property", "0") === "20")
+    ctx.sql("set some.property = 40")
+    assert(ctx.getConf("some.property", "0") === "40")
 
     val key = "spark.sql.key"
     val vs = "val0,val_1,val2.3,my_table"
-    sql(s"set $key=$vs")
-    assert(getConf(key, "0") == vs)
+    ctx.sql(s"set $key=$vs")
+    assert(ctx.getConf(key, "0") === vs)
 
-    sql(s"set $key=")
-    assert(getConf(key, "0") == "")
+    ctx.sql(s"set $key=")
+    assert(ctx.getConf(key, "0") === "")
 
-    conf.clear()
+    ctx.conf.clear()
   }
 
   test("deprecated property") {
-    conf.clear()
-    sql(s"set ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS}=10")
-    assert(getConf(SQLConf.SHUFFLE_PARTITIONS) == "10")
+    ctx.conf.clear()
+    ctx.sql(s"set ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS}=10")
+    assert(ctx.getConf(SQLConf.SHUFFLE_PARTITIONS) === "10")
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala
index 797d123b48668..c8d8796568a41 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala
@@ -20,31 +20,29 @@ package org.apache.spark.sql
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.test.TestSQLContext
 
 class SQLContextSuite extends SparkFunSuite with BeforeAndAfterAll {
 
-  private val testSqlContext = TestSQLContext
-  private val testSparkContext = TestSQLContext.sparkContext
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
 
   override def afterAll(): Unit = {
-    SQLContext.setLastInstantiatedContext(testSqlContext)
+    SQLContext.setLastInstantiatedContext(ctx)
   }
 
   test("getOrCreate instantiates SQLContext") {
     SQLContext.clearLastInstantiatedContext()
-    val sqlContext = SQLContext.getOrCreate(testSparkContext)
+    val sqlContext = SQLContext.getOrCreate(ctx.sparkContext)
     assert(sqlContext != null, "SQLContext.getOrCreate returned null")
-    assert(SQLContext.getOrCreate(testSparkContext).eq(sqlContext),
+    assert(SQLContext.getOrCreate(ctx.sparkContext).eq(sqlContext),
       "SQLContext created by SQLContext.getOrCreate not returned by SQLContext.getOrCreate")
   }
 
   test("getOrCreate gets last explicitly instantiated SQLContext") {
     SQLContext.clearLastInstantiatedContext()
-    val sqlContext = new SQLContext(testSparkContext)
-    assert(SQLContext.getOrCreate(testSparkContext) != null,
+    val sqlContext = new SQLContext(ctx.sparkContext)
+    assert(SQLContext.getOrCreate(ctx.sparkContext) != null,
       "SQLContext.getOrCreate after explicitly created SQLContext returned null")
-    assert(SQLContext.getOrCreate(testSparkContext).eq(sqlContext),
+    assert(SQLContext.getOrCreate(ctx.sparkContext).eq(sqlContext),
       "SQLContext.getOrCreate after explicitly created SQLContext did not return the context")
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 55b68d8e2283c..5babc4332cc77 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -24,9 +24,7 @@ import org.apache.spark.sql.catalyst.errors.DialectException
 import org.apache.spark.sql.execution.GeneratedAggregate
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.TestData._
-import org.apache.spark.sql.test.{SQLTestUtils, TestSQLContext}
-import org.apache.spark.sql.test.TestSQLContext.{udf => _, _}
-
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 
 /** A SQL Dialect for testing purpose, and it can not be nested type */
@@ -36,8 +34,9 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   // Make sure the tables are loaded.
   TestData
 
-  val sqlContext = TestSQLContext
+  val sqlContext = org.apache.spark.sql.test.TestSQLContext
   import sqlContext.implicits._
+  import sqlContext.sql
 
   test("SPARK-6743: no columns from cache") {
     Seq(
@@ -46,7 +45,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       (43, 81, 24)
     ).toDF("a", "b", "c").registerTempTable("cachedData")
 
-    cacheTable("cachedData")
+    sqlContext.cacheTable("cachedData")
     checkAnswer(
       sql("SELECT t1.b FROM cachedData, cachedData t1 GROUP BY t1.b"),
       Row(0) :: Row(81) :: Nil)
@@ -94,14 +93,14 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SQL Dialect Switching to a new SQL parser") {
-    val newContext = new SQLContext(TestSQLContext.sparkContext)
+    val newContext = new SQLContext(sqlContext.sparkContext)
     newContext.setConf("spark.sql.dialect", classOf[MyDialect].getCanonicalName())
     assert(newContext.getSQLDialect().getClass === classOf[MyDialect])
     assert(newContext.sql("SELECT 1").collect() === Array(Row(1)))
   }
 
   test("SQL Dialect Switch to an invalid parser with alias") {
-    val newContext = new SQLContext(TestSQLContext.sparkContext)
+    val newContext = new SQLContext(sqlContext.sparkContext)
     newContext.sql("SET spark.sql.dialect=MyTestClass")
     intercept[DialectException] {
       newContext.sql("SELECT 1")
@@ -118,7 +117,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("grouping on nested fields") {
-    read.json(sparkContext.parallelize("""{"nested": {"attribute": 1}, "value": 2}""" :: Nil))
+    sqlContext.read.json(sqlContext.sparkContext.parallelize(
+      """{"nested": {"attribute": 1}, "value": 2}""" :: Nil))
      .registerTempTable("rows")
 
     checkAnswer(
@@ -135,8 +135,9 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SPARK-6201 IN type conversion") {
-    read.json(
-      sparkContext.parallelize(Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}")))
+    sqlContext.read.json(
+      sqlContext.sparkContext.parallelize(
+        Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}")))
       .registerTempTable("d")
 
     checkAnswer(
@@ -157,12 +158,12 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("aggregation with codegen") {
-    val originalValue = conf.codegenEnabled
-    setConf(SQLConf.CODEGEN_ENABLED, "true")
+    val originalValue = sqlContext.conf.codegenEnabled
+    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, "true")
     // Prepare a table that we can group some rows.
-    table("testData")
-      .unionAll(table("testData"))
-      .unionAll(table("testData"))
+    sqlContext.table("testData")
+      .unionAll(sqlContext.table("testData"))
+      .unionAll(sqlContext.table("testData"))
       .registerTempTable("testData3x")
 
     def testCodeGen(sqlText: String, expectedResults: Seq[Row]): Unit = {
@@ -254,8 +255,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
         "SELECT  sum('a'), avg('a'), count(null) FROM testData",
         Row(0, null, 0) :: Nil)
     } finally {
-      dropTempTable("testData3x")
-      setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
+      sqlContext.dropTempTable("testData3x")
+      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
     }
   }
 
@@ -447,42 +448,42 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("sorting") {
-    val before = conf.externalSortEnabled
-    setConf(SQLConf.EXTERNAL_SORT, "false")
+    val before = sqlContext.conf.externalSortEnabled
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, "false")
     sortTest()
-    setConf(SQLConf.EXTERNAL_SORT, before.toString)
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, before.toString)
   }
 
   test("external sorting") {
-    val before = conf.externalSortEnabled
-    setConf(SQLConf.EXTERNAL_SORT, "true")
+    val before = sqlContext.conf.externalSortEnabled
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, "true")
     sortTest()
-    setConf(SQLConf.EXTERNAL_SORT, before.toString)
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, before.toString)
   }
 
   test("SPARK-6927 sorting with codegen on") {
-    val externalbefore = conf.externalSortEnabled
-    val codegenbefore = conf.codegenEnabled
-    setConf(SQLConf.EXTERNAL_SORT, "false")
-    setConf(SQLConf.CODEGEN_ENABLED, "true")
+    val externalbefore = sqlContext.conf.externalSortEnabled
+    val codegenbefore = sqlContext.conf.codegenEnabled
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, "false")
+    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, "true")
     try{
       sortTest()
     } finally {
-      setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
-      setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
+      sqlContext.setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
+      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
     }
   }
 
   test("SPARK-6927 external sorting with codegen on") {
-    val externalbefore = conf.externalSortEnabled
-    val codegenbefore = conf.codegenEnabled
-    setConf(SQLConf.CODEGEN_ENABLED, "true")
-    setConf(SQLConf.EXTERNAL_SORT, "true")
+    val externalbefore = sqlContext.conf.externalSortEnabled
+    val codegenbefore = sqlContext.conf.codegenEnabled
+    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, "true")
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, "true")
     try {
       sortTest()
     } finally {
-      setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
-      setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
+      sqlContext.setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
+      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
     }
   }
 
@@ -516,7 +517,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
 
   test("Allow only a single WITH clause per query") {
     intercept[RuntimeException] {
-      sql("with q1 as (select * from testData) with q2 as (select * from q1) select * from q2")
+      sql(
+        "with q1 as (select * from testData) with q2 as (select * from q1) select * from q2")
     }
   }
 
@@ -863,7 +865,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SET commands semantics using sql()") {
-    conf.clear()
+    sqlContext.conf.clear()
     val testKey = "test.key.0"
     val testVal = "test.val.0"
     val nonexistentKey = "nonexistent"
@@ -895,17 +897,17 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       sql(s"SET $nonexistentKey"),
       Row(s"$nonexistentKey=<undefined>")
     )
-    conf.clear()
+    sqlContext.conf.clear()
   }
 
   test("SET commands with illegal or inappropriate argument") {
-    conf.clear()
+    sqlContext.conf.clear()
     // Set negative mapred.reduce.tasks for automatically determing
     // the number of reducers is not supported
     intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-1"))
     intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-01"))
     intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-2"))
-    conf.clear()
+    sqlContext.conf.clear()
   }
 
   test("apply schema") {
@@ -923,7 +925,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Row(values(0).toInt, values(1), values(2).toBoolean, v4)
     }
 
-    val df1 = createDataFrame(rowRDD1, schema1)
+    val df1 = sqlContext.createDataFrame(rowRDD1, schema1)
     df1.registerTempTable("applySchema1")
     checkAnswer(
       sql("SELECT * FROM applySchema1"),
@@ -953,7 +955,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4))
     }
 
-    val df2 = createDataFrame(rowRDD2, schema2)
+    val df2 = sqlContext.createDataFrame(rowRDD2, schema2)
     df2.registerTempTable("applySchema2")
     checkAnswer(
       sql("SELECT * FROM applySchema2"),
@@ -978,7 +980,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Row(Row(values(0).toInt, values(2).toBoolean), scala.collection.mutable.Map(values(1) -> v4))
     }
 
-    val df3 = createDataFrame(rowRDD3, schema2)
+    val df3 = sqlContext.createDataFrame(rowRDD3, schema2)
     df3.registerTempTable("applySchema3")
 
     checkAnswer(
@@ -1023,7 +1025,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       .build()
     val schemaWithMeta = new StructType(Array(
       schema("id"), schema("name").copy(metadata = metadata), schema("age")))
-    val personWithMeta = createDataFrame(person.rdd, schemaWithMeta)
+    val personWithMeta = sqlContext.createDataFrame(person.rdd, schemaWithMeta)
     def validateMetadata(rdd: DataFrame): Unit = {
       assert(rdd.schema("name").metadata.getString(docKey) == docValue)
     }
@@ -1038,7 +1040,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SPARK-3371 Renaming a function expression with group by gives error") {
-    TestSQLContext.udf.register("len", (s: String) => s.length)
+    sqlContext.udf.register("len", (s: String) => s.length)
     checkAnswer(
       sql("SELECT len(value) as temp FROM testData WHERE key = 1 group by len(value)"),
       Row(1))
@@ -1219,9 +1221,9 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SPARK-3483 Special chars in column names") {
-    val data = sparkContext.parallelize(
+    val data = sqlContext.sparkContext.parallelize(
       Seq("""{"key?number1": "value1", "key.number2": "value2"}"""))
-    read.json(data).registerTempTable("records")
+    sqlContext.read.json(data).registerTempTable("records")
     sql("SELECT `key?number1`, `key.number2` FROM records")
   }
 
@@ -1262,13 +1264,15 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SPARK-4322 Grouping field with struct field as sub expression") {
-    read.json(sparkContext.makeRDD("""{"a": {"b": [{"c": 1}]}}""" :: Nil)).registerTempTable("data")
+    sqlContext.read.json(sqlContext.sparkContext.makeRDD("""{"a": {"b": [{"c": 1}]}}""" :: Nil))
+      .registerTempTable("data")
     checkAnswer(sql("SELECT a.b[0].c FROM data GROUP BY a.b[0].c"), Row(1))
-    dropTempTable("data")
+    sqlContext.dropTempTable("data")
 
-    read.json(sparkContext.makeRDD("""{"a": {"b": 1}}""" :: Nil)).registerTempTable("data")
+    sqlContext.read.json(
+      sqlContext.sparkContext.makeRDD("""{"a": {"b": 1}}""" :: Nil)).registerTempTable("data")
     checkAnswer(sql("SELECT a.b + 1 FROM data GROUP BY a.b + 1"), Row(2))
-    dropTempTable("data")
+    sqlContext.dropTempTable("data")
   }
 
   test("SPARK-4432 Fix attribute reference resolution error when using ORDER BY") {
@@ -1287,10 +1291,10 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
 
   test("Supporting relational operator '<=>' in Spark SQL") {
     val nullCheckData1 = TestData(1, "1") :: TestData(2, null) :: Nil
-    val rdd1 = sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i)))
+    val rdd1 = sqlContext.sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i)))
     rdd1.toDF().registerTempTable("nulldata1")
     val nullCheckData2 = TestData(1, "1") :: TestData(2, null) :: Nil
-    val rdd2 = sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i)))
+    val rdd2 = sqlContext.sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i)))
     rdd2.toDF().registerTempTable("nulldata2")
     checkAnswer(sql("SELECT nulldata1.key FROM nulldata1 join " +
       "nulldata2 on nulldata1.value <=> nulldata2.value"),
@@ -1299,22 +1303,23 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
 
   test("Multi-column COUNT(DISTINCT ...)") {
     val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil
-    val rdd = sparkContext.parallelize((0 to 1).map(i => data(i)))
+    val rdd = sqlContext.sparkContext.parallelize((0 to 1).map(i => data(i)))
     rdd.toDF().registerTempTable("distinctData")
     checkAnswer(sql("SELECT COUNT(DISTINCT key,value) FROM distinctData"), Row(2))
   }
 
   test("SPARK-4699 case sensitivity SQL query") {
-    setConf(SQLConf.CASE_SENSITIVE, "false")
+    sqlContext.setConf(SQLConf.CASE_SENSITIVE, "false")
     val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil
-    val rdd = sparkContext.parallelize((0 to 1).map(i => data(i)))
+    val rdd = sqlContext.sparkContext.parallelize((0 to 1).map(i => data(i)))
     rdd.toDF().registerTempTable("testTable1")
     checkAnswer(sql("SELECT VALUE FROM TESTTABLE1 where KEY = 1"), Row("val_1"))
-    setConf(SQLConf.CASE_SENSITIVE, "true")
+    sqlContext.setConf(SQLConf.CASE_SENSITIVE, "true")
   }
 
   test("SPARK-6145: ORDER BY test for nested fields") {
-    read.json(sparkContext.makeRDD("""{"a": {"b": 1, "a": {"a": 1}}, "c": [{"d": 1}]}""" :: Nil))
+    sqlContext.read.json(sqlContext.sparkContext.makeRDD(
+        """{"a": {"b": 1, "a": {"a": 1}}, "c": [{"d": 1}]}""" :: Nil))
       .registerTempTable("nestedOrder")
 
     checkAnswer(sql("SELECT 1 FROM nestedOrder ORDER BY a.b"), Row(1))
@@ -1326,14 +1331,14 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SPARK-6145: special cases") {
-    read.json(sparkContext.makeRDD(
+    sqlContext.read.json(sqlContext.sparkContext.makeRDD(
       """{"a": {"b": [1]}, "b": [{"a": 1}], "c0": {"a": 1}}""" :: Nil)).registerTempTable("t")
     checkAnswer(sql("SELECT a.b[0] FROM t ORDER BY c0.a"), Row(1))
     checkAnswer(sql("SELECT b[0].a FROM t ORDER BY c0.a"), Row(1))
   }
 
   test("SPARK-6898: complete support for special chars in column names") {
-    read.json(sparkContext.makeRDD(
+    sqlContext.read.json(sqlContext.sparkContext.makeRDD(
       """{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""" :: Nil))
       .registerTempTable("t")
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
index d2ede39f0a5f6..ece3d6fdf2af5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
@@ -21,7 +21,6 @@ import java.sql.{Date, Timestamp}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.test.TestSQLContext._
 
 case class ReflectData(
     stringField: String,
@@ -75,15 +74,15 @@ case class ComplexReflectData(
 
 class ScalaReflectionRelationSuite extends SparkFunSuite {
 
-  import org.apache.spark.sql.test.TestSQLContext.implicits._
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
 
   test("query case class RDD") {
     val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true,
       new java.math.BigDecimal(1), new Date(12345), new Timestamp(12345), Seq(1, 2, 3))
-    val rdd = sparkContext.parallelize(data :: Nil)
-    rdd.toDF().registerTempTable("reflectData")
+    Seq(data).toDF().registerTempTable("reflectData")
 
-    assert(sql("SELECT * FROM reflectData").collect().head ===
+    assert(ctx.sql("SELECT * FROM reflectData").collect().head ===
       Row("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true,
         new java.math.BigDecimal(1), Date.valueOf("1970-01-01"),
         new Timestamp(12345), Seq(1, 2, 3)))
@@ -91,27 +90,26 @@ class ScalaReflectionRelationSuite extends SparkFunSuite {
 
   test("query case class RDD with nulls") {
     val data = NullReflectData(null, null, null, null, null, null, null)
-    val rdd = sparkContext.parallelize(data :: Nil)
-    rdd.toDF().registerTempTable("reflectNullData")
+    Seq(data).toDF().registerTempTable("reflectNullData")
 
-    assert(sql("SELECT * FROM reflectNullData").collect().head === Row.fromSeq(Seq.fill(7)(null)))
+    assert(ctx.sql("SELECT * FROM reflectNullData").collect().head ===
+      Row.fromSeq(Seq.fill(7)(null)))
   }
 
   test("query case class RDD with Nones") {
     val data = OptionalReflectData(None, None, None, None, None, None, None)
-    val rdd = sparkContext.parallelize(data :: Nil)
-    rdd.toDF().registerTempTable("reflectOptionalData")
+    Seq(data).toDF().registerTempTable("reflectOptionalData")
 
-    assert(sql("SELECT * FROM reflectOptionalData").collect().head ===
+    assert(ctx.sql("SELECT * FROM reflectOptionalData").collect().head ===
       Row.fromSeq(Seq.fill(7)(null)))
   }
 
   // Equality is broken for Arrays, so we test that separately.
   test("query binary data") {
-    val rdd = sparkContext.parallelize(ReflectBinary(Array[Byte](1)) :: Nil)
-    rdd.toDF().registerTempTable("reflectBinary")
+    Seq(ReflectBinary(Array[Byte](1))).toDF().registerTempTable("reflectBinary")
 
-    val result = sql("SELECT data FROM reflectBinary").collect().head(0).asInstanceOf[Array[Byte]]
+    val result = ctx.sql("SELECT data FROM reflectBinary")
+      .collect().head(0).asInstanceOf[Array[Byte]]
     assert(result.toSeq === Seq[Byte](1))
   }
 
@@ -127,10 +125,9 @@ class ScalaReflectionRelationSuite extends SparkFunSuite {
         Map(10 -> 100L, 20 -> 200L),
         Map(10 -> Some(100L), 20 -> Some(200L), 30 -> None),
         Nested(None, "abc")))
-    val rdd = sparkContext.parallelize(data :: Nil)
-    rdd.toDF().registerTempTable("reflectComplexData")
 
-    assert(sql("SELECT * FROM reflectComplexData").collect().head ===
+    Seq(data).toDF().registerTempTable("reflectComplexData")
+    assert(ctx.sql("SELECT * FROM reflectComplexData").collect().head ===
       new GenericRow(Array[Any](
         Seq(1, 2, 3),
         Seq(1, 2, null),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala
index 1e8cde606b67b..e55c9e460b791 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala
@@ -19,12 +19,13 @@ package org.apache.spark.sql
 
 import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.JavaSerializer
-import org.apache.spark.sql.test.TestSQLContext
 
 class SerializationSuite extends SparkFunSuite {
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+
   test("[SPARK-5235] SQLContext should be serializable") {
-    val sqlContext = new SQLContext(TestSQLContext.sparkContext)
+    val sqlContext = new SQLContext(ctx.sparkContext)
     new JavaSerializer(new SparkConf()).newInstance().serialize(sqlContext)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index 1a9ba66416b21..064c040d2b771 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -17,43 +17,41 @@
 
 package org.apache.spark.sql
 
-import org.apache.spark.sql.test._
-
-/* Implicits */
-import TestSQLContext._
-import TestSQLContext.implicits._
 
 case class FunctionResult(f1: String, f2: String)
 
 class UDFSuite extends QueryTest {
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
   test("Simple UDF") {
-    udf.register("strLenScala", (_: String).length)
-    assert(sql("SELECT strLenScala('test')").head().getInt(0) === 4)
+    ctx.udf.register("strLenScala", (_: String).length)
+    assert(ctx.sql("SELECT strLenScala('test')").head().getInt(0) === 4)
   }
 
   test("ZeroArgument UDF") {
-    udf.register("random0", () => { Math.random()})
-    assert(sql("SELECT random0()").head().getDouble(0) >= 0.0)
+    ctx.udf.register("random0", () => { Math.random()})
+    assert(ctx.sql("SELECT random0()").head().getDouble(0) >= 0.0)
   }
 
   test("TwoArgument UDF") {
-    udf.register("strLenScala", (_: String).length + (_: Int))
-    assert(sql("SELECT strLenScala('test', 1)").head().getInt(0) === 5)
+    ctx.udf.register("strLenScala", (_: String).length + (_: Int))
+    assert(ctx.sql("SELECT strLenScala('test', 1)").head().getInt(0) === 5)
   }
 
   test("struct UDF") {
-    udf.register("returnStruct", (f1: String, f2: String) => FunctionResult(f1, f2))
+    ctx.udf.register("returnStruct", (f1: String, f2: String) => FunctionResult(f1, f2))
 
     val result =
-      sql("SELECT returnStruct('test', 'test2') as ret")
+      ctx.sql("SELECT returnStruct('test', 'test2') as ret")
         .select($"ret.f1").head().getString(0)
     assert(result === "test")
   }
 
   test("udf that is transformed") {
-    udf.register("makeStruct", (x: Int, y: Int) => (x, y))
+    ctx.udf.register("makeStruct", (x: Int, y: Int) => (x, y))
     // 1 + 1 is constant folded causing a transformation.
-    assert(sql("SELECT makeStruct(1 + 1, 2)").first().getAs[Row](0) === Row(2, 2))
+    assert(ctx.sql("SELECT makeStruct(1 + 1, 2)").first().getAs[Row](0) === Row(2, 2))
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
index dc2d43a197f40..45c9f06941c10 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
@@ -17,10 +17,6 @@
 
 package org.apache.spark.sql
 
-import java.io.File
-
-import org.apache.spark.util.Utils
-
 import scala.beans.{BeanInfo, BeanProperty}
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLog
@@ -28,12 +24,11 @@ import com.clearspring.analytics.stream.cardinality.HyperLogLog
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions.{OpenHashSetUDT, HyperLogLogUDT}
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext.{sparkContext, sql}
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
 import org.apache.spark.util.collection.OpenHashSet
 
+
 @SQLUserDefinedType(udt = classOf[MyDenseVectorUDT])
 private[sql] class MyDenseVector(val data: Array[Double]) extends Serializable {
   override def equals(other: Any): Boolean = other match {
@@ -72,11 +67,13 @@ private[sql] class MyDenseVectorUDT extends UserDefinedType[MyDenseVector] {
 }
 
 class UserDefinedTypeSuite extends QueryTest {
-  val points = Seq(
-    MyLabeledPoint(1.0, new MyDenseVector(Array(0.1, 1.0))),
-    MyLabeledPoint(0.0, new MyDenseVector(Array(0.2, 2.0))))
-  val pointsRDD = sparkContext.parallelize(points).toDF()
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
+  private lazy val pointsRDD = Seq(
+    MyLabeledPoint(1.0, new MyDenseVector(Array(0.1, 1.0))),
+    MyLabeledPoint(0.0, new MyDenseVector(Array(0.2, 2.0)))).toDF()
 
   test("register user type: MyDenseVector for MyLabeledPoint") {
     val labels: RDD[Double] = pointsRDD.select('label).rdd.map { case Row(v: Double) => v }
@@ -94,10 +91,10 @@ class UserDefinedTypeSuite extends QueryTest {
   }
 
   test("UDTs and UDFs") {
-    TestSQLContext.udf.register("testType", (d: MyDenseVector) => d.isInstanceOf[MyDenseVector])
+    ctx.udf.register("testType", (d: MyDenseVector) => d.isInstanceOf[MyDenseVector])
     pointsRDD.registerTempTable("points")
     checkAnswer(
-      sql("SELECT testType(features) from points"),
+      ctx.sql("SELECT testType(features) from points"),
       Seq(Row(true), Row(true)))
   }
 

From e5054605994b8777e629c02fcbf8a5a6cbd0b0fe Mon Sep 17 00:00:00 2001
From: Ted Blackman <ted.blackman@gmail.com>
Date: Thu, 4 Jun 2015 22:21:11 -0700
Subject: [PATCH 369/525] [SPARK-8116][PYSPARK] Allow sc.range() to take a
 single argument.

Author: Ted Blackman <ted.blackman@gmail.com>

Closes #6656 from belisarius222/branch-1.4 and squashes the following commits:

747cbc2 [Ted Blackman] [SPARK-8116][PYSPARK] Allow sc.range() to take a single argument.

(cherry picked from commit f02af7c8f7f43e4cfe3c412d2b5ea4128669ce22)
Signed-off-by: Reynold Xin <rxin@databricks.com>
---
 python/pyspark/context.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index aeb7ad4f2f83e..44d90f1437bc9 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -324,10 +324,12 @@ def stop(self):
         with SparkContext._lock:
             SparkContext._active_spark_context = None
 
-    def range(self, start, end, step=1, numSlices=None):
+    def range(self, start, end=None, step=1, numSlices=None):
         """
         Create a new RDD of int containing elements from `start` to `end`
-        (exclusive), increased by `step` every element.
+        (exclusive), increased by `step` every element. Can be called the same
+        way as python's built-in range() function. If called with a single argument,
+        the argument is interpreted as `end`, and `start` is set to 0.
 
         :param start: the start value
         :param end: the end value (exclusive)
@@ -335,9 +337,17 @@ def range(self, start, end, step=1, numSlices=None):
         :param numSlices: the number of partitions of the new RDD
         :return: An RDD of int
 
+        >>> sc.range(5).collect()
+        [0, 1, 2, 3, 4]
+        >>> sc.range(2, 4).collect()
+        [2, 3]
         >>> sc.range(1, 7, 2).collect()
         [1, 3, 5]
         """
+        if end is None:
+            end = start
+            start = 0
+
         return self.parallelize(xrange(start, end, step), numSlices)
 
     def parallelize(self, c, numSlices=None):

From 2777ed3948d26b14e342ba161e145009e31b8829 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Fri, 5 Jun 2015 07:45:25 +0200
Subject: [PATCH 370/525] [DOC][Minor]Specify the common sources available for
 collecting

I was wondering what else common sources available until search the source code. Maybe better to make this clear.

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #6641 from yijieshen/patch-1 and squashes the following commits:

b5b99b4 [Yijie Shen] Make it clear that JvmSource is the only available additional source currently
f23140c [Yijie Shen] [DOC][Minor]Specify the common sources available for collecting
---
 conf/metrics.properties.template | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/conf/metrics.properties.template b/conf/metrics.properties.template
index 7de0011a48ca8..7f17bc7eea4f5 100644
--- a/conf/metrics.properties.template
+++ b/conf/metrics.properties.template
@@ -4,7 +4,7 @@
 #  divided into instances which correspond to internal components.
 #  Each instance can be configured to report its metrics to one or more sinks.
 #  Accepted values for [instance] are "master", "worker", "executor", "driver",
-#  and "applications". A wild card "*" can be used as an instance name, in
+#  and "applications". A wildcard "*" can be used as an instance name, in
 #  which case all instances will inherit the supplied property.
 #
 #  Within an instance, a "source" specifies a particular set of grouped metrics.
@@ -32,7 +32,7 @@
 #    name (see examples below).
 #    2. Some sinks involve a polling period. The minimum allowed polling period
 #    is 1 second.
-#    3. Wild card properties can be overridden by more specific properties.
+#    3. Wildcard properties can be overridden by more specific properties.
 #    For example, master.sink.console.period takes precedence over
 #    *.sink.console.period.
 #    4. A metrics specific configuration
@@ -47,6 +47,13 @@
 #    instance master and applications. MetricsServlet may not be configured by self.
 #
 
+## List of available common sources and their properties.
+
+# org.apache.spark.metrics.source.JvmSource
+#   Note: Currently, JvmSource is the only available common source 
+#         to add additionaly to an instance, to enable this, 
+#         set the "class" option to its fully qulified class name (see examples below)
+
 ## List of available sinks and their properties.
 
 # org.apache.spark.metrics.sink.ConsoleSink

From 3a5c4da473a8a497004dfe6eacc0e6646651b227 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Fri, 5 Jun 2015 00:32:46 -0700
Subject: [PATCH 371/525] [MINOR] remove unused interpolation var in log
 message

Completely trivial but I noticed this wrinkle in a log message today; `$sender` doesn't refer to anything and isn't interpolated here.

Author: Sean Owen <sowen@cloudera.com>

Closes #6650 from srowen/Interpolation and squashes the following commits:

518687a [Sean Owen] Actually interpolate log string
7edb866 [Sean Owen] Trivial: remove unused interpolation var in log message
---
 .../spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index fcad959540f5a..7c7f70d8a193b 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -103,7 +103,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
             case None =>
               // Ignoring the update since we don't know about the executor.
               logWarning(s"Ignored task status update ($taskId state $state) " +
-                "from unknown executor $sender with ID $executorId")
+                s"from unknown executor with ID $executorId")
           }
         }
 

From da20c8ca37663738112b04657057858ee3e55072 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Fri, 5 Jun 2015 10:32:33 +0200
Subject: [PATCH 372/525] [MINOR] [BUILD] Change link to jenkins builds on
 github.

Link to the tail of the console log, instead of the full log. That's
bound to have the info the user is looking for, and at the same time
loads way more quickly than the (huge) full log, which is just one click
away if needed.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6664 from vanzin/jenkins-link and squashes the following commits:

ba07ed8 [Marcelo Vanzin] [minor] [build] Change link to jenkins builds on github.
---
 dev/run-tests-jenkins | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins
index 3cbd8666c8d68..641b0ff3c4be4 100755
--- a/dev/run-tests-jenkins
+++ b/dev/run-tests-jenkins
@@ -193,7 +193,7 @@ done
   test_result="$?"
 
   if [ "$test_result" -eq "124" ]; then
-    fail_message="**[Test build ${BUILD_DISPLAY_NAME} timed out](${BUILD_URL}consoleFull)** \
+    fail_message="**[Test build ${BUILD_DISPLAY_NAME} timed out](${BUILD_URL}console)** \
     for PR $ghprbPullId at commit [\`${SHORT_COMMIT_HASH}\`](${COMMIT_URL}) \
     after a configured wait of \`${TESTS_TIMEOUT}\`."
 
@@ -233,7 +233,7 @@ done
 # post end message
 {
   result_message="\
-  [Test build ${BUILD_DISPLAY_NAME} has finished](${BUILD_URL}consoleFull) for \
+  [Test build ${BUILD_DISPLAY_NAME} has finished](${BUILD_URL}console) for \
   PR $ghprbPullId at commit [\`${SHORT_COMMIT_HASH}\`](${COMMIT_URL})."
 
   result_message="${result_message}\n${test_result_note}"

From b16b5434ff44c42e4b3a337f9af147669ba44896 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Fri, 5 Jun 2015 14:11:38 +0200
Subject: [PATCH 373/525] [MINOR] [BUILD] Use custom temp directory during
 build.

Even with all the efforts to cleanup the temp directories created by
unit tests, Spark leaves a lot of garbage in /tmp after a test run.
This change overrides java.io.tmpdir to place those files under the
build directory instead.

After an sbt full unit test run, I was left with > 400 MB of temp
files. Since they're now under the build dir, it's much easier to
clean them up.

Also make a slight change to a unit test to make it not pollute the
source directory with test data.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6653 from vanzin/unit-test-tmp and squashes the following commits:

31e2dd5 [Marcelo Vanzin] Fix tests that depend on each other.
aa92944 [Marcelo Vanzin] [minor] [build] Use custom temp directory during build.
---
 .../spark/deploy/SparkSubmitUtilsSuite.scala  | 22 ++++++++++---------
 pom.xml                                       |  4 +++-
 project/SparkBuild.scala                      |  1 +
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index 8fda5c8b472c9..07d261cc428c4 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -28,9 +28,12 @@ import org.apache.ivy.plugins.resolver.IBiblioResolver
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
+import org.apache.spark.util.Utils
 
 class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
 
+  private var tempIvyPath: String = _
+
   private val noOpOutputStream = new OutputStream {
     def write(b: Int) = {}
   }
@@ -47,6 +50,7 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
     super.beforeAll()
     // We don't want to write logs during testing
     SparkSubmitUtils.printStream = new BufferPrintStream
+    tempIvyPath = Utils.createTempDir(namePrefix = "ivy").getAbsolutePath()
   }
 
   test("incorrect maven coordinate throws error") {
@@ -90,21 +94,20 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
   }
 
   test("ivy path works correctly") {
-    val ivyPath = "dummy" + File.separator +  "ivy"
     val md = SparkSubmitUtils.getModuleDescriptor
     val artifacts = for (i <- 0 until 3) yield new MDArtifact(md, s"jar-$i", "jar", "jar")
-    var jPaths = SparkSubmitUtils.resolveDependencyPaths(artifacts.toArray, new File(ivyPath))
+    var jPaths = SparkSubmitUtils.resolveDependencyPaths(artifacts.toArray, new File(tempIvyPath))
     for (i <- 0 until 3) {
-      val index = jPaths.indexOf(ivyPath)
+      val index = jPaths.indexOf(tempIvyPath)
       assert(index >= 0)
-      jPaths = jPaths.substring(index + ivyPath.length)
+      jPaths = jPaths.substring(index + tempIvyPath.length)
     }
     val main = MavenCoordinate("my.awesome.lib", "mylib", "0.1")
     IvyTestUtils.withRepository(main, None, None) { repo =>
       // end to end
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, Option(repo),
-        Option(ivyPath), true)
-      assert(jarPath.indexOf(ivyPath) >= 0, "should use non-default ivy path")
+        Option(tempIvyPath), true)
+      assert(jarPath.indexOf(tempIvyPath) >= 0, "should use non-default ivy path")
     }
   }
 
@@ -123,13 +126,12 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
     }
     // Local ivy repository with modified home
-    val dummyIvyPath = "dummy" + File.separator + "ivy"
-    val dummyIvyLocal = new File(dummyIvyPath, "local" + File.separator)
+    val dummyIvyLocal = new File(tempIvyPath, "local" + File.separator)
     IvyTestUtils.withRepository(main, None, Some(dummyIvyLocal), true) { repo =>
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None,
-        Some(dummyIvyPath), true)
+        Some(tempIvyPath), true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
-      assert(jarPath.indexOf(dummyIvyPath) >= 0, "should be in new ivy path")
+      assert(jarPath.indexOf(tempIvyPath) >= 0, "should be in new ivy path")
     }
   }
 
diff --git a/pom.xml b/pom.xml
index e28d4b9fc2b17..a848deffe7375 100644
--- a/pom.xml
+++ b/pom.xml
@@ -179,7 +179,7 @@
     <parquet.deps.scope>compile</parquet.deps.scope>
 
     <!--
-      Overridable test home. So that you can call individual pom files directory without
+      Overridable test home. So that you can call individual pom files directly without
       things breaking.
     -->
     <spark.test.home>${session.executionRootDirectory}</spark.test.home>
@@ -1256,6 +1256,7 @@
             <systemProperties>
               <derby.system.durability>test</derby.system.durability>
               <java.awt.headless>true</java.awt.headless>
+              <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir>
               <spark.test.home>${spark.test.home}</spark.test.home>
               <spark.testing>1</spark.testing>
               <spark.ui.enabled>false</spark.ui.enabled>
@@ -1289,6 +1290,7 @@
             <systemProperties>
               <derby.system.durability>test</derby.system.durability>
               <java.awt.headless>true</java.awt.headless>
+              <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir>
               <spark.test.home>${spark.test.home}</spark.test.home>
               <spark.testing>1</spark.testing>
               <spark.ui.enabled>false</spark.ui.enabled>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index ef3a175bac209..921f1599fedef 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -496,6 +496,7 @@ object TestSettings {
       "SPARK_DIST_CLASSPATH" ->
         (fullClasspath in Test).value.files.map(_.getAbsolutePath).mkString(":").stripSuffix(":"),
       "JAVA_HOME" -> sys.env.get("JAVA_HOME").getOrElse(sys.props("java.home"))),
+    javaOptions in Test += s"-Djava.io.tmpdir=$sparkHome/target/tmp",
     javaOptions in Test += "-Dspark.test.home=" + sparkHome,
     javaOptions in Test += "-Dspark.testing=1",
     javaOptions in Test += "-Dspark.port.maxRetries=100",

From 019dc9f558cf7c0b708d3b1f0882b0c19134ffb6 Mon Sep 17 00:00:00 2001
From: Akhil Das <akhld@darktech.ca>
Date: Fri, 5 Jun 2015 14:23:23 +0200
Subject: [PATCH 374/525] [STREAMING] Update streaming-kafka-integration.md

Fixed the broken links (Examples) in the documentation.

Author: Akhil Das <akhld@darktech.ca>

Closes #6666 from akhld/patch-2 and squashes the following commits:

2228b83 [Akhil Das] Update streaming-kafka-integration.md
---
 docs/streaming-kafka-integration.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/streaming-kafka-integration.md b/docs/streaming-kafka-integration.md
index 64714f0b799fc..d6d5605948a5a 100644
--- a/docs/streaming-kafka-integration.md
+++ b/docs/streaming-kafka-integration.md
@@ -29,7 +29,7 @@ Next, we discuss how to use this approach in your streaming application.
             [ZK quorum], [consumer group id], [per-topic number of Kafka partitions to consume])
 
     You can also specify the key and value classes and their corresponding decoder classes using variations of `createStream`. See the [API docs](api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$)
-	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala).
+	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala).
 	</div>
 	<div data-lang="java" markdown="1">
 		import org.apache.spark.streaming.kafka.*;
@@ -39,7 +39,7 @@ Next, we discuss how to use this approach in your streaming application.
             [ZK quorum], [consumer group id], [per-topic number of Kafka partitions to consume]);
 
     You can also specify the key and value classes and their corresponding decoder classes using variations of `createStream`. See the [API docs](api/java/index.html?org/apache/spark/streaming/kafka/KafkaUtils.html)
-	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaKafkaWordCount.java).
+	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaKafkaWordCount.java).
 
 	</div>
 	<div data-lang="python" markdown="1">
@@ -105,7 +105,7 @@ Next, we discuss how to use this approach in your streaming application.
 			streamingContext, [map of Kafka parameters], [set of topics to consume])
 
 	See the [API docs](api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$)
-	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala).
+	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala).
 	</div>
 	<div data-lang="java" markdown="1">
 		import org.apache.spark.streaming.kafka.*;
@@ -116,7 +116,7 @@ Next, we discuss how to use this approach in your streaming application.
 				[map of Kafka parameters], [set of topics to consume]);
 
 	See the [API docs](api/java/index.html?org/apache/spark/streaming/kafka/KafkaUtils.html)
-	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java).
+	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java).
 
 	</div>
 	</div>
@@ -153,4 +153,4 @@ Next, we discuss how to use this approach in your streaming application.
 
 	Another thing to note is that since this approach does not use Receivers, the standard receiver-related (that is, [configurations](configuration.html) of the form `spark.streaming.receiver.*` ) will not apply to the input DStreams created by this approach (will apply to other input DStreams though). Instead, use the [configurations](configuration.html) `spark.streaming.kafka.*`. An important one is `spark.streaming.kafka.maxRatePerPartition` which is the maximum rate at which each Kafka partition will be read by this direct API. 
 
-3. **Deploying:** Similar to the first approach, you can package `spark-streaming-kafka_{{site.SCALA_BINARY_VERSION}}` and its dependencies into the application JAR and the launch the application using `spark-submit`. Make sure `spark-core_{{site.SCALA_BINARY_VERSION}}` and `spark-streaming_{{site.SCALA_BINARY_VERSION}}` are marked as `provided` dependencies as those are already present in a Spark installation.
\ No newline at end of file
+3. **Deploying:** Similar to the first approach, you can package `spark-streaming-kafka_{{site.SCALA_BINARY_VERSION}}` and its dependencies into the application JAR and the launch the application using `spark-submit`. Make sure `spark-core_{{site.SCALA_BINARY_VERSION}}` and `spark-streaming_{{site.SCALA_BINARY_VERSION}}` are marked as `provided` dependencies as those are already present in a Spark installation.

From 700312e12f9588f01a592d6eac7bff7eb366ac8f Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Fri, 5 Jun 2015 14:32:00 +0200
Subject: [PATCH 375/525] [SPARK-6324] [CORE] Centralize handling of script
 usage messages.

Reorganize code so that the launcher library handles most of the work
of printing usage messages, instead of having an awkward protocol between
the library and the scripts for that.

This mostly applies to SparkSubmit, since the launcher lib does not do
command line parsing for classes invoked in other ways, and thus cannot
handle failures for those. Most scripts end up going through SparkSubmit,
though, so it all works.

The change adds a new, internal command line switch, "--usage-error",
which prints the usage message and exits with a non-zero status. Scripts
can override the command printed in the usage message by setting an
environment variable - this avoids having to grep the output of
SparkSubmit to remove references to the "spark-submit" script.

The only sub-optimal part of the change is the special handling for the
spark-sql usage, which is now done in SparkSubmitArguments.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #5841 from vanzin/SPARK-6324 and squashes the following commits:

2821481 [Marcelo Vanzin] Merge branch 'master' into SPARK-6324
bf139b5 [Marcelo Vanzin] Filter output of Spark SQL CLI help.
c6609bf [Marcelo Vanzin] Fix exit code never being used when printing usage messages.
6bc1b41 [Marcelo Vanzin] [SPARK-6324] [core] Centralize handling of script usage messages.
---
 bin/pyspark                                   | 16 +---
 bin/pyspark2.cmd                              |  1 +
 bin/spark-class                               | 13 +--
 bin/spark-shell                               | 15 +---
 bin/spark-shell2.cmd                          | 21 +----
 bin/spark-sql                                 | 39 +--------
 bin/spark-submit                              | 12 ---
 bin/spark-submit2.cmd                         | 13 +--
 bin/sparkR                                    | 18 +---
 .../org/apache/spark/deploy/SparkSubmit.scala | 10 +--
 .../spark/deploy/SparkSubmitArguments.scala   | 76 ++++++++++++++++-
 .../spark/deploy/SparkSubmitSuite.scala       |  2 +-
 .../java/org/apache/spark/launcher/Main.java  | 83 ++++++++++---------
 .../launcher/SparkSubmitCommandBuilder.java   | 18 +++-
 .../launcher/SparkSubmitOptionParser.java     |  2 +
 15 files changed, 147 insertions(+), 192 deletions(-)

diff --git a/bin/pyspark b/bin/pyspark
index 7cb19c51b43a2..f9dbddfa53560 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -17,24 +17,10 @@
 # limitations under the License.
 #
 
-# Figure out where Spark is installed
 export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
 
 source "$SPARK_HOME"/bin/load-spark-env.sh
-
-function usage() {
-  if [ -n "$1" ]; then
-    echo $1
-  fi
-  echo "Usage: ./bin/pyspark [options]" 1>&2
-  "$SPARK_HOME"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  exit $2
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage
-fi
+export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]"
 
 # In Spark <= 1.1, setting IPYTHON=1 would cause the driver to be launched using the `ipython`
 # executable, while the worker would still be launched using PYSPARK_PYTHON.
diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd
index 09b4149c2a439..45e9e3def5121 100644
--- a/bin/pyspark2.cmd
+++ b/bin/pyspark2.cmd
@@ -21,6 +21,7 @@ rem Figure out where the Spark framework is installed
 set SPARK_HOME=%~dp0..
 
 call %SPARK_HOME%\bin\load-spark-env.cmd
+set _SPARK_CMD_USAGE=Usage: bin\pyspark.cmd [options]
 
 rem Figure out which Python to use.
 if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
diff --git a/bin/spark-class b/bin/spark-class
index c49d97ce5cf25..7bb1afe4b44f5 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -16,18 +16,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-set -e
 
 # Figure out where Spark is installed
 export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
 
 . "$SPARK_HOME"/bin/load-spark-env.sh
 
-if [ -z "$1" ]; then
-  echo "Usage: spark-class <class> [<args>]" 1>&2
-  exit 1
-fi
-
 # Find the java binary
 if [ -n "${JAVA_HOME}" ]; then
   RUNNER="${JAVA_HOME}/bin/java"
@@ -98,9 +92,4 @@ CMD=()
 while IFS= read -d '' -r ARG; do
   CMD+=("$ARG")
 done < <("$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@")
-
-if [ "${CMD[0]}" = "usage" ]; then
-  "${CMD[@]}"
-else
-  exec "${CMD[@]}"
-fi
+exec "${CMD[@]}"
diff --git a/bin/spark-shell b/bin/spark-shell
index b3761b5e1375b..a6dc863d83fc6 100755
--- a/bin/spark-shell
+++ b/bin/spark-shell
@@ -29,20 +29,7 @@ esac
 set -o posix
 
 export FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
-
-usage() {
-  if [ -n "$1" ]; then
-    echo "$1"
-  fi
-  echo "Usage: ./bin/spark-shell [options]"
-  "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  exit "$2"
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage "" 0
-fi
+export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]"
 
 # SPARK-4161: scala does not assume use of the java classpath,
 # so we need to add the "-Dscala.usejavacp=true" flag manually. We
diff --git a/bin/spark-shell2.cmd b/bin/spark-shell2.cmd
index 00fd30fa38d36..251309d67f860 100644
--- a/bin/spark-shell2.cmd
+++ b/bin/spark-shell2.cmd
@@ -18,12 +18,7 @@ rem limitations under the License.
 rem
 
 set SPARK_HOME=%~dp0..
-
-echo "%*" | findstr " \<--help\> \<-h\>" >nul
-if %ERRORLEVEL% equ 0 (
-  call :usage
-  exit /b 0
-)
+set _SPARK_CMD_USAGE=Usage: .\bin\spark-shell.cmd [options]
 
 rem SPARK-4161: scala does not assume use of the java classpath,
 rem so we need to add the "-Dscala.usejavacp=true" flag manually. We
@@ -37,16 +32,4 @@ if "x%SPARK_SUBMIT_OPTS%"=="x" (
 set SPARK_SUBMIT_OPTS="%SPARK_SUBMIT_OPTS% -Dscala.usejavacp=true"
 
 :run_shell
-call %SPARK_HOME%\bin\spark-submit2.cmd --class org.apache.spark.repl.Main %*
-set SPARK_ERROR_LEVEL=%ERRORLEVEL%
-if not "x%SPARK_LAUNCHER_USAGE_ERROR%"=="x" (
-  call :usage
-  exit /b 1
-)
-exit /b %SPARK_ERROR_LEVEL%
-
-:usage
-echo %SPARK_LAUNCHER_USAGE_ERROR%
-echo "Usage: .\bin\spark-shell.cmd [options]" >&2
-call %SPARK_HOME%\bin\spark-submit2.cmd --help 2>&1 | findstr /V "Usage" 1>&2
-goto :eof
+%SPARK_HOME%\bin\spark-submit2.cmd --class org.apache.spark.repl.Main %*
diff --git a/bin/spark-sql b/bin/spark-sql
index ca1729f4cfcb4..4ea7bc6e39c07 100755
--- a/bin/spark-sql
+++ b/bin/spark-sql
@@ -17,41 +17,6 @@
 # limitations under the License.
 #
 
-#
-# Shell script for starting the Spark SQL CLI
-
-# Enter posix mode for bash
-set -o posix
-
-# NOTE: This exact class name is matched downstream by SparkSubmit.
-# Any changes need to be reflected there.
-export CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver"
-
-# Figure out where Spark is installed
 export FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
-
-function usage {
-  if [ -n "$1" ]; then
-    echo "$1"
-  fi
-  echo "Usage: ./bin/spark-sql [options] [cli option]"
-  pattern="usage"
-  pattern+="\|Spark assembly has been built with Hive"
-  pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set"
-  pattern+="\|Spark Command: "
-  pattern+="\|--help"
-  pattern+="\|======="
-
-  "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  echo
-  echo "CLI options:"
-  "$FWDIR"/bin/spark-class "$CLASS" --help 2>&1 | grep -v "$pattern" 1>&2
-  exit "$2"
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage "" 0
-fi
-
-exec "$FWDIR"/bin/spark-submit --class "$CLASS" "$@"
+export _SPARK_CMD_USAGE="Usage: ./bin/spark-sql [options] [cli option]"
+exec "$FWDIR"/bin/spark-submit --class org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver "$@"
diff --git a/bin/spark-submit b/bin/spark-submit
index 0e0afe71a0f05..255378b0f077c 100755
--- a/bin/spark-submit
+++ b/bin/spark-submit
@@ -22,16 +22,4 @@ SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
 # disable randomized hash for string in Python 3.3+
 export PYTHONHASHSEED=0
 
-# Only define a usage function if an upstream script hasn't done so.
-if ! type -t usage >/dev/null 2>&1; then
-  usage() {
-    if [ -n "$1" ]; then
-      echo "$1"
-    fi
-    "$SPARK_HOME"/bin/spark-class org.apache.spark.deploy.SparkSubmit --help
-    exit "$2"
-  }
-  export -f usage
-fi
-
 exec "$SPARK_HOME"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"
diff --git a/bin/spark-submit2.cmd b/bin/spark-submit2.cmd
index d3fc4a5cc3f6e..651376e526928 100644
--- a/bin/spark-submit2.cmd
+++ b/bin/spark-submit2.cmd
@@ -24,15 +24,4 @@ rem disable randomized hash for string in Python 3.3+
 set PYTHONHASHSEED=0
 
 set CLASS=org.apache.spark.deploy.SparkSubmit
-call %~dp0spark-class2.cmd %CLASS% %*
-set SPARK_ERROR_LEVEL=%ERRORLEVEL%
-if not "x%SPARK_LAUNCHER_USAGE_ERROR%"=="x" (
-  call :usage
-  exit /b 1
-)
-exit /b %SPARK_ERROR_LEVEL%
-
-:usage
-echo %SPARK_LAUNCHER_USAGE_ERROR%
-call %SPARK_HOME%\bin\spark-class2.cmd %CLASS% --help
-goto :eof
+%~dp0spark-class2.cmd %CLASS% %*
diff --git a/bin/sparkR b/bin/sparkR
index 8c918e2b09aef..464c29f369424 100755
--- a/bin/sparkR
+++ b/bin/sparkR
@@ -17,23 +17,7 @@
 # limitations under the License.
 #
 
-# Figure out where Spark is installed
 export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
-
 source "$SPARK_HOME"/bin/load-spark-env.sh
-
-function usage() {
-  if [ -n "$1" ]; then
-    echo $1
-  fi
-  echo "Usage: ./bin/sparkR [options]" 1>&2
-  "$SPARK_HOME"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  exit $2
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage
-fi
-
+export _SPARK_CMD_USAGE="Usage: ./bin/sparkR [options]"
 exec "$SPARK_HOME"/bin/spark-submit sparkr-shell-main "$@"
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 8cf4d58847d8e..3aa3f948e865d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -82,13 +82,13 @@ object SparkSubmit {
   private val CLASS_NOT_FOUND_EXIT_STATUS = 101
 
   // Exposed for testing
-  private[spark] var exitFn: () => Unit = () => System.exit(1)
+  private[spark] var exitFn: Int => Unit = (exitCode: Int) => System.exit(exitCode)
   private[spark] var printStream: PrintStream = System.err
   private[spark] def printWarning(str: String): Unit = printStream.println("Warning: " + str)
   private[spark] def printErrorAndExit(str: String): Unit = {
     printStream.println("Error: " + str)
     printStream.println("Run with --help for usage help or --verbose for debug output")
-    exitFn()
+    exitFn(1)
   }
   private[spark] def printVersionAndExit(): Unit = {
     printStream.println("""Welcome to
@@ -99,7 +99,7 @@ object SparkSubmit {
       /_/
                         """.format(SPARK_VERSION))
     printStream.println("Type --help for more information.")
-    exitFn()
+    exitFn(0)
   }
 
   def main(args: Array[String]): Unit = {
@@ -160,7 +160,7 @@ object SparkSubmit {
             // detect exceptions with empty stack traces here, and treat them differently.
             if (e.getStackTrace().length == 0) {
               printStream.println(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
-              exitFn()
+              exitFn(1)
             } else {
               throw e
             }
@@ -700,7 +700,7 @@ object SparkSubmit {
   /**
    * Return whether the given main class represents a sql shell.
    */
-  private def isSqlShell(mainClass: String): Boolean = {
+  private[deploy] def isSqlShell(mainClass: String): Boolean = {
     mainClass == "org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver"
   }
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index cc6a7bd9f4119..b7429a901e162 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -17,12 +17,15 @@
 
 package org.apache.spark.deploy
 
+import java.io.{ByteArrayOutputStream, PrintStream}
+import java.lang.reflect.InvocationTargetException
 import java.net.URI
 import java.util.{List => JList}
 import java.util.jar.JarFile
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable.{ArrayBuffer, HashMap}
+import scala.io.Source
 
 import org.apache.spark.deploy.SparkSubmitAction._
 import org.apache.spark.launcher.SparkSubmitArgumentsParser
@@ -412,6 +415,9 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
       case VERSION =>
         SparkSubmit.printVersionAndExit()
 
+      case USAGE_ERROR =>
+        printUsageAndExit(1)
+
       case _ =>
         throw new IllegalArgumentException(s"Unexpected argument '$opt'.")
     }
@@ -449,11 +455,14 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
     if (unknownParam != null) {
       outStream.println("Unknown/unsupported param " + unknownParam)
     }
-    outStream.println(
+    val command = sys.env.get("_SPARK_CMD_USAGE").getOrElse(
       """Usage: spark-submit [options] <app jar | python file> [app arguments]
         |Usage: spark-submit --kill [submission ID] --master [spark://...]
-        |Usage: spark-submit --status [submission ID] --master [spark://...]
-        |
+        |Usage: spark-submit --status [submission ID] --master [spark://...]""".stripMargin)
+    outStream.println(command)
+
+    outStream.println(
+      """
         |Options:
         |  --master MASTER_URL         spark://host:port, mesos://host:port, yarn, or local.
         |  --deploy-mode DEPLOY_MODE   Whether to launch the driver program locally ("client") or
@@ -525,6 +534,65 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
         |                              delegation tokens periodically.
       """.stripMargin
     )
-    SparkSubmit.exitFn()
+
+    if (SparkSubmit.isSqlShell(mainClass)) {
+      outStream.println("CLI options:")
+      outStream.println(getSqlShellOptions())
+    }
+
+    SparkSubmit.exitFn(exitCode)
   }
+
+  /**
+   * Run the Spark SQL CLI main class with the "--help" option and catch its output. Then filter
+   * the results to remove unwanted lines.
+   *
+   * Since the CLI will call `System.exit()`, we install a security manager to prevent that call
+   * from working, and restore the original one afterwards.
+   */
+  private def getSqlShellOptions(): String = {
+    val currentOut = System.out
+    val currentErr = System.err
+    val currentSm = System.getSecurityManager()
+    try {
+      val out = new ByteArrayOutputStream()
+      val stream = new PrintStream(out)
+      System.setOut(stream)
+      System.setErr(stream)
+
+      val sm = new SecurityManager() {
+        override def checkExit(status: Int): Unit = {
+          throw new SecurityException()
+        }
+
+        override def checkPermission(perm: java.security.Permission): Unit = {}
+      }
+      System.setSecurityManager(sm)
+
+      try {
+        Class.forName(mainClass).getMethod("main", classOf[Array[String]])
+          .invoke(null, Array(HELP))
+      } catch {
+        case e: InvocationTargetException =>
+          // Ignore SecurityException, since we throw it above.
+          if (!e.getCause().isInstanceOf[SecurityException]) {
+            throw e
+          }
+      }
+
+      stream.flush()
+
+      // Get the output and discard any unnecessary lines from it.
+      Source.fromString(new String(out.toByteArray())).getLines
+        .filter { line =>
+          !line.startsWith("log4j") && !line.startsWith("usage")
+        }
+        .mkString("\n")
+    } finally {
+      System.setSecurityManager(currentSm)
+      System.setOut(currentOut)
+      System.setErr(currentErr)
+    }
+  }
+
 }
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 46369457f000a..46ea28d0f18f6 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -62,7 +62,7 @@ class SparkSubmitSuite
     SparkSubmit.printStream = printStream
 
     @volatile var exitedCleanly = false
-    SparkSubmit.exitFn = () => exitedCleanly = true
+    SparkSubmit.exitFn = (_) => exitedCleanly = true
 
     val thread = new Thread {
       override def run() = try {
diff --git a/launcher/src/main/java/org/apache/spark/launcher/Main.java b/launcher/src/main/java/org/apache/spark/launcher/Main.java
index 929b29a49ed70..62492f9baf3bb 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/Main.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/Main.java
@@ -53,21 +53,33 @@ public static void main(String[] argsArray) throws Exception {
     List<String> args = new ArrayList<String>(Arrays.asList(argsArray));
     String className = args.remove(0);
 
-    boolean printLaunchCommand;
-    boolean printUsage;
+    boolean printLaunchCommand = !isEmpty(System.getenv("SPARK_PRINT_LAUNCH_COMMAND"));
     AbstractCommandBuilder builder;
-    try {
-      if (className.equals("org.apache.spark.deploy.SparkSubmit")) {
+    if (className.equals("org.apache.spark.deploy.SparkSubmit")) {
+      try {
         builder = new SparkSubmitCommandBuilder(args);
-      } else {
-        builder = new SparkClassCommandBuilder(className, args);
+      } catch (IllegalArgumentException e) {
+        printLaunchCommand = false;
+        System.err.println("Error: " + e.getMessage());
+        System.err.println();
+
+        MainClassOptionParser parser = new MainClassOptionParser();
+        try {
+          parser.parse(args);
+        } catch (Exception ignored) {
+          // Ignore parsing exceptions.
+        }
+
+        List<String> help = new ArrayList<String>();
+        if (parser.className != null) {
+          help.add(parser.CLASS);
+          help.add(parser.className);
+        }
+        help.add(parser.USAGE_ERROR);
+        builder = new SparkSubmitCommandBuilder(help);
       }
-      printLaunchCommand = !isEmpty(System.getenv("SPARK_PRINT_LAUNCH_COMMAND"));
-      printUsage = false;
-    } catch (IllegalArgumentException e) {
-      builder = new UsageCommandBuilder(e.getMessage());
-      printLaunchCommand = false;
-      printUsage = true;
+    } else {
+      builder = new SparkClassCommandBuilder(className, args);
     }
 
     Map<String, String> env = new HashMap<String, String>();
@@ -78,13 +90,7 @@ public static void main(String[] argsArray) throws Exception {
     }
 
     if (isWindows()) {
-      // When printing the usage message, we can't use "cmd /v" since that prevents the env
-      // variable from being seen in the caller script. So do not call prepareWindowsCommand().
-      if (printUsage) {
-        System.out.println(join(" ", cmd));
-      } else {
-        System.out.println(prepareWindowsCommand(cmd, env));
-      }
+      System.out.println(prepareWindowsCommand(cmd, env));
     } else {
       // In bash, use NULL as the arg separator since it cannot be used in an argument.
       List<String> bashCmd = prepareBashCommand(cmd, env);
@@ -135,33 +141,30 @@ private static List<String> prepareBashCommand(List<String> cmd, Map<String, Str
   }
 
   /**
-   * Internal builder used when command line parsing fails. This will behave differently depending
-   * on the platform:
-   *
-   * - On Unix-like systems, it will print a call to the "usage" function with two arguments: the
-   *   the error string, and the exit code to use. The function is expected to print the command's
-   *   usage and exit with the provided exit code. The script should use "export -f usage" after
-   *   declaring a function called "usage", so that the function is available to downstream scripts.
-   *
-   * - On Windows it will set the variable "SPARK_LAUNCHER_USAGE_ERROR" to the usage error message.
-   *   The batch script should check for this variable and print its usage, since batch scripts
-   *   don't really support the "export -f" functionality used in bash.
+   * A parser used when command line parsing fails for spark-submit. It's used as a best-effort
+   * at trying to identify the class the user wanted to invoke, since that may require special
+   * usage strings (handled by SparkSubmitArguments).
    */
-  private static class UsageCommandBuilder extends AbstractCommandBuilder {
+  private static class MainClassOptionParser extends SparkSubmitOptionParser {
 
-    private final String message;
+    String className;
 
-    UsageCommandBuilder(String message) {
-      this.message = message;
+    @Override
+    protected boolean handle(String opt, String value) {
+      if (opt == CLASS) {
+        className = value;
+      }
+      return false;
     }
 
     @Override
-    public List<String> buildCommand(Map<String, String> env) {
-      if (isWindows()) {
-        return Arrays.asList("set", "SPARK_LAUNCHER_USAGE_ERROR=" + message);
-      } else {
-        return Arrays.asList("usage", message, "1");
-      }
+    protected boolean handleUnknown(String opt) {
+      return false;
+    }
+
+    @Override
+    protected void handleExtraArgs(List<String> extra) {
+
     }
 
   }
diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
index 7d387d406edae..3e5a2820b6c11 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
@@ -77,6 +77,7 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
   }
 
   private final List<String> sparkArgs;
+  private final boolean printHelp;
 
   /**
    * Controls whether mixing spark-submit arguments with app arguments is allowed. This is needed
@@ -87,10 +88,11 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
 
   SparkSubmitCommandBuilder() {
     this.sparkArgs = new ArrayList<String>();
+    this.printHelp = false;
   }
 
   SparkSubmitCommandBuilder(List<String> args) {
-    this();
+    this.sparkArgs = new ArrayList<String>();
     List<String> submitArgs = args;
     if (args.size() > 0 && args.get(0).equals(PYSPARK_SHELL)) {
       this.allowsMixedArguments = true;
@@ -104,14 +106,16 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
       this.allowsMixedArguments = false;
     }
 
-    new OptionParser().parse(submitArgs);
+    OptionParser parser = new OptionParser();
+    parser.parse(submitArgs);
+    this.printHelp = parser.helpRequested;
   }
 
   @Override
   public List<String> buildCommand(Map<String, String> env) throws IOException {
-    if (PYSPARK_SHELL_RESOURCE.equals(appResource)) {
+    if (PYSPARK_SHELL_RESOURCE.equals(appResource) && !printHelp) {
       return buildPySparkShellCommand(env);
-    } else if (SPARKR_SHELL_RESOURCE.equals(appResource)) {
+    } else if (SPARKR_SHELL_RESOURCE.equals(appResource) && !printHelp) {
       return buildSparkRCommand(env);
     } else {
       return buildSparkSubmitCommand(env);
@@ -311,6 +315,8 @@ private boolean isThriftServer(String mainClass) {
 
   private class OptionParser extends SparkSubmitOptionParser {
 
+    boolean helpRequested = false;
+
     @Override
     protected boolean handle(String opt, String value) {
       if (opt.equals(MASTER)) {
@@ -341,6 +347,9 @@ protected boolean handle(String opt, String value) {
           allowsMixedArguments = true;
           appResource = specialClasses.get(value);
         }
+      } else if (opt.equals(HELP) || opt.equals(USAGE_ERROR)) {
+        helpRequested = true;
+        sparkArgs.add(opt);
       } else {
         sparkArgs.add(opt);
         if (value != null) {
@@ -360,6 +369,7 @@ protected boolean handleUnknown(String opt) {
         appArgs.add(opt);
         return true;
       } else {
+        checkArgument(!opt.startsWith("-"), "Unrecognized option: %s", opt);
         sparkArgs.add(opt);
         return false;
       }
diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java
index 229000087688f..b88bba883ac65 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java
@@ -61,6 +61,7 @@ class SparkSubmitOptionParser {
   // Options that do not take arguments.
   protected final String HELP = "--help";
   protected final String SUPERVISE = "--supervise";
+  protected final String USAGE_ERROR = "--usage-error";
   protected final String VERBOSE = "--verbose";
   protected final String VERSION = "--version";
 
@@ -120,6 +121,7 @@ class SparkSubmitOptionParser {
   final String[][] switches = {
     { HELP, "-h" },
     { SUPERVISE },
+    { USAGE_ERROR },
     { VERBOSE, "-v" },
     { VERSION },
   };

From bc0d76a246cc534234b96a661d70feb94b26538c Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Fri, 5 Jun 2015 23:06:19 +0800
Subject: [PATCH 376/525] [SQL] Simplifies binary node pattern matching

This PR is a simpler version of #2764, and adds `unapply` methods to the following binary nodes for simpler pattern matching:

- `BinaryExpression`
- `BinaryComparison`
- `BinaryArithmetics`

This enables nested pattern matching for binary nodes. For example, the following pattern matching

```scala
case p: BinaryComparison if p.left.dataType == StringType &&
                            p.right.dataType == DateType =>
  p.makeCopy(Array(p.left, Cast(p.right, StringType)))
```

can be simplified to

```scala
case p  BinaryComparison(l  StringType(), r  DateType()) =>
  p.makeCopy(Array(l, Cast(r, StringType)))
```

Author: Cheng Lian <lian@databricks.com>

Closes #6537 from liancheng/binary-node-patmat and squashes the following commits:

a3bf5fe [Cheng Lian] Fixes compilation error introduced while rebasing
b738986 [Cheng Lian] Renames `l`/`r` to `left`/`right` or `lhs`/`rhs`
14900ae [Cheng Lian] Simplifies binary node pattern matching
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 215 ++++++++----------
 .../sql/catalyst/expressions/Expression.scala |   4 +
 .../sql/catalyst/expressions/arithmetic.scala |   4 +
 .../sql/catalyst/expressions/predicates.scala |   5 +-
 .../sql/catalyst/optimizer/Optimizer.scala    |  19 +-
 5 files changed, 119 insertions(+), 128 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index b064600e94fac..9b8a08a88dcb0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -130,7 +130,7 @@ trait HiveTypeCoercion {
    * the appropriate numeric equivalent.
    */
   object ConvertNaNs extends Rule[LogicalPlan] {
-    private val stringNaN = Literal("NaN")
+    private val StringNaN = Literal("NaN")
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
       case q: LogicalPlan => q transformExpressions {
@@ -138,20 +138,20 @@ trait HiveTypeCoercion {
         case e if !e.childrenResolved => e
 
         /* Double Conversions */
-        case b: BinaryExpression if b.left == stringNaN && b.right.dataType == DoubleType =>
-          b.makeCopy(Array(b.right, Literal(Double.NaN)))
-        case b: BinaryExpression if b.left.dataType == DoubleType && b.right == stringNaN =>
-          b.makeCopy(Array(Literal(Double.NaN), b.left))
-        case b: BinaryExpression if b.left == stringNaN && b.right == stringNaN =>
-          b.makeCopy(Array(Literal(Double.NaN), b.left))
+        case b @ BinaryExpression(StringNaN, right @ DoubleType()) =>
+          b.makeCopy(Array(Literal(Double.NaN), right))
+        case b @ BinaryExpression(left @ DoubleType(), StringNaN) =>
+          b.makeCopy(Array(left, Literal(Double.NaN)))
 
         /* Float Conversions */
-        case b: BinaryExpression if b.left == stringNaN && b.right.dataType == FloatType =>
-          b.makeCopy(Array(b.right, Literal(Float.NaN)))
-        case b: BinaryExpression if b.left.dataType == FloatType && b.right == stringNaN =>
-          b.makeCopy(Array(Literal(Float.NaN), b.left))
-        case b: BinaryExpression if b.left == stringNaN && b.right == stringNaN =>
-          b.makeCopy(Array(Literal(Float.NaN), b.left))
+        case b @ BinaryExpression(StringNaN, right @ FloatType()) =>
+          b.makeCopy(Array(Literal(Float.NaN), right))
+        case b @ BinaryExpression(left @ FloatType(), StringNaN) =>
+          b.makeCopy(Array(left, Literal(Float.NaN)))
+
+        /* Use float NaN by default to avoid unnecessary type widening */
+        case b @ BinaryExpression(left @ StringNaN, StringNaN) =>
+          b.makeCopy(Array(left, Literal(Float.NaN)))
       }
     }
   }
@@ -184,21 +184,25 @@ trait HiveTypeCoercion {
       case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
         val castedInput = left.output.zip(right.output).map {
           // When a string is found on one side, make the other side a string too.
-          case (l, r) if l.dataType == StringType && r.dataType != StringType =>
-            (l, Alias(Cast(r, StringType), r.name)())
-          case (l, r) if l.dataType != StringType && r.dataType == StringType =>
-            (Alias(Cast(l, StringType), l.name)(), r)
-
-          case (l, r) if l.dataType != r.dataType =>
-            logDebug(s"Resolving mismatched union input ${l.dataType}, ${r.dataType}")
-            findTightestCommonTypeOfTwo(l.dataType, r.dataType).map { widestType =>
+          case (lhs, rhs) if lhs.dataType == StringType && rhs.dataType != StringType =>
+            (lhs, Alias(Cast(rhs, StringType), rhs.name)())
+          case (lhs, rhs) if lhs.dataType != StringType && rhs.dataType == StringType =>
+            (Alias(Cast(lhs, StringType), lhs.name)(), rhs)
+
+          case (lhs, rhs) if lhs.dataType != rhs.dataType =>
+            logDebug(s"Resolving mismatched union input ${lhs.dataType}, ${rhs.dataType}")
+            findTightestCommonTypeOfTwo(lhs.dataType, rhs.dataType).map { widestType =>
               val newLeft =
-                if (l.dataType == widestType) l else Alias(Cast(l, widestType), l.name)()
+                if (lhs.dataType == widestType) lhs else Alias(Cast(lhs, widestType), lhs.name)()
               val newRight =
-                if (r.dataType == widestType) r else Alias(Cast(r, widestType), r.name)()
+                if (rhs.dataType == widestType) rhs else Alias(Cast(rhs, widestType), rhs.name)()
 
               (newLeft, newRight)
-            }.getOrElse((l, r)) // If there is no applicable conversion, leave expression unchanged.
+            }.getOrElse {
+              // If there is no applicable conversion, leave expression unchanged.
+              (lhs, rhs)
+            }
+
           case other => other
         }
 
@@ -227,12 +231,10 @@ trait HiveTypeCoercion {
         // Skip nodes who's children have not been resolved yet.
         case e if !e.childrenResolved => e
 
-        case b: BinaryExpression if b.left.dataType != b.right.dataType =>
-          findTightestCommonTypeOfTwo(b.left.dataType, b.right.dataType).map { widestType =>
-            val newLeft =
-              if (b.left.dataType == widestType) b.left else Cast(b.left, widestType)
-            val newRight =
-              if (b.right.dataType == widestType) b.right else Cast(b.right, widestType)
+        case b @ BinaryExpression(left, right) if left.dataType != right.dataType =>
+          findTightestCommonTypeOfTwo(left.dataType, right.dataType).map { widestType =>
+            val newLeft = if (left.dataType == widestType) left else Cast(left, widestType)
+            val newRight = if (right.dataType == widestType) right else Cast(right, widestType)
             b.makeCopy(Array(newLeft, newRight))
           }.getOrElse(b)  // If there is no applicable conversion, leave expression unchanged.
       }
@@ -247,57 +249,42 @@ trait HiveTypeCoercion {
       // Skip nodes who's children have not been resolved yet.
       case e if !e.childrenResolved => e
 
-      case a: BinaryArithmetic if a.left.dataType == StringType =>
-        a.makeCopy(Array(Cast(a.left, DoubleType), a.right))
-      case a: BinaryArithmetic if a.right.dataType == StringType =>
-        a.makeCopy(Array(a.left, Cast(a.right, DoubleType)))
+      case a @ BinaryArithmetic(left @ StringType(), r) =>
+        a.makeCopy(Array(Cast(left, DoubleType), r))
+      case a @ BinaryArithmetic(left, right @ StringType()) =>
+        a.makeCopy(Array(left, Cast(right, DoubleType)))
 
       // we should cast all timestamp/date/string compare into string compare
-      case p: BinaryComparison if p.left.dataType == StringType &&
-                                  p.right.dataType == DateType =>
-        p.makeCopy(Array(p.left, Cast(p.right, StringType)))
-      case p: BinaryComparison if p.left.dataType == DateType &&
-                                  p.right.dataType == StringType =>
-        p.makeCopy(Array(Cast(p.left, StringType), p.right))
-      case p: BinaryComparison if p.left.dataType == StringType &&
-                                  p.right.dataType == TimestampType =>
-        p.makeCopy(Array(Cast(p.left, TimestampType), p.right))
-      case p: BinaryComparison if p.left.dataType == TimestampType &&
-                                  p.right.dataType == StringType =>
-        p.makeCopy(Array(p.left, Cast(p.right, TimestampType)))
-      case p: BinaryComparison if p.left.dataType == TimestampType &&
-                                  p.right.dataType == DateType =>
-        p.makeCopy(Array(Cast(p.left, StringType), Cast(p.right, StringType)))
-      case p: BinaryComparison if p.left.dataType == DateType &&
-                                  p.right.dataType == TimestampType =>
-        p.makeCopy(Array(Cast(p.left, StringType), Cast(p.right, StringType)))
-
-      case p: BinaryComparison if p.left.dataType == StringType &&
-                                  p.right.dataType != StringType =>
-        p.makeCopy(Array(Cast(p.left, DoubleType), p.right))
-      case p: BinaryComparison if p.left.dataType != StringType &&
-                                  p.right.dataType == StringType =>
-        p.makeCopy(Array(p.left, Cast(p.right, DoubleType)))
-
-      case i @ In(a, b) if a.dataType == DateType &&
-                           b.forall(_.dataType == StringType) =>
+      case p @ BinaryComparison(left @ StringType(), right @ DateType()) =>
+        p.makeCopy(Array(left, Cast(right, StringType)))
+      case p @ BinaryComparison(left @ DateType(), right @ StringType()) =>
+        p.makeCopy(Array(Cast(left, StringType), right))
+      case p @ BinaryComparison(left @ StringType(), right @ TimestampType()) =>
+        p.makeCopy(Array(Cast(left, TimestampType), right))
+      case p @ BinaryComparison(left @ TimestampType(), right @ StringType()) =>
+        p.makeCopy(Array(left, Cast(right, TimestampType)))
+      case p @ BinaryComparison(left @ TimestampType(), right @ DateType()) =>
+        p.makeCopy(Array(Cast(left, StringType), Cast(right, StringType)))
+      case p @ BinaryComparison(left @ DateType(), right @ TimestampType()) =>
+        p.makeCopy(Array(Cast(left, StringType), Cast(right, StringType)))
+
+      case p @ BinaryComparison(left @ StringType(), right) if right.dataType != StringType =>
+        p.makeCopy(Array(Cast(left, DoubleType), right))
+      case p @ BinaryComparison(left, right @ StringType()) if left.dataType != StringType =>
+        p.makeCopy(Array(left, Cast(right, DoubleType)))
+
+      case i @ In(a @ DateType(), b) if b.forall(_.dataType == StringType) =>
         i.makeCopy(Array(Cast(a, StringType), b))
-      case i @ In(a, b) if a.dataType == TimestampType &&
-                           b.forall(_.dataType == StringType) =>
+      case i @ In(a @ TimestampType(), b) if b.forall(_.dataType == StringType) =>
         i.makeCopy(Array(a, b.map(Cast(_, TimestampType))))
-      case i @ In(a, b) if a.dataType == DateType &&
-                           b.forall(_.dataType == TimestampType) =>
+      case i @ In(a @ DateType(), b) if b.forall(_.dataType == TimestampType) =>
         i.makeCopy(Array(Cast(a, StringType), b.map(Cast(_, StringType))))
-      case i @ In(a, b) if a.dataType == TimestampType &&
-                           b.forall(_.dataType == DateType) =>
+      case i @ In(a @ TimestampType(), b) if b.forall(_.dataType == DateType) =>
         i.makeCopy(Array(Cast(a, StringType), b.map(Cast(_, StringType))))
 
-      case Sum(e) if e.dataType == StringType =>
-        Sum(Cast(e, DoubleType))
-      case Average(e) if e.dataType == StringType =>
-        Average(Cast(e, DoubleType))
-      case Sqrt(e) if e.dataType == StringType =>
-        Sqrt(Cast(e, DoubleType))
+      case Sum(e @ StringType()) => Sum(Cast(e, DoubleType))
+      case Average(e @ StringType()) => Average(Cast(e, DoubleType))
+      case Sqrt(e @ StringType()) => Sqrt(Cast(e, DoubleType))
     }
   }
 
@@ -379,22 +366,22 @@ trait HiveTypeCoercion {
       // fix decimal precision for union
       case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
         val castedInput = left.output.zip(right.output).map {
-          case (l, r) if l.dataType != r.dataType =>
-            (l.dataType, r.dataType) match {
+          case (lhs, rhs) if lhs.dataType != rhs.dataType =>
+            (lhs.dataType, rhs.dataType) match {
               case (DecimalType.Fixed(p1, s1), DecimalType.Fixed(p2, s2)) =>
                 // Union decimals with precision/scale p1/s2 and p2/s2  will be promoted to
                 // DecimalType(max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2))
                 val fixedType = DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2), max(s1, s2))
-                (Alias(Cast(l, fixedType), l.name)(), Alias(Cast(r, fixedType), r.name)())
+                (Alias(Cast(lhs, fixedType), lhs.name)(), Alias(Cast(rhs, fixedType), rhs.name)())
               case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) =>
-                (Alias(Cast(l, intTypeToFixed(t)), l.name)(), r)
+                (Alias(Cast(lhs, intTypeToFixed(t)), lhs.name)(), rhs)
               case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) =>
-                (l, Alias(Cast(r, intTypeToFixed(t)), r.name)())
+                (lhs, Alias(Cast(rhs, intTypeToFixed(t)), rhs.name)())
               case (t, DecimalType.Fixed(p, s)) if floatTypeToFixed.contains(t) =>
-                (Alias(Cast(l, floatTypeToFixed(t)), l.name)(), r)
+                (Alias(Cast(lhs, floatTypeToFixed(t)), lhs.name)(), rhs)
               case (DecimalType.Fixed(p, s), t) if floatTypeToFixed.contains(t) =>
-                (l, Alias(Cast(r, floatTypeToFixed(t)), r.name)())
-              case _ => (l, r)
+                (lhs, Alias(Cast(rhs, floatTypeToFixed(t)), rhs.name)())
+              case _ => (lhs, rhs)
             }
           case other => other
         }
@@ -467,16 +454,16 @@ trait HiveTypeCoercion {
 
         // Promote integers inside a binary expression with fixed-precision decimals to decimals,
         // and fixed-precision decimals in an expression with floats / doubles to doubles
-        case b: BinaryExpression if b.left.dataType != b.right.dataType =>
-          (b.left.dataType, b.right.dataType) match {
+        case b @ BinaryExpression(left, right) if left.dataType != right.dataType =>
+          (left.dataType, right.dataType) match {
             case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) =>
-              b.makeCopy(Array(Cast(b.left, intTypeToFixed(t)), b.right))
+              b.makeCopy(Array(Cast(left, intTypeToFixed(t)), right))
             case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) =>
-              b.makeCopy(Array(b.left, Cast(b.right, intTypeToFixed(t))))
+              b.makeCopy(Array(left, Cast(right, intTypeToFixed(t))))
             case (t, DecimalType.Fixed(p, s)) if isFloat(t) =>
-              b.makeCopy(Array(b.left, Cast(b.right, DoubleType)))
+              b.makeCopy(Array(left, Cast(right, DoubleType)))
             case (DecimalType.Fixed(p, s), t) if isFloat(t) =>
-              b.makeCopy(Array(Cast(b.left, DoubleType), b.right))
+              b.makeCopy(Array(Cast(left, DoubleType), right))
             case _ =>
               b
           }
@@ -525,31 +512,31 @@ trait HiveTypeCoercion {
       // all other cases are considered as false.
 
       // We may simplify the expression if one side is literal numeric values
-      case EqualTo(l @ BooleanType(), Literal(value, _: NumericType))
-        if trueValues.contains(value) => l
-      case EqualTo(l @ BooleanType(), Literal(value, _: NumericType))
-        if falseValues.contains(value) => Not(l)
-      case EqualTo(Literal(value, _: NumericType), r @ BooleanType())
-        if trueValues.contains(value) => r
-      case EqualTo(Literal(value, _: NumericType), r @ BooleanType())
-        if falseValues.contains(value) => Not(r)
-      case EqualNullSafe(l @ BooleanType(), Literal(value, _: NumericType))
-        if trueValues.contains(value) => And(IsNotNull(l), l)
-      case EqualNullSafe(l @ BooleanType(), Literal(value, _: NumericType))
-        if falseValues.contains(value) => And(IsNotNull(l), Not(l))
-      case EqualNullSafe(Literal(value, _: NumericType), r @ BooleanType())
-        if trueValues.contains(value) => And(IsNotNull(r), r)
-      case EqualNullSafe(Literal(value, _: NumericType), r @ BooleanType())
-        if falseValues.contains(value) => And(IsNotNull(r), Not(r))
-
-      case EqualTo(l @ BooleanType(), r @ NumericType()) =>
-        transform(l , r)
-      case EqualTo(l @ NumericType(), r @ BooleanType()) =>
-        transform(r, l)
-      case EqualNullSafe(l @ BooleanType(), r @ NumericType()) =>
-        transformNullSafe(l, r)
-      case EqualNullSafe(l @ NumericType(), r @ BooleanType()) =>
-        transformNullSafe(r, l)
+      case EqualTo(left @ BooleanType(), Literal(value, _: NumericType))
+        if trueValues.contains(value) => left
+      case EqualTo(left @ BooleanType(), Literal(value, _: NumericType))
+        if falseValues.contains(value) => Not(left)
+      case EqualTo(Literal(value, _: NumericType), right @ BooleanType())
+        if trueValues.contains(value) => right
+      case EqualTo(Literal(value, _: NumericType), right @ BooleanType())
+        if falseValues.contains(value) => Not(right)
+      case EqualNullSafe(left @ BooleanType(), Literal(value, _: NumericType))
+        if trueValues.contains(value) => And(IsNotNull(left), left)
+      case EqualNullSafe(left @ BooleanType(), Literal(value, _: NumericType))
+        if falseValues.contains(value) => And(IsNotNull(left), Not(left))
+      case EqualNullSafe(Literal(value, _: NumericType), right @ BooleanType())
+        if trueValues.contains(value) => And(IsNotNull(right), right)
+      case EqualNullSafe(Literal(value, _: NumericType), right @ BooleanType())
+        if falseValues.contains(value) => And(IsNotNull(right), Not(right))
+
+      case EqualTo(left @ BooleanType(), right @ NumericType()) =>
+        transform(left , right)
+      case EqualTo(left @ NumericType(), right @ BooleanType()) =>
+        transform(right, left)
+      case EqualNullSafe(left @ BooleanType(), right @ NumericType()) =>
+        transformNullSafe(left, right)
+      case EqualNullSafe(left @ NumericType(), right @ BooleanType()) =>
+        transformNullSafe(right, left)
     }
   }
 
@@ -630,7 +617,7 @@ trait HiveTypeCoercion {
       case d: Divide if d.dataType == DoubleType => d
       case d: Divide if d.dataType.isInstanceOf[DecimalType] => d
 
-      case Divide(l, r) => Divide(Cast(l, DoubleType), Cast(r, DoubleType))
+      case Divide(left, right) => Divide(Cast(left, DoubleType), Cast(right, DoubleType))
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 3cf851aec15ea..b2b9d1a5e1581 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -118,6 +118,10 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
   override def toString: String = s"($left $symbol $right)"
 }
 
+private[sql] object BinaryExpression {
+  def unapply(e: BinaryExpression): Option[(Expression, Expression)] = Some((e.left, e.right))
+}
+
 abstract class LeafExpression extends Expression with trees.LeafNode[Expression] {
   self: Product =>
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 2ac53f8f6613f..a3770f998d94d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -118,6 +118,10 @@ abstract class BinaryArithmetic extends BinaryExpression {
     sys.error(s"BinaryArithmetics must override either eval or evalInternal")
 }
 
+private[sql] object BinaryArithmetic {
+  def unapply(e: BinaryArithmetic): Option[(Expression, Expression)] = Some((e.left, e.right))
+}
+
 case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "+"
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 807021d50e8e0..58273b166fe91 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -202,9 +202,8 @@ abstract class BinaryComparison extends BinaryExpression with Predicate {
     sys.error(s"BinaryComparisons must override either eval or evalInternal")
 }
 
-object BinaryComparison {
-  def unapply(b: BinaryComparison): Option[(Expression, Expression)] =
-    Some((b.left, b.right))
+private[sql] object BinaryComparison {
+  def unapply(e: BinaryComparison): Option[(Expression, Expression)] = Some((e.left, e.right))
 }
 
 case class EqualTo(left: Expression, right: Expression) extends BinaryComparison {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 0a17b10c521e5..c16f08d389955 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -266,7 +266,7 @@ object NullPropagation extends Rule[LogicalPlan] {
         if (newChildren.length == 0) {
           Literal.create(null, e.dataType)
         } else if (newChildren.length == 1) {
-          newChildren(0)
+          newChildren.head
         } else {
           Coalesce(newChildren)
         }
@@ -280,21 +280,18 @@ object NullPropagation extends Rule[LogicalPlan] {
       case e: MinOf => e
 
       // Put exceptional cases above if any
-      case e: BinaryArithmetic => e.children match {
-        case Literal(null, _) :: right :: Nil => Literal.create(null, e.dataType)
-        case left :: Literal(null, _) :: Nil => Literal.create(null, e.dataType)
-        case _ => e
-      }
-      case e: BinaryComparison => e.children match {
-        case Literal(null, _) :: right :: Nil => Literal.create(null, e.dataType)
-        case left :: Literal(null, _) :: Nil => Literal.create(null, e.dataType)
-        case _ => e
-      }
+      case e @ BinaryArithmetic(Literal(null, _), _) => Literal.create(null, e.dataType)
+      case e @ BinaryArithmetic(_, Literal(null, _)) => Literal.create(null, e.dataType)
+
+      case e @ BinaryComparison(Literal(null, _), _) => Literal.create(null, e.dataType)
+      case e @ BinaryComparison(_, Literal(null, _)) => Literal.create(null, e.dataType)
+
       case e: StringRegexExpression => e.children match {
         case Literal(null, _) :: right :: Nil => Literal.create(null, e.dataType)
         case left :: Literal(null, _) :: Nil => Literal.create(null, e.dataType)
         case _ => e
       }
+
       case e: StringComparison => e.children match {
         case Literal(null, _) :: right :: Nil => Literal.create(null, e.dataType)
         case left :: Literal(null, _) :: Nil => Literal.create(null, e.dataType)

From 12f5eaeee1235850a076ce5716d069bd2f1205a5 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Fri, 5 Jun 2015 10:19:03 -0700
Subject: [PATCH 377/525] [SPARK-8085] [SPARKR] Support user-specified schema
 in read.df

cc davies sun-rui

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6620 from shivaram/sparkr-read-schema and squashes the following commits:

16a6726 [Shivaram Venkataraman] Fix loadDF to pass schema Also add a unit test
a229877 [Shivaram Venkataraman] Use wrapper function to DataFrameReader
ee70ba8 [Shivaram Venkataraman] Support user-specified schema in read.df
---
 R/pkg/R/SQLContext.R                              | 14 ++++++++++----
 R/pkg/inst/tests/test_sparkSQL.R                  | 13 +++++++++++++
 .../org/apache/spark/sql/api/r/SQLUtils.scala     | 15 +++++++++++++++
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 88e1a508f37c4..22a4b5bf86ebd 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -452,7 +452,7 @@ dropTempTable <- function(sqlContext, tableName) {
 #' df <- read.df(sqlContext, "path/to/file.json", source = "json")
 #' }
 
-read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
+read.df <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
     options[['path']] <- path
@@ -462,15 +462,21 @@ read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
     source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
                           "org.apache.spark.sql.parquet")
   }
-  sdf <- callJMethod(sqlContext, "load", source, options)
+  if (!is.null(schema)) {
+    stopifnot(class(schema) == "structType")
+    sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sqlContext, source,
+                       schema$jobj, options)
+  } else {
+    sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sqlContext, source, options)
+  }
   dataFrame(sdf)
 }
 
 #' @aliases loadDF
 #' @export
 
-loadDF <- function(sqlContext, path = NULL, source = NULL, ...) {
-  read.df(sqlContext, path, source, ...)
+loadDF <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
+  read.df(sqlContext, path, source, schema, ...)
 }
 
 #' Create an external table
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index d2d82e791e876..30edfc8a7bd94 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -504,6 +504,19 @@ test_that("read.df() from json file", {
   df <- read.df(sqlContext, jsonPath, "json")
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 3)
+
+  # Check if we can apply a user defined schema
+  schema <- structType(structField("name", type = "string"),
+                       structField("age", type = "double"))
+
+  df1 <- read.df(sqlContext, jsonPath, "json", schema)
+  expect_true(inherits(df1, "DataFrame"))
+  expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
+
+  # Run the same with loadDF
+  df2 <- loadDF(sqlContext, jsonPath, "json", schema)
+  expect_true(inherits(df2, "DataFrame"))
+  expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
 })
 
 test_that("write.df() as parquet file", {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
index 604f3124e23ae..43b62f0e822f8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
@@ -139,4 +139,19 @@ private[r] object SQLUtils {
       case "ignore" => SaveMode.Ignore
     }
   }
+
+  def loadDF(
+      sqlContext: SQLContext,
+      source: String,
+      options: java.util.Map[String, String]): DataFrame = {
+    sqlContext.read.format(source).options(options).load()
+  }
+
+  def loadDF(
+      sqlContext: SQLContext,
+      source: String,
+      schema: StructType,
+      options: java.util.Map[String, String]): DataFrame = {
+    sqlContext.read.format(source).schema(schema).options(options).load()
+  }
 }

From 4036d05ceeec77ebfa9c683cbc699250df3e3895 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 5 Jun 2015 10:53:32 -0700
Subject: [PATCH 378/525] Revert "[MINOR] [BUILD] Use custom temp directory
 during build."

This reverts commit b16b5434ff44c42e4b3a337f9af147669ba44896.
---
 .../spark/deploy/SparkSubmitUtilsSuite.scala  | 22 +++++++++----------
 pom.xml                                       |  4 +---
 project/SparkBuild.scala                      |  1 -
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index 07d261cc428c4..8fda5c8b472c9 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -28,12 +28,9 @@ import org.apache.ivy.plugins.resolver.IBiblioResolver
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
-import org.apache.spark.util.Utils
 
 class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
 
-  private var tempIvyPath: String = _
-
   private val noOpOutputStream = new OutputStream {
     def write(b: Int) = {}
   }
@@ -50,7 +47,6 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
     super.beforeAll()
     // We don't want to write logs during testing
     SparkSubmitUtils.printStream = new BufferPrintStream
-    tempIvyPath = Utils.createTempDir(namePrefix = "ivy").getAbsolutePath()
   }
 
   test("incorrect maven coordinate throws error") {
@@ -94,20 +90,21 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
   }
 
   test("ivy path works correctly") {
+    val ivyPath = "dummy" + File.separator +  "ivy"
     val md = SparkSubmitUtils.getModuleDescriptor
     val artifacts = for (i <- 0 until 3) yield new MDArtifact(md, s"jar-$i", "jar", "jar")
-    var jPaths = SparkSubmitUtils.resolveDependencyPaths(artifacts.toArray, new File(tempIvyPath))
+    var jPaths = SparkSubmitUtils.resolveDependencyPaths(artifacts.toArray, new File(ivyPath))
     for (i <- 0 until 3) {
-      val index = jPaths.indexOf(tempIvyPath)
+      val index = jPaths.indexOf(ivyPath)
       assert(index >= 0)
-      jPaths = jPaths.substring(index + tempIvyPath.length)
+      jPaths = jPaths.substring(index + ivyPath.length)
     }
     val main = MavenCoordinate("my.awesome.lib", "mylib", "0.1")
     IvyTestUtils.withRepository(main, None, None) { repo =>
       // end to end
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, Option(repo),
-        Option(tempIvyPath), true)
-      assert(jarPath.indexOf(tempIvyPath) >= 0, "should use non-default ivy path")
+        Option(ivyPath), true)
+      assert(jarPath.indexOf(ivyPath) >= 0, "should use non-default ivy path")
     }
   }
 
@@ -126,12 +123,13 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
     }
     // Local ivy repository with modified home
-    val dummyIvyLocal = new File(tempIvyPath, "local" + File.separator)
+    val dummyIvyPath = "dummy" + File.separator + "ivy"
+    val dummyIvyLocal = new File(dummyIvyPath, "local" + File.separator)
     IvyTestUtils.withRepository(main, None, Some(dummyIvyLocal), true) { repo =>
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None,
-        Some(tempIvyPath), true)
+        Some(dummyIvyPath), true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
-      assert(jarPath.indexOf(tempIvyPath) >= 0, "should be in new ivy path")
+      assert(jarPath.indexOf(dummyIvyPath) >= 0, "should be in new ivy path")
     }
   }
 
diff --git a/pom.xml b/pom.xml
index a848deffe7375..e28d4b9fc2b17 100644
--- a/pom.xml
+++ b/pom.xml
@@ -179,7 +179,7 @@
     <parquet.deps.scope>compile</parquet.deps.scope>
 
     <!--
-      Overridable test home. So that you can call individual pom files directly without
+      Overridable test home. So that you can call individual pom files directory without
       things breaking.
     -->
     <spark.test.home>${session.executionRootDirectory}</spark.test.home>
@@ -1256,7 +1256,6 @@
             <systemProperties>
               <derby.system.durability>test</derby.system.durability>
               <java.awt.headless>true</java.awt.headless>
-              <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir>
               <spark.test.home>${spark.test.home}</spark.test.home>
               <spark.testing>1</spark.testing>
               <spark.ui.enabled>false</spark.ui.enabled>
@@ -1290,7 +1289,6 @@
             <systemProperties>
               <derby.system.durability>test</derby.system.durability>
               <java.awt.headless>true</java.awt.headless>
-              <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir>
               <spark.test.home>${spark.test.home}</spark.test.home>
               <spark.testing>1</spark.testing>
               <spark.ui.enabled>false</spark.ui.enabled>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 921f1599fedef..ef3a175bac209 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -496,7 +496,6 @@ object TestSettings {
       "SPARK_DIST_CLASSPATH" ->
         (fullClasspath in Test).value.files.map(_.getAbsolutePath).mkString(":").stripSuffix(":"),
       "JAVA_HOME" -> sys.env.get("JAVA_HOME").getOrElse(sys.props("java.home"))),
-    javaOptions in Test += s"-Djava.io.tmpdir=$sparkHome/target/tmp",
     javaOptions in Test += "-Dspark.test.home=" + sparkHome,
     javaOptions in Test += "-Dspark.testing=1",
     javaOptions in Test += "-Dspark.port.maxRetries=100",

From 0992a0a77d38081c6c206bb34333013125d85376 Mon Sep 17 00:00:00 2001
From: Xutingjun <xutingjun@huawei.com>
Date: Fri, 5 Jun 2015 11:41:39 -0700
Subject: [PATCH 379/525] [SPARK-8099] set executor cores into system in
 yarn-cluster mode

Author: Xutingjun <xutingjun@huawei.com>
Author: xutingjun <xutingjun@huawei.com>

Closes #6643 from XuTingjun/SPARK-8099 and squashes the following commits:

80b18cd [Xutingjun] change to STANDALONE | YARN
ce33148 [Xutingjun] set executor cores into system
e51cc9e [Xutingjun] set executor cores into system
0600861 [xutingjun] set executor cores into system
---
 core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 3aa3f948e865d..a0eae774268ed 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -425,7 +425,6 @@ object SparkSubmit {
       // Yarn client only
       OptionAssigner(args.queue, YARN, CLIENT, sysProp = "spark.yarn.queue"),
       OptionAssigner(args.numExecutors, YARN, CLIENT, sysProp = "spark.executor.instances"),
-      OptionAssigner(args.executorCores, YARN, CLIENT, sysProp = "spark.executor.cores"),
       OptionAssigner(args.files, YARN, CLIENT, sysProp = "spark.yarn.dist.files"),
       OptionAssigner(args.archives, YARN, CLIENT, sysProp = "spark.yarn.dist.archives"),
       OptionAssigner(args.principal, YARN, CLIENT, sysProp = "spark.yarn.principal"),
@@ -446,7 +445,7 @@ object SparkSubmit {
       OptionAssigner(args.keytab, YARN, CLUSTER, clOption = "--keytab"),
 
       // Other options
-      OptionAssigner(args.executorCores, STANDALONE, ALL_DEPLOY_MODES,
+      OptionAssigner(args.executorCores, STANDALONE | YARN, ALL_DEPLOY_MODES,
         sysProp = "spark.executor.cores"),
       OptionAssigner(args.executorMemory, STANDALONE | MESOS | YARN, ALL_DEPLOY_MODES,
         sysProp = "spark.executor.memory"),

From 3f80bc841ab155925fb0530eef5927990f4a5793 Mon Sep 17 00:00:00 2001
From: jerryshao <saisai.shao@intel.com>
Date: Fri, 5 Jun 2015 12:28:37 -0700
Subject: [PATCH 380/525] [SPARK-7699] [CORE] Lazy start the scheduler for
 dynamic allocation

This patch propose to lazy start the scheduler for dynamic allocation to avoid fast ramp down executor numbers is load is less.

This implementation will:
1. immediately start the scheduler is `numExecutorsTarget` is 0, this is the expected behavior.
2. if `numExecutorsTarget` is not zero, start the scheduler until the number is satisfied, if the load is less, this initial started executors will last for at least 60 seconds, user will have a window to submit a job, no need to revamp the executors.
3. if `numExecutorsTarget` is not satisfied until the timeout, this means resource is not enough, the scheduler will start until this timeout, will not wait infinitely.

Please help to review, thanks a lot.

Author: jerryshao <saisai.shao@intel.com>

Closes #6430 from jerryshao/SPARK-7699 and squashes the following commits:

02cac8e [jerryshao] Address the comments
7242450 [jerryshao] Remove the useless import
ecc0b00 [jerryshao] Address the comments
6f75f00 [jerryshao] Style changes
8b8decc [jerryshao] change the test name
fb822ca [jerryshao] Change the solution according to comments
1cc74e5 [jerryshao] Lazy start the scheduler for dynamic allocation
---
 .../spark/ExecutorAllocationManager.scala     | 17 +++-
 .../ExecutorAllocationManagerSuite.scala      | 90 +++++++++++++++----
 2 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
index f7323a4d9db72..9939103bb0903 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
@@ -150,6 +150,13 @@ private[spark] class ExecutorAllocationManager(
   // Metric source for ExecutorAllocationManager to expose internal status to MetricsSystem.
   val executorAllocationManagerSource = new ExecutorAllocationManagerSource
 
+  // Whether we are still waiting for the initial set of executors to be allocated.
+  // While this is true, we will not cancel outstanding executor requests. This is
+  // set to false when:
+  //   (1) a stage is submitted, or
+  //   (2) an executor idle timeout has elapsed.
+  @volatile private var initializing: Boolean = true
+
   /**
    * Verify that the settings specified through the config are valid.
    * If not, throw an appropriate exception.
@@ -240,6 +247,7 @@ private[spark] class ExecutorAllocationManager(
     removeTimes.retain { case (executorId, expireTime) =>
       val expired = now >= expireTime
       if (expired) {
+        initializing = false
         removeExecutor(executorId)
       }
       !expired
@@ -261,7 +269,11 @@ private[spark] class ExecutorAllocationManager(
   private def updateAndSyncNumExecutorsTarget(now: Long): Int = synchronized {
     val maxNeeded = maxNumExecutorsNeeded
 
-    if (maxNeeded < numExecutorsTarget) {
+    if (initializing) {
+      // Do not change our target while we are still initializing,
+      // Otherwise the first job may have to ramp up unnecessarily
+      0
+    } else if (maxNeeded < numExecutorsTarget) {
       // The target number exceeds the number we actually need, so stop adding new
       // executors and inform the cluster manager to cancel the extra pending requests
       val oldNumExecutorsTarget = numExecutorsTarget
@@ -271,7 +283,7 @@ private[spark] class ExecutorAllocationManager(
       // If the new target has not changed, avoid sending a message to the cluster manager
       if (numExecutorsTarget < oldNumExecutorsTarget) {
         client.requestTotalExecutors(numExecutorsTarget)
-        logInfo(s"Lowering target number of executors to $numExecutorsTarget (previously " +
+        logDebug(s"Lowering target number of executors to $numExecutorsTarget (previously " +
           s"$oldNumExecutorsTarget) because not all requested executors are actually needed")
       }
       numExecutorsTarget - oldNumExecutorsTarget
@@ -481,6 +493,7 @@ private[spark] class ExecutorAllocationManager(
     private var numRunningTasks: Int = _
 
     override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
+      initializing = false
       val stageId = stageSubmitted.stageInfo.stageId
       val numTasks = stageSubmitted.stageInfo.numTasks
       allocationManager.synchronized {
diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
index 1c2b681f0b843..803e1831bb269 100644
--- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
@@ -90,7 +90,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("add executors") {
-    sc = createSparkContext(1, 10)
+    sc = createSparkContext(1, 10, 1)
     val manager = sc.executorAllocationManager.get
     sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(0, 1000)))
 
@@ -135,7 +135,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("add executors capped by num pending tasks") {
-    sc = createSparkContext(0, 10)
+    sc = createSparkContext(0, 10, 0)
     val manager = sc.executorAllocationManager.get
     sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(0, 5)))
 
@@ -186,7 +186,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("cancel pending executors when no longer needed") {
-    sc = createSparkContext(0, 10)
+    sc = createSparkContext(0, 10, 0)
     val manager = sc.executorAllocationManager.get
     sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(2, 5)))
 
@@ -213,7 +213,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("remove executors") {
-    sc = createSparkContext(5, 10)
+    sc = createSparkContext(5, 10, 5)
     val manager = sc.executorAllocationManager.get
     (1 to 10).map(_.toString).foreach { id => onExecutorAdded(manager, id) }
 
@@ -263,7 +263,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test ("interleaving add and remove") {
-    sc = createSparkContext(5, 10)
+    sc = createSparkContext(5, 10, 5)
     val manager = sc.executorAllocationManager.get
     sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(0, 1000)))
 
@@ -331,7 +331,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("starting/canceling add timer") {
-    sc = createSparkContext(2, 10)
+    sc = createSparkContext(2, 10, 2)
     val clock = new ManualClock(8888L)
     val manager = sc.executorAllocationManager.get
     manager.setClock(clock)
@@ -363,7 +363,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("starting/canceling remove timers") {
-    sc = createSparkContext(2, 10)
+    sc = createSparkContext(2, 10, 2)
     val clock = new ManualClock(14444L)
     val manager = sc.executorAllocationManager.get
     manager.setClock(clock)
@@ -410,7 +410,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("mock polling loop with no events") {
-    sc = createSparkContext(0, 20)
+    sc = createSparkContext(0, 20, 0)
     val manager = sc.executorAllocationManager.get
     val clock = new ManualClock(2020L)
     manager.setClock(clock)
@@ -436,7 +436,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("mock polling loop add behavior") {
-    sc = createSparkContext(0, 20)
+    sc = createSparkContext(0, 20, 0)
     val clock = new ManualClock(2020L)
     val manager = sc.executorAllocationManager.get
     manager.setClock(clock)
@@ -486,7 +486,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("mock polling loop remove behavior") {
-    sc = createSparkContext(1, 20)
+    sc = createSparkContext(1, 20, 1)
     val clock = new ManualClock(2020L)
     val manager = sc.executorAllocationManager.get
     manager.setClock(clock)
@@ -547,7 +547,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("listeners trigger add executors correctly") {
-    sc = createSparkContext(2, 10)
+    sc = createSparkContext(2, 10, 2)
     val manager = sc.executorAllocationManager.get
     assert(addTime(manager) === NOT_SET)
 
@@ -577,7 +577,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("listeners trigger remove executors correctly") {
-    sc = createSparkContext(2, 10)
+    sc = createSparkContext(2, 10, 2)
     val manager = sc.executorAllocationManager.get
     assert(removeTimes(manager).isEmpty)
 
@@ -608,7 +608,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("listeners trigger add and remove executor callbacks correctly") {
-    sc = createSparkContext(2, 10)
+    sc = createSparkContext(2, 10, 2)
     val manager = sc.executorAllocationManager.get
     assert(executorIds(manager).isEmpty)
     assert(removeTimes(manager).isEmpty)
@@ -641,7 +641,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("SPARK-4951: call onTaskStart before onBlockManagerAdded") {
-    sc = createSparkContext(2, 10)
+    sc = createSparkContext(2, 10, 2)
     val manager = sc.executorAllocationManager.get
     assert(executorIds(manager).isEmpty)
     assert(removeTimes(manager).isEmpty)
@@ -677,7 +677,7 @@ class ExecutorAllocationManagerSuite
   }
 
   test("avoid ramp up when target < running executors") {
-    sc = createSparkContext(0, 100000)
+    sc = createSparkContext(0, 100000, 0)
     val manager = sc.executorAllocationManager.get
     val stage1 = createStageInfo(0, 1000)
     sc.listenerBus.postToAll(SparkListenerStageSubmitted(stage1))
@@ -701,13 +701,67 @@ class ExecutorAllocationManagerSuite
     assert(numExecutorsTarget(manager) === 16)
   }
 
-  private def createSparkContext(minExecutors: Int = 1, maxExecutors: Int = 5): SparkContext = {
+  test("avoid ramp down initial executors until first job is submitted") {
+    sc = createSparkContext(2, 5, 3)
+    val manager = sc.executorAllocationManager.get
+    val clock = new ManualClock(10000L)
+    manager.setClock(clock)
+
+    // Verify the initial number of executors
+    assert(numExecutorsTarget(manager) === 3)
+    schedule(manager)
+    // Verify whether the initial number of executors is kept with no pending tasks
+    assert(numExecutorsTarget(manager) === 3)
+
+    sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(1, 2)))
+    clock.advance(100L)
+
+    assert(maxNumExecutorsNeeded(manager) === 2)
+    schedule(manager)
+
+    // Verify that current number of executors should be ramp down when first job is submitted
+    assert(numExecutorsTarget(manager) === 2)
+  }
+
+  test("avoid ramp down initial executors until idle executor is timeout") {
+    sc = createSparkContext(2, 5, 3)
+    val manager = sc.executorAllocationManager.get
+    val clock = new ManualClock(10000L)
+    manager.setClock(clock)
+
+    // Verify the initial number of executors
+    assert(numExecutorsTarget(manager) === 3)
+    schedule(manager)
+    // Verify the initial number of executors is kept when no pending tasks
+    assert(numExecutorsTarget(manager) === 3)
+    (0 until 3).foreach { i =>
+      onExecutorAdded(manager, s"executor-$i")
+    }
+
+    clock.advance(executorIdleTimeout * 1000)
+
+    assert(maxNumExecutorsNeeded(manager) === 0)
+    schedule(manager)
+    // Verify executor is timeout but numExecutorsTarget is not recalculated
+    assert(numExecutorsTarget(manager) === 3)
+
+    // Schedule again to recalculate the numExecutorsTarget after executor is timeout
+    schedule(manager)
+    // Verify that current number of executors should be ramp down when executor is timeout
+    assert(numExecutorsTarget(manager) === 2)
+  }
+
+  private def createSparkContext(
+      minExecutors: Int = 1,
+      maxExecutors: Int = 5,
+      initialExecutors: Int = 1): SparkContext = {
     val conf = new SparkConf()
       .setMaster("local")
       .setAppName("test-executor-allocation-manager")
       .set("spark.dynamicAllocation.enabled", "true")
       .set("spark.dynamicAllocation.minExecutors", minExecutors.toString)
       .set("spark.dynamicAllocation.maxExecutors", maxExecutors.toString)
+      .set("spark.dynamicAllocation.initialExecutors", initialExecutors.toString)
       .set("spark.dynamicAllocation.schedulerBacklogTimeout",
           s"${schedulerBacklogTimeout.toString}s")
       .set("spark.dynamicAllocation.sustainedSchedulerBacklogTimeout",
@@ -791,6 +845,10 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester {
     manager invokePrivate _schedule()
   }
 
+  private def maxNumExecutorsNeeded(manager: ExecutorAllocationManager): Int = {
+    manager invokePrivate _maxNumExecutorsNeeded()
+  }
+
   private def addExecutors(manager: ExecutorAllocationManager): Int = {
     val maxNumExecutorsNeeded = manager invokePrivate _maxNumExecutorsNeeded()
     manager invokePrivate _addExecutors(maxNumExecutorsNeeded)

From 4f16d3fe2e260a716b5b4e4005cb6229386440ed Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Fri, 5 Jun 2015 12:46:02 -0700
Subject: [PATCH 381/525] [SPARK-8112] [STREAMING] Fix the negative event count
 issue

Author: zsxwing <zsxwing@gmail.com>

Closes #6659 from zsxwing/SPARK-8112 and squashes the following commits:

a5d7da6 [zsxwing] Address comments
d255b6e [zsxwing] Fix the negative event count issue
---
 .../apache/spark/streaming/dstream/ReceiverInputDStream.scala | 2 +-
 .../spark/streaming/receiver/ReceiverSupervisorImpl.scala     | 4 ++--
 .../apache/spark/streaming/scheduler/InputInfoTracker.scala   | 4 +++-
 .../apache/spark/streaming/scheduler/ReceivedBlockInfo.scala  | 4 +++-
 .../apache/spark/streaming/ReceivedBlockTrackerSuite.scala    | 2 +-
 5 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index e4ff05e12f201..e76e7eb0dea19 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -70,7 +70,7 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont
         val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
 
         // Register the input blocks information into InputInfoTracker
-        val inputInfo = InputInfo(id, blockInfos.map(_.numRecords).sum)
+        val inputInfo = InputInfo(id, blockInfos.flatMap(_.numRecords).sum)
         ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
 
         if (blockInfos.nonEmpty) {
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
index 92938379b9c17..8be732b64e3a3 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
@@ -138,8 +138,8 @@ private[streaming] class ReceiverSupervisorImpl(
     ) {
     val blockId = blockIdOption.getOrElse(nextBlockId)
     val numRecords = receivedBlock match {
-      case ArrayBufferBlock(arrayBuffer) => arrayBuffer.size
-      case _ => -1
+      case ArrayBufferBlock(arrayBuffer) => Some(arrayBuffer.size.toLong)
+      case _ => None
     }
 
     val time = System.currentTimeMillis
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala
index a72efccf2f994..7c0db8a863c67 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala
@@ -23,7 +23,9 @@ import org.apache.spark.Logging
 import org.apache.spark.streaming.{Time, StreamingContext}
 
 /** To track the information of input stream at specified batch time. */
-private[streaming] case class InputInfo(inputStreamId: Int, numRecords: Long)
+private[streaming] case class InputInfo(inputStreamId: Int, numRecords: Long) {
+  require(numRecords >= 0, "numRecords must not be negative")
+}
 
 /**
  * This class manages all the input streams as well as their input data statistics. The information
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockInfo.scala
index dc11e84f29965..656ac80df8979 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockInfo.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockInfo.scala
@@ -24,11 +24,13 @@ import org.apache.spark.streaming.util.WriteAheadLogRecordHandle
 /** Information about blocks received by the receiver */
 private[streaming] case class ReceivedBlockInfo(
     streamId: Int,
-    numRecords: Long,
+    numRecords: Option[Long],
     metadataOption: Option[Any],
     blockStoreResult: ReceivedBlockStoreResult
   ) {
 
+  require(numRecords.isEmpty || numRecords.get >= 0, "numRecords must not be negative")
+
   @volatile private var _isBlockIdValid = true
 
   def blockId: StreamBlockId = blockStoreResult.blockId
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala
index 6f0ee774cb5cf..be305b5e0dfea 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala
@@ -224,7 +224,7 @@ class ReceivedBlockTrackerSuite
 
   /** Generate blocks infos using random ids */
   def generateBlockInfos(): Seq[ReceivedBlockInfo] = {
-    List.fill(5)(ReceivedBlockInfo(streamId, 0, None,
+    List.fill(5)(ReceivedBlockInfo(streamId, Some(0L), None,
       BlockManagerBasedStoreResult(StreamBlockId(streamId, math.abs(Random.nextInt)))))
   }
 

From 4060526cd3b7e9ba345ce94f6e081cc1156e53ab Mon Sep 17 00:00:00 2001
From: Luca Martinetti <luca@luca.io>
Date: Fri, 5 Jun 2015 13:40:11 -0700
Subject: [PATCH 382/525] [SPARK-7747] [SQL] [DOCS] 
 spark.sql.planner.externalSort

Add documentation for spark.sql.planner.externalSort

Author: Luca Martinetti <luca@luca.io>

Closes #6272 from lucamartinetti/docs-externalsort and squashes the following commits:

985661b [Luca Martinetti] [SPARK-7747] [SQL] [DOCS] Add documentation for spark.sql.planner.externalSort
---
 docs/sql-programming-guide.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 282ea75e1e785..cde5830c733e0 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1785,6 +1785,13 @@ that these options will be deprecated in future release as more optimizations ar
       Configures the number of partitions to use when shuffling data for joins or aggregations.
     </td>
   </tr>
+   <tr>
+    <td><code>spark.sql.planner.externalSort</code></td>
+    <td>false</td>
+    <td>
+      When true, performs sorts spilling to disk as needed otherwise sort each partition in memory.
+    </td>
+  </tr>
 </table>
 
 # Distributed SQL Engine

From 356a4a9b93a1eeedb910c6bccc0abadf59e4877f Mon Sep 17 00:00:00 2001
From: amey <amey@skytree.net>
Date: Fri, 5 Jun 2015 13:49:33 -0700
Subject: [PATCH 383/525] [SPARK-7991] [PySpark] Adding support for passing
 lists to describe.

This is a minor change.

Author: amey <amey@skytree.net>

Closes #6655 from ameyc/JIRA-7991/support-passing-list-to-describe and squashes the following commits:

e8a1dff [amey] Adding support for passing lists to describe.
---
 python/pyspark/sql/dataframe.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 03b01a1136e45..902504df5b11b 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -616,7 +616,19 @@ def describe(self, *cols):
         |    min|  2|
         |    max|  5|
         +-------+---+
+        >>> df.describe(['age', 'name']).show()
+        +-------+---+-----+
+        |summary|age| name|
+        +-------+---+-----+
+        |  count|  2|    2|
+        |   mean|3.5| null|
+        | stddev|1.5| null|
+        |    min|  2|Alice|
+        |    max|  5|  Bob|
+        +-------+---+-----+
         """
+        if len(cols) == 1 and isinstance(cols[0], list):
+            cols = cols[0]
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)
 

From 6ebe419f335fcfb66dd3da74baf35eb5b2fc061d Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 5 Jun 2015 13:57:21 -0700
Subject: [PATCH 384/525] [SPARK-8114][SQL] Remove some wildcard import on
 TestSQLContext._ cont'd.

Fixed the following packages:
sql.columnar
sql.jdbc
sql.json
sql.parquet

Author: Reynold Xin <rxin@databricks.com>

Closes #6667 from rxin/testsqlcontext_wildcard and squashes the following commits:

134a776 [Reynold Xin] Fixed compilation break.
6da7b69 [Reynold Xin] [SPARK-8114][SQL] Remove some wildcard import on TestSQLContext._ cont'd.
---
 .../columnar/InMemoryColumnarQuerySuite.scala | 40 ++++----
 .../columnar/PartitionBatchPruningSuite.scala | 28 +++---
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 45 +++++----
 .../spark/sql/jdbc/JDBCWriteSuite.scala       | 75 +++++++--------
 .../org/apache/spark/sql/json/JsonSuite.scala | 95 +++++++++----------
 .../apache/spark/sql/json/TestJsonData.scala  | 62 ++++++------
 .../sql/parquet/ParquetFilterSuite.scala      |  7 +-
 .../spark/sql/parquet/ParquetIOSuite.scala    | 40 ++++----
 .../ParquetPartitionDiscoverySuite.scala      | 27 +++---
 .../spark/sql/parquet/ParquetQuerySuite.scala | 24 ++---
 .../sql/parquet/ParquetSchemaSuite.scala      |  3 +-
 .../spark/sql/parquet/ParquetTest.scala       |  6 +-
 .../apache/spark/sql/test/SQLTestUtils.scala  | 14 ++-
 .../spark/sql/hive/orc/OrcQuerySuite.scala    |  5 +-
 .../apache/spark/sql/hive/orc/OrcTest.scala   |  8 +-
 15 files changed, 234 insertions(+), 245 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
index 055453e688e73..fa3b8144c086e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
@@ -21,8 +21,6 @@ import java.sql.{Date, Timestamp}
 
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.catalyst.expressions.Row
-import org.apache.spark.sql.test.TestSQLContext._
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{QueryTest, TestData}
 import org.apache.spark.storage.StorageLevel.MEMORY_ONLY
@@ -31,8 +29,12 @@ class InMemoryColumnarQuerySuite extends QueryTest {
   // Make sure the tables are loaded.
   TestData
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+  import ctx.{logicalPlanToSparkQuery, sql}
+
   test("simple columnar query") {
-    val plan = executePlan(testData.logicalPlan).executedPlan
+    val plan = ctx.executePlan(testData.logicalPlan).executedPlan
     val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
 
     checkAnswer(scan, testData.collect().toSeq)
@@ -40,16 +42,16 @@ class InMemoryColumnarQuerySuite extends QueryTest {
 
   test("default size avoids broadcast") {
     // TODO: Improve this test when we have better statistics
-    sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString))
+    ctx.sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString))
       .toDF().registerTempTable("sizeTst")
-    cacheTable("sizeTst")
+    ctx.cacheTable("sizeTst")
     assert(
-      table("sizeTst").queryExecution.analyzed.statistics.sizeInBytes >
-        conf.autoBroadcastJoinThreshold)
+      ctx.table("sizeTst").queryExecution.analyzed.statistics.sizeInBytes >
+        ctx.conf.autoBroadcastJoinThreshold)
   }
 
   test("projection") {
-    val plan = executePlan(testData.select('value, 'key).logicalPlan).executedPlan
+    val plan = ctx.executePlan(testData.select('value, 'key).logicalPlan).executedPlan
     val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
 
     checkAnswer(scan, testData.collect().map {
@@ -58,7 +60,7 @@ class InMemoryColumnarQuerySuite extends QueryTest {
   }
 
   test("SPARK-1436 regression: in-memory columns must be able to be accessed multiple times") {
-    val plan = executePlan(testData.logicalPlan).executedPlan
+    val plan = ctx.executePlan(testData.logicalPlan).executedPlan
     val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
 
     checkAnswer(scan, testData.collect().toSeq)
@@ -70,7 +72,7 @@ class InMemoryColumnarQuerySuite extends QueryTest {
       sql("SELECT * FROM repeatedData"),
       repeatedData.collect().toSeq.map(Row.fromTuple))
 
-    cacheTable("repeatedData")
+    ctx.cacheTable("repeatedData")
 
     checkAnswer(
       sql("SELECT * FROM repeatedData"),
@@ -82,7 +84,7 @@ class InMemoryColumnarQuerySuite extends QueryTest {
       sql("SELECT * FROM nullableRepeatedData"),
       nullableRepeatedData.collect().toSeq.map(Row.fromTuple))
 
-    cacheTable("nullableRepeatedData")
+    ctx.cacheTable("nullableRepeatedData")
 
     checkAnswer(
       sql("SELECT * FROM nullableRepeatedData"),
@@ -94,7 +96,7 @@ class InMemoryColumnarQuerySuite extends QueryTest {
       sql("SELECT time FROM timestamps"),
       timestamps.collect().toSeq.map(Row.fromTuple))
 
-    cacheTable("timestamps")
+    ctx.cacheTable("timestamps")
 
     checkAnswer(
       sql("SELECT time FROM timestamps"),
@@ -106,7 +108,7 @@ class InMemoryColumnarQuerySuite extends QueryTest {
       sql("SELECT * FROM withEmptyParts"),
       withEmptyParts.collect().toSeq.map(Row.fromTuple))
 
-    cacheTable("withEmptyParts")
+    ctx.cacheTable("withEmptyParts")
 
     checkAnswer(
       sql("SELECT * FROM withEmptyParts"),
@@ -155,7 +157,7 @@ class InMemoryColumnarQuerySuite extends QueryTest {
 
     // Create a RDD for the schema
     val rdd =
-      sparkContext.parallelize((1 to 100), 10).map { i =>
+      ctx.sparkContext.parallelize((1 to 100), 10).map { i =>
         Row(
           s"str${i}: test cache.",
           s"binary${i}: test cache.".getBytes("UTF-8"),
@@ -175,18 +177,18 @@ class InMemoryColumnarQuerySuite extends QueryTest {
           (0 to i).map(j => s"map_key_$j" -> (Long.MaxValue - j)).toMap,
           Row((i - 0.25).toFloat, Seq(true, false, null)))
       }
-    createDataFrame(rdd, schema).registerTempTable("InMemoryCache_different_data_types")
+    ctx.createDataFrame(rdd, schema).registerTempTable("InMemoryCache_different_data_types")
     // Cache the table.
     sql("cache table InMemoryCache_different_data_types")
     // Make sure the table is indeed cached.
-    val tableScan = table("InMemoryCache_different_data_types").queryExecution.executedPlan
+    val tableScan = ctx.table("InMemoryCache_different_data_types").queryExecution.executedPlan
     assert(
-      isCached("InMemoryCache_different_data_types"),
+      ctx.isCached("InMemoryCache_different_data_types"),
       "InMemoryCache_different_data_types should be cached.")
     // Issue a query and check the results.
     checkAnswer(
       sql(s"SELECT DISTINCT ${allColumns} FROM InMemoryCache_different_data_types"),
-      table("InMemoryCache_different_data_types").collect())
-    dropTempTable("InMemoryCache_different_data_types")
+      ctx.table("InMemoryCache_different_data_types").collect())
+    ctx.dropTempTable("InMemoryCache_different_data_types")
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
index cda1b0992e36f..6545c6b314a4c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
@@ -21,40 +21,42 @@ import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
-import org.apache.spark.sql.test.TestSQLContext._
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 
 class PartitionBatchPruningSuite extends SparkFunSuite with BeforeAndAfterAll with BeforeAndAfter {
-  val originalColumnBatchSize = conf.columnBatchSize
-  val originalInMemoryPartitionPruning = conf.inMemoryPartitionPruning
+
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
+  private lazy val originalColumnBatchSize = ctx.conf.columnBatchSize
+  private lazy val originalInMemoryPartitionPruning = ctx.conf.inMemoryPartitionPruning
 
   override protected def beforeAll(): Unit = {
     // Make a table with 5 partitions, 2 batches per partition, 10 elements per batch
-    setConf(SQLConf.COLUMN_BATCH_SIZE, "10")
+    ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, "10")
 
-    val pruningData = sparkContext.makeRDD((1 to 100).map { key =>
+    val pruningData = ctx.sparkContext.makeRDD((1 to 100).map { key =>
       val string = if (((key - 1) / 10) % 2 == 0) null else key.toString
       TestData(key, string)
     }, 5).toDF()
     pruningData.registerTempTable("pruningData")
 
     // Enable in-memory partition pruning
-    setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, "true")
+    ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, "true")
     // Enable in-memory table scan accumulators
-    setConf("spark.sql.inMemoryTableScanStatistics.enable", "true")
+    ctx.setConf("spark.sql.inMemoryTableScanStatistics.enable", "true")
   }
 
   override protected def afterAll(): Unit = {
-    setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize.toString)
-    setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning.toString)
+    ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize.toString)
+    ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning.toString)
   }
 
   before {
-    cacheTable("pruningData")
+    ctx.cacheTable("pruningData")
   }
 
   after {
-    uncacheTable("pruningData")
+    ctx.uncacheTable("pruningData")
   }
 
   // Comparisons
@@ -108,7 +110,7 @@ class PartitionBatchPruningSuite extends SparkFunSuite with BeforeAndAfterAll wi
       expectedQueryResult: => Seq[Int]): Unit = {
 
     test(query) {
-      val df = sql(query)
+      val df = ctx.sql(query)
       val queryExecution = df.queryExecution
 
       assertResult(expectedQueryResult.toArray, s"Wrong query result: $queryExecution") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index e20c66cb2f1d7..7931854db27c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -21,13 +21,11 @@ import java.math.BigDecimal
 import java.sql.DriverManager
 import java.util.{Calendar, GregorianCalendar, Properties}
 
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.test._
-import org.apache.spark.sql.types._
 import org.h2.jdbc.JdbcSQLException
 import org.scalatest.BeforeAndAfter
-import TestSQLContext._
-import TestSQLContext.implicits._
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types._
 
 class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   val url = "jdbc:h2:mem:testdb0"
@@ -37,12 +35,16 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   val testBytes = Array[Byte](99.toByte, 134.toByte, 135.toByte, 200.toByte, 205.toByte)
 
   val testH2Dialect = new JdbcDialect {
-    def canHandle(url: String) : Boolean = url.startsWith("jdbc:h2")
+    override def canHandle(url: String) : Boolean = url.startsWith("jdbc:h2")
     override def getCatalystType(
         sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
       Some(StringType)
   }
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+  import ctx.sql
+
   before {
     Class.forName("org.h2.Driver")
     // Extra properties that will be specified for our database. We need these to test
@@ -253,26 +255,26 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("Basic API") {
-    assert(TestSQLContext.read.jdbc(
+    assert(ctx.read.jdbc(
       urlWithUserAndPass, "TEST.PEOPLE", new Properties).collect().length === 3)
   }
 
   test("Basic API with FetchSize") {
     val properties = new Properties
     properties.setProperty("fetchSize", "2")
-    assert(TestSQLContext.read.jdbc(
+    assert(ctx.read.jdbc(
       urlWithUserAndPass, "TEST.PEOPLE", properties).collect().length === 3)
   }
 
   test("Partitioning via JDBCPartitioningInfo API") {
     assert(
-      TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3, new Properties)
+      ctx.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3, new Properties)
       .collect().length === 3)
   }
 
   test("Partitioning via list-of-where-clauses API") {
     val parts = Array[String]("THEID < 2", "THEID >= 2")
-    assert(TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts, new Properties)
+    assert(ctx.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts, new Properties)
       .collect().length === 3)
   }
 
@@ -328,9 +330,9 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("test DATE types") {
-    val rows = TestSQLContext.read.jdbc(
+    val rows = ctx.read.jdbc(
       urlWithUserAndPass, "TEST.TIMETYPES", new Properties).collect()
-    val cachedRows = TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties)
+    val cachedRows = ctx.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties)
       .cache().collect()
     assert(rows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01"))
     assert(rows(1).getAs[java.sql.Date](1) === null)
@@ -338,9 +340,8 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("test DATE types in cache") {
-    val rows =
-      TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties).collect()
-    TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties)
+    val rows = ctx.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties).collect()
+    ctx.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties)
       .cache().registerTempTable("mycached_date")
     val cachedRows = sql("select * from mycached_date").collect()
     assert(rows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01"))
@@ -348,7 +349,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("test types for null value") {
-    val rows = TestSQLContext.read.jdbc(
+    val rows = ctx.read.jdbc(
       urlWithUserAndPass, "TEST.NULLTYPES", new Properties).collect()
     assert((0 to 14).forall(i => rows(0).isNullAt(i)))
   }
@@ -395,10 +396,8 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
 
   test("Remap types via JdbcDialects") {
     JdbcDialects.registerDialect(testH2Dialect)
-    val df = TestSQLContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", new Properties)
-    assert(df.schema.filter(
-      _.dataType != org.apache.spark.sql.types.StringType
-    ).isEmpty)
+    val df = ctx.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", new Properties)
+    assert(df.schema.filter(_.dataType != org.apache.spark.sql.types.StringType).isEmpty)
     val rows = df.collect()
     assert(rows(0).get(0).isInstanceOf[String])
     assert(rows(0).get(1).isInstanceOf[String])
@@ -419,7 +418,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
 
   test("Aggregated dialects") {
     val agg = new AggregatedDialect(List(new JdbcDialect {
-      def canHandle(url: String) : Boolean = url.startsWith("jdbc:h2:")
+      override def canHandle(url: String) : Boolean = url.startsWith("jdbc:h2:")
       override def getCatalystType(
           sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
         if (sqlType % 2 == 0) {
@@ -430,8 +429,8 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
     }, testH2Dialect))
     assert(agg.canHandle("jdbc:h2:xxx"))
     assert(!agg.canHandle("jdbc:h2"))
-    assert(agg.getCatalystType(0, "", 1, null) == Some(LongType))
-    assert(agg.getCatalystType(1, "", 1, null) == Some(StringType))
+    assert(agg.getCatalystType(0, "", 1, null) === Some(LongType))
+    assert(agg.getCatalystType(1, "", 1, null) === Some(StringType))
   }
 
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
index 2de8c1a6098e0..d949ef42267ec 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
@@ -24,7 +24,6 @@ import org.scalatest.BeforeAndAfter
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.{SaveMode, Row}
-import org.apache.spark.sql.test._
 import org.apache.spark.sql.types._
 
 class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
@@ -37,6 +36,10 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
   properties.setProperty("password", "testPass")
   properties.setProperty("rowId", "false")
 
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+  import ctx.sql
+
   before {
     Class.forName("org.h2.Driver")
     conn = DriverManager.getConnection(url)
@@ -54,14 +57,14 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
       "create table test.people1 (name TEXT(32) NOT NULL, theid INTEGER NOT NULL)").executeUpdate()
     conn1.commit()
 
-    TestSQLContext.sql(
+    ctx.sql(
       s"""
         |CREATE TEMPORARY TABLE PEOPLE
         |USING org.apache.spark.sql.jdbc
         |OPTIONS (url '$url1', dbtable 'TEST.PEOPLE', user 'testUser', password 'testPass')
       """.stripMargin.replaceAll("\n", " "))
 
-    TestSQLContext.sql(
+    ctx.sql(
       s"""
         |CREATE TEMPORARY TABLE PEOPLE1
         |USING org.apache.spark.sql.jdbc
@@ -74,66 +77,64 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
     conn1.close()
   }
 
-  val sc = TestSQLContext.sparkContext
+  private lazy val sc = ctx.sparkContext
 
-  val arr2x2 = Array[Row](Row.apply("dave", 42), Row.apply("mary", 222))
-  val arr1x2 = Array[Row](Row.apply("fred", 3))
-  val schema2 = StructType(
+  private lazy val arr2x2 = Array[Row](Row.apply("dave", 42), Row.apply("mary", 222))
+  private lazy val arr1x2 = Array[Row](Row.apply("fred", 3))
+  private lazy val schema2 = StructType(
       StructField("name", StringType) ::
       StructField("id", IntegerType) :: Nil)
 
-  val arr2x3 = Array[Row](Row.apply("dave", 42, 1), Row.apply("mary", 222, 2))
-  val schema3 = StructType(
+  private lazy val arr2x3 = Array[Row](Row.apply("dave", 42, 1), Row.apply("mary", 222, 2))
+  private lazy val schema3 = StructType(
       StructField("name", StringType) ::
       StructField("id", IntegerType) ::
       StructField("seq", IntegerType) :: Nil)
 
   test("Basic CREATE") {
-    val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2)
+    val df = ctx.createDataFrame(sc.parallelize(arr2x2), schema2)
 
     df.write.jdbc(url, "TEST.BASICCREATETEST", new Properties)
-    assert(2 == TestSQLContext.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).count)
-    assert(2 ==
-      TestSQLContext.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).collect()(0).length)
+    assert(2 === ctx.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).count)
+    assert(2 === ctx.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).collect()(0).length)
   }
 
   test("CREATE with overwrite") {
-    val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3)
-    val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2)
+    val df = ctx.createDataFrame(sc.parallelize(arr2x3), schema3)
+    val df2 = ctx.createDataFrame(sc.parallelize(arr1x2), schema2)
 
     df.write.jdbc(url1, "TEST.DROPTEST", properties)
-    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.DROPTEST", properties).count)
-    assert(3 == TestSQLContext.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length)
+    assert(2 === ctx.read.jdbc(url1, "TEST.DROPTEST", properties).count)
+    assert(3 === ctx.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length)
 
     df2.write.mode(SaveMode.Overwrite).jdbc(url1, "TEST.DROPTEST", properties)
-    assert(1 == TestSQLContext.read.jdbc(url1, "TEST.DROPTEST", properties).count)
-    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length)
+    assert(1 === ctx.read.jdbc(url1, "TEST.DROPTEST", properties).count)
+    assert(2 === ctx.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length)
   }
 
   test("CREATE then INSERT to append") {
-    val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2)
-    val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2)
+    val df = ctx.createDataFrame(sc.parallelize(arr2x2), schema2)
+    val df2 = ctx.createDataFrame(sc.parallelize(arr1x2), schema2)
 
     df.write.jdbc(url, "TEST.APPENDTEST", new Properties)
     df2.write.mode(SaveMode.Append).jdbc(url, "TEST.APPENDTEST", new Properties)
-    assert(3 == TestSQLContext.read.jdbc(url, "TEST.APPENDTEST", new Properties).count)
-    assert(2 ==
-      TestSQLContext.read.jdbc(url, "TEST.APPENDTEST", new Properties).collect()(0).length)
+    assert(3 === ctx.read.jdbc(url, "TEST.APPENDTEST", new Properties).count)
+    assert(2 === ctx.read.jdbc(url, "TEST.APPENDTEST", new Properties).collect()(0).length)
   }
 
   test("CREATE then INSERT to truncate") {
-    val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2)
-    val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2)
+    val df = ctx.createDataFrame(sc.parallelize(arr2x2), schema2)
+    val df2 = ctx.createDataFrame(sc.parallelize(arr1x2), schema2)
 
     df.write.jdbc(url1, "TEST.TRUNCATETEST", properties)
     df2.write.mode(SaveMode.Overwrite).jdbc(url1, "TEST.TRUNCATETEST", properties)
-    assert(1 == TestSQLContext.read.jdbc(url1, "TEST.TRUNCATETEST", properties).count)
-    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.TRUNCATETEST", properties).collect()(0).length)
+    assert(1 === ctx.read.jdbc(url1, "TEST.TRUNCATETEST", properties).count)
+    assert(2 === ctx.read.jdbc(url1, "TEST.TRUNCATETEST", properties).collect()(0).length)
   }
 
   test("Incompatible INSERT to append") {
-    val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2)
-    val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3)
+    val df = ctx.createDataFrame(sc.parallelize(arr2x2), schema2)
+    val df2 = ctx.createDataFrame(sc.parallelize(arr2x3), schema3)
 
     df.write.jdbc(url, "TEST.INCOMPATIBLETEST", new Properties)
     intercept[org.apache.spark.SparkException] {
@@ -142,15 +143,15 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("INSERT to JDBC Datasource") {
-    TestSQLContext.sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE")
-    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).count)
-    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
+    ctx.sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE")
+    assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).count)
+    assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
   }
 
   test("INSERT to JDBC Datasource with overwrite") {
-    TestSQLContext.sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE")
-    TestSQLContext.sql("INSERT OVERWRITE TABLE PEOPLE1 SELECT * FROM PEOPLE")
-    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).count)
-    assert(2 == TestSQLContext.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
+    ctx.sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE")
+    ctx.sql("INSERT OVERWRITE TABLE PEOPLE1 SELECT * FROM PEOPLE")
+    assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).count)
+    assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index f8d62f9e7e02b..d889c7be17ce7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -23,21 +23,19 @@ import java.sql.{Date, Timestamp}
 import com.fasterxml.jackson.core.JsonFactory
 import org.scalactic.Tolerance._
 
+import org.apache.spark.sql.{QueryTest, Row, SQLConf}
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.json.InferSchema.compatibleType
 import org.apache.spark.sql.sources.LogicalRelation
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext._
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{QueryTest, Row, SQLConf}
 import org.apache.spark.util.Utils
 
-class JsonSuite extends QueryTest {
-  import org.apache.spark.sql.json.TestJsonData._
+class JsonSuite extends QueryTest with TestJsonData {
 
-  TestJsonData
+  protected lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.sql
+  import ctx.implicits._
 
   test("Type promotion") {
     def checkTypePromotion(expected: Any, actual: Any) {
@@ -214,7 +212,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Complex field and type inferring with null in sampling") {
-    val jsonDF = read.json(jsonNullStruct)
+    val jsonDF = ctx.read.json(jsonNullStruct)
     val expectedSchema = StructType(
       StructField("headers", StructType(
         StructField("Charset", StringType, true) ::
@@ -233,7 +231,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Primitive field and type inferring") {
-    val jsonDF = read.json(primitiveFieldAndType)
+    val jsonDF = ctx.read.json(primitiveFieldAndType)
 
     val expectedSchema = StructType(
       StructField("bigInteger", DecimalType.Unlimited, true) ::
@@ -261,7 +259,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Complex field and type inferring") {
-    val jsonDF = read.json(complexFieldAndType1)
+    val jsonDF = ctx.read.json(complexFieldAndType1)
 
     val expectedSchema = StructType(
       StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) ::
@@ -360,7 +358,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("GetField operation on complex data type") {
-    val jsonDF = read.json(complexFieldAndType1)
+    val jsonDF = ctx.read.json(complexFieldAndType1)
     jsonDF.registerTempTable("jsonTable")
 
     checkAnswer(
@@ -376,7 +374,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Type conflict in primitive field values") {
-    val jsonDF = read.json(primitiveFieldValueTypeConflict)
+    val jsonDF = ctx.read.json(primitiveFieldValueTypeConflict)
 
     val expectedSchema = StructType(
       StructField("num_bool", StringType, true) ::
@@ -450,7 +448,7 @@ class JsonSuite extends QueryTest {
   }
 
   ignore("Type conflict in primitive field values (Ignored)") {
-    val jsonDF = read.json(primitiveFieldValueTypeConflict)
+    val jsonDF = ctx.read.json(primitiveFieldValueTypeConflict)
     jsonDF.registerTempTable("jsonTable")
 
     // Right now, the analyzer does not promote strings in a boolean expression.
@@ -503,7 +501,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Type conflict in complex field values") {
-    val jsonDF = read.json(complexFieldValueTypeConflict)
+    val jsonDF = ctx.read.json(complexFieldValueTypeConflict)
 
     val expectedSchema = StructType(
       StructField("array", ArrayType(LongType, true), true) ::
@@ -527,7 +525,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Type conflict in array elements") {
-    val jsonDF = read.json(arrayElementTypeConflict)
+    val jsonDF = ctx.read.json(arrayElementTypeConflict)
 
     val expectedSchema = StructType(
       StructField("array1", ArrayType(StringType, true), true) ::
@@ -555,7 +553,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("Handling missing fields") {
-    val jsonDF = read.json(missingFields)
+    val jsonDF = ctx.read.json(missingFields)
 
     val expectedSchema = StructType(
       StructField("a", BooleanType, true) ::
@@ -574,8 +572,9 @@ class JsonSuite extends QueryTest {
     val dir = Utils.createTempDir()
     dir.delete()
     val path = dir.getCanonicalPath
-    sparkContext.parallelize(1 to 100).map(i => s"""{"a": 1, "b": "str$i"}""").saveAsTextFile(path)
-    val jsonDF = read.option("samplingRatio", "0.49").json(path)
+    ctx.sparkContext.parallelize(1 to 100)
+      .map(i => s"""{"a": 1, "b": "str$i"}""").saveAsTextFile(path)
+    val jsonDF = ctx.read.option("samplingRatio", "0.49").json(path)
 
     val analyzed = jsonDF.queryExecution.analyzed
     assert(
@@ -590,7 +589,7 @@ class JsonSuite extends QueryTest {
 
     val schema = StructType(StructField("a", LongType, true) :: Nil)
     val logicalRelation =
-      read.schema(schema).json(path).queryExecution.analyzed.asInstanceOf[LogicalRelation]
+      ctx.read.schema(schema).json(path).queryExecution.analyzed.asInstanceOf[LogicalRelation]
     val relationWithSchema = logicalRelation.relation.asInstanceOf[JSONRelation]
     assert(relationWithSchema.path === Some(path))
     assert(relationWithSchema.schema === schema)
@@ -602,7 +601,7 @@ class JsonSuite extends QueryTest {
     dir.delete()
     val path = dir.getCanonicalPath
     primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).saveAsTextFile(path)
-    val jsonDF = read.json(path)
+    val jsonDF = ctx.read.json(path)
 
     val expectedSchema = StructType(
       StructField("bigInteger", DecimalType.Unlimited, true) ::
@@ -671,7 +670,7 @@ class JsonSuite extends QueryTest {
       StructField("null", StringType, true) ::
       StructField("string", StringType, true) :: Nil)
 
-    val jsonDF1 = read.schema(schema).json(path)
+    val jsonDF1 = ctx.read.schema(schema).json(path)
 
     assert(schema === jsonDF1.schema)
 
@@ -688,7 +687,7 @@ class JsonSuite extends QueryTest {
       "this is a simple string.")
     )
 
-    val jsonDF2 = read.schema(schema).json(primitiveFieldAndType)
+    val jsonDF2 = ctx.read.schema(schema).json(primitiveFieldAndType)
 
     assert(schema === jsonDF2.schema)
 
@@ -709,7 +708,7 @@ class JsonSuite extends QueryTest {
   test("Applying schemas with MapType") {
     val schemaWithSimpleMap = StructType(
       StructField("map", MapType(StringType, IntegerType, true), false) :: Nil)
-    val jsonWithSimpleMap = read.schema(schemaWithSimpleMap).json(mapType1)
+    val jsonWithSimpleMap = ctx.read.schema(schemaWithSimpleMap).json(mapType1)
 
     jsonWithSimpleMap.registerTempTable("jsonWithSimpleMap")
 
@@ -737,7 +736,7 @@ class JsonSuite extends QueryTest {
     val schemaWithComplexMap = StructType(
       StructField("map", MapType(StringType, innerStruct, true), false) :: Nil)
 
-    val jsonWithComplexMap = read.schema(schemaWithComplexMap).json(mapType2)
+    val jsonWithComplexMap = ctx.read.schema(schemaWithComplexMap).json(mapType2)
 
     jsonWithComplexMap.registerTempTable("jsonWithComplexMap")
 
@@ -763,7 +762,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("SPARK-2096 Correctly parse dot notations") {
-    val jsonDF = read.json(complexFieldAndType2)
+    val jsonDF = ctx.read.json(complexFieldAndType2)
     jsonDF.registerTempTable("jsonTable")
 
     checkAnswer(
@@ -781,7 +780,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("SPARK-3390 Complex arrays") {
-    val jsonDF = read.json(complexFieldAndType2)
+    val jsonDF = ctx.read.json(complexFieldAndType2)
     jsonDF.registerTempTable("jsonTable")
 
     checkAnswer(
@@ -804,7 +803,7 @@ class JsonSuite extends QueryTest {
   }
 
   test("SPARK-3308 Read top level JSON arrays") {
-    val jsonDF = read.json(jsonArray)
+    val jsonDF = ctx.read.json(jsonArray)
     jsonDF.registerTempTable("jsonTable")
 
     checkAnswer(
@@ -822,10 +821,10 @@ class JsonSuite extends QueryTest {
 
   test("Corrupt records") {
     // Test if we can query corrupt records.
-    val oldColumnNameOfCorruptRecord = TestSQLContext.conf.columnNameOfCorruptRecord
-    TestSQLContext.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed")
+    val oldColumnNameOfCorruptRecord = ctx.conf.columnNameOfCorruptRecord
+    ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed")
 
-    val jsonDF = read.json(corruptRecords)
+    val jsonDF = ctx.read.json(corruptRecords)
     jsonDF.registerTempTable("jsonTable")
 
     val schema = StructType(
@@ -875,11 +874,11 @@ class JsonSuite extends QueryTest {
         Row("]") :: Nil
     )
 
-    TestSQLContext.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, oldColumnNameOfCorruptRecord)
+    ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, oldColumnNameOfCorruptRecord)
   }
 
   test("SPARK-4068: nulls in arrays") {
-    val jsonDF = read.json(nullsInArrays)
+    val jsonDF = ctx.read.json(nullsInArrays)
     jsonDF.registerTempTable("jsonTable")
 
     val schema = StructType(
@@ -925,7 +924,7 @@ class JsonSuite extends QueryTest {
       Row(values(0).toInt, values(1), values(2).toBoolean, r.split(",").toList, v5)
     }
 
-    val df1 = createDataFrame(rowRDD1, schema1)
+    val df1 = ctx.createDataFrame(rowRDD1, schema1)
     df1.registerTempTable("applySchema1")
     val df2 = df1.toDF
     val result = df2.toJSON.collect()
@@ -948,7 +947,7 @@ class JsonSuite extends QueryTest {
       Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4))
     }
 
-    val df3 = createDataFrame(rowRDD2, schema2)
+    val df3 = ctx.createDataFrame(rowRDD2, schema2)
     df3.registerTempTable("applySchema2")
     val df4 = df3.toDF
     val result2 = df4.toJSON.collect()
@@ -956,8 +955,8 @@ class JsonSuite extends QueryTest {
     assert(result2(1) === "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}")
     assert(result2(3) === "{\"f1\":{\"f11\":4,\"f12\":true},\"f2\":{\"D4\":2147483644}}")
 
-    val jsonDF = read.json(primitiveFieldAndType)
-    val primTable = read.json(jsonDF.toJSON)
+    val jsonDF = ctx.read.json(primitiveFieldAndType)
+    val primTable = ctx.read.json(jsonDF.toJSON)
     primTable.registerTempTable("primativeTable")
     checkAnswer(
         sql("select * from primativeTable"),
@@ -969,8 +968,8 @@ class JsonSuite extends QueryTest {
         "this is a simple string.")
       )
 
-    val complexJsonDF = read.json(complexFieldAndType1)
-    val compTable = read.json(complexJsonDF.toJSON)
+    val complexJsonDF = ctx.read.json(complexFieldAndType1)
+    val compTable = ctx.read.json(complexJsonDF.toJSON)
     compTable.registerTempTable("complexTable")
     // Access elements of a primitive array.
     checkAnswer(
@@ -1074,29 +1073,29 @@ class JsonSuite extends QueryTest {
   }
 
   test("SPARK-7565 MapType in JsonRDD") {
-    val useStreaming = getConf(SQLConf.USE_JACKSON_STREAMING_API, "true")
-    val oldColumnNameOfCorruptRecord = TestSQLContext.conf.columnNameOfCorruptRecord
-    TestSQLContext.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed")
+    val useStreaming = ctx.getConf(SQLConf.USE_JACKSON_STREAMING_API, "true")
+    val oldColumnNameOfCorruptRecord = ctx.conf.columnNameOfCorruptRecord
+    ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed")
 
     val schemaWithSimpleMap = StructType(
       StructField("map", MapType(StringType, IntegerType, true), false) :: Nil)
     try{
       for (useStreaming <- List("true", "false")) {
-        setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
+        ctx.setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
         val temp = Utils.createTempDir().getPath
 
-        val df = read.schema(schemaWithSimpleMap).json(mapType1)
+        val df = ctx.read.schema(schemaWithSimpleMap).json(mapType1)
         df.write.mode("overwrite").parquet(temp)
         // order of MapType is not defined
-        assert(read.parquet(temp).count() == 5)
+        assert(ctx.read.parquet(temp).count() == 5)
 
-        val df2 = read.json(corruptRecords)
+        val df2 = ctx.read.json(corruptRecords)
         df2.write.mode("overwrite").parquet(temp)
-        checkAnswer(read.parquet(temp), df2.collect())
+        checkAnswer(ctx.read.parquet(temp), df2.collect())
       }
     } finally {
-      setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
-      setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, oldColumnNameOfCorruptRecord)
+      ctx.setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
+      ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, oldColumnNameOfCorruptRecord)
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
index 47a97a49daabb..b6a6a8dc6a63c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
@@ -17,12 +17,15 @@
 
 package org.apache.spark.sql.json
 
-import org.apache.spark.sql.test.TestSQLContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
 
-object TestJsonData {
+trait TestJsonData {
 
-  val primitiveFieldAndType =
-    TestSQLContext.sparkContext.parallelize(
+  protected def ctx: SQLContext
+
+  def primitiveFieldAndType: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"string":"this is a simple string.",
           "integer":10,
           "long":21474836470,
@@ -32,8 +35,8 @@ object TestJsonData {
           "null":null
       }"""  :: Nil)
 
-  val primitiveFieldValueTypeConflict =
-    TestSQLContext.sparkContext.parallelize(
+  def primitiveFieldValueTypeConflict: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"num_num_1":11, "num_num_2":null, "num_num_3": 1.1,
           "num_bool":true, "num_str":13.1, "str_bool":"str1"}""" ::
       """{"num_num_1":null, "num_num_2":21474836470.9, "num_num_3": null,
@@ -43,15 +46,15 @@ object TestJsonData {
       """{"num_num_1":21474836570, "num_num_2":1.1, "num_num_3": 21474836470,
           "num_bool":null, "num_str":92233720368547758070, "str_bool":null}""" :: Nil)
 
-  val jsonNullStruct =
-    TestSQLContext.sparkContext.parallelize(
+  def jsonNullStruct: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"nullstr":"","ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" ::
         """{"nullstr":"","ip":"27.31.100.29","headers":{}}""" ::
         """{"nullstr":"","ip":"27.31.100.29","headers":""}""" ::
         """{"nullstr":null,"ip":"27.31.100.29","headers":null}""" :: Nil)
 
-  val complexFieldValueTypeConflict =
-    TestSQLContext.sparkContext.parallelize(
+  def complexFieldValueTypeConflict: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"num_struct":11, "str_array":[1, 2, 3],
           "array":[], "struct_array":[], "struct": {}}""" ::
       """{"num_struct":{"field":false}, "str_array":null,
@@ -61,23 +64,23 @@ object TestJsonData {
       """{"num_struct":{}, "str_array":["str1", "str2", 33],
           "array":[7], "struct_array":{"field": true}, "struct": {"field": "str"}}""" :: Nil)
 
-  val arrayElementTypeConflict =
-    TestSQLContext.sparkContext.parallelize(
+  def arrayElementTypeConflict: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"array1": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}],
           "array2": [{"field":214748364700}, {"field":1}]}""" ::
       """{"array3": [{"field":"str"}, {"field":1}]}""" ::
       """{"array3": [1, 2, 3]}""" :: Nil)
 
-  val missingFields =
-    TestSQLContext.sparkContext.parallelize(
+  def missingFields: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"a":true}""" ::
       """{"b":21474836470}""" ::
       """{"c":[33, 44]}""" ::
       """{"d":{"field":true}}""" ::
       """{"e":"str"}""" :: Nil)
 
-  val complexFieldAndType1 =
-    TestSQLContext.sparkContext.parallelize(
+  def complexFieldAndType1: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"struct":{"field1": true, "field2": 92233720368547758070},
           "structWithArrayFields":{"field1":[4, 5, 6], "field2":["str1", "str2"]},
           "arrayOfString":["str1", "str2"],
@@ -92,8 +95,8 @@ object TestJsonData {
           "arrayOfArray2":[[1, 2, 3], [1.1, 2.1, 3.1]]
          }"""  :: Nil)
 
-  val complexFieldAndType2 =
-    TestSQLContext.sparkContext.parallelize(
+  def complexFieldAndType2: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"arrayOfStruct":[{"field1": true, "field2": "str1"}, {"field1": false}, {"field3": null}],
           "complexArrayOfStruct": [
           {
@@ -146,16 +149,16 @@ object TestJsonData {
           ]]
       }""" :: Nil)
 
-  val mapType1 =
-    TestSQLContext.sparkContext.parallelize(
+  def mapType1: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"map": {"a": 1}}""" ::
       """{"map": {"b": 2}}""" ::
       """{"map": {"c": 3}}""" ::
       """{"map": {"c": 1, "d": 4}}""" ::
       """{"map": {"e": null}}""" :: Nil)
 
-  val mapType2 =
-    TestSQLContext.sparkContext.parallelize(
+  def mapType2: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"map": {"a": {"field1": [1, 2, 3, null]}}}""" ::
       """{"map": {"b": {"field2": 2}}}""" ::
       """{"map": {"c": {"field1": [], "field2": 4}}}""" ::
@@ -163,22 +166,22 @@ object TestJsonData {
       """{"map": {"e": null}}""" ::
       """{"map": {"f": {"field1": null}}}""" :: Nil)
 
-  val nullsInArrays =
-    TestSQLContext.sparkContext.parallelize(
+  def nullsInArrays: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{"field1":[[null], [[["Test"]]]]}""" ::
       """{"field2":[null, [{"Test":1}]]}""" ::
       """{"field3":[[null], [{"Test":"2"}]]}""" ::
       """{"field4":[[null, [1,2,3]]]}""" :: Nil)
 
-  val jsonArray =
-    TestSQLContext.sparkContext.parallelize(
+  def jsonArray: RDD[String] =
+    ctx.sparkContext.parallelize(
       """[{"a":"str_a_1"}]""" ::
       """[{"a":"str_a_2"}, {"b":"str_b_3"}]""" ::
       """{"b":"str_b_4", "a":"str_a_4", "c":"str_c_4"}""" ::
       """[]""" :: Nil)
 
-  val corruptRecords =
-    TestSQLContext.sparkContext.parallelize(
+  def corruptRecords: RDD[String] =
+    ctx.sparkContext.parallelize(
       """{""" ::
       """""" ::
       """{"a":1, b:2}""" ::
@@ -186,6 +189,5 @@ object TestJsonData {
       """{"b":"str_b_4", "a":"str_a_4", "c":"str_c_4"}""" ::
       """]""" :: Nil)
 
-  val empty =
-    TestSQLContext.sparkContext.parallelize(Seq[String]())
+  def empty: RDD[String] = ctx.sparkContext.parallelize(Seq[String]())
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
index 4aa5bcb7fdbca..17f5f9a491e6b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
@@ -25,7 +25,6 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.sources.LogicalRelation
-import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Column, DataFrame, QueryTest, SQLConf}
 
@@ -42,7 +41,7 @@ import org.apache.spark.sql.{Column, DataFrame, QueryTest, SQLConf}
  *    data type is nullable.
  */
 class ParquetFilterSuiteBase extends QueryTest with ParquetTest {
-  val sqlContext = TestSQLContext
+  lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
 
   private def checkFilterPredicate(
       df: DataFrame,
@@ -312,7 +311,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest {
 }
 
 class ParquetDataSourceOnFilterSuite extends ParquetFilterSuiteBase with BeforeAndAfterAll {
-  val originalConf = sqlContext.conf.parquetUseDataSourceApi
+  lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
     sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
@@ -341,7 +340,7 @@ class ParquetDataSourceOnFilterSuite extends ParquetFilterSuiteBase with BeforeA
 }
 
 class ParquetDataSourceOffFilterSuite extends ParquetFilterSuiteBase with BeforeAndAfterAll {
-  val originalConf = sqlContext.conf.parquetUseDataSourceApi
+  lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
     sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index 7f7c2cc1a6c26..2b6a27032e637 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -36,9 +36,6 @@ import org.apache.parquet.schema.{MessageType, MessageTypeParser}
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.catalyst.util.DateUtils
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext._
-import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf, SaveMode}
 
@@ -66,9 +63,8 @@ private[parquet] class TestGroupWriteSupport(schema: MessageType) extends WriteS
  * A test suite that tests basic Parquet I/O.
  */
 class ParquetIOSuiteBase extends QueryTest with ParquetTest {
-  val sqlContext = TestSQLContext
-
-  import sqlContext.implicits.localSeqToDataFrameHolder
+  lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
+  import sqlContext.implicits._
 
   /**
    * Writes `data` to a Parquet file, reads it back and check file contents.
@@ -104,7 +100,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
   test("fixed-length decimals") {
 
     def makeDecimalRDD(decimal: DecimalType): DataFrame =
-      sparkContext
+      sqlContext.sparkContext
         .parallelize(0 to 1000)
         .map(i => Tuple1(i / 100.0))
         .toDF()
@@ -115,7 +111,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
       withTempPath { dir =>
         val data = makeDecimalRDD(DecimalType(precision, scale))
         data.write.parquet(dir.getCanonicalPath)
-        checkAnswer(read.parquet(dir.getCanonicalPath), data.collect().toSeq)
+        checkAnswer(sqlContext.read.parquet(dir.getCanonicalPath), data.collect().toSeq)
       }
     }
 
@@ -123,7 +119,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     intercept[Throwable] {
       withTempPath { dir =>
         makeDecimalRDD(DecimalType(19, 10)).write.parquet(dir.getCanonicalPath)
-        read.parquet(dir.getCanonicalPath).collect()
+        sqlContext.read.parquet(dir.getCanonicalPath).collect()
       }
     }
 
@@ -131,14 +127,14 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     intercept[Throwable] {
       withTempPath { dir =>
         makeDecimalRDD(DecimalType.Unlimited).write.parquet(dir.getCanonicalPath)
-        read.parquet(dir.getCanonicalPath).collect()
+        sqlContext.read.parquet(dir.getCanonicalPath).collect()
       }
     }
   }
 
   test("date type") {
     def makeDateRDD(): DataFrame =
-      sparkContext
+      sqlContext.sparkContext
         .parallelize(0 to 1000)
         .map(i => Tuple1(DateUtils.toJavaDate(i)))
         .toDF()
@@ -147,7 +143,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     withTempPath { dir =>
       val data = makeDateRDD()
       data.write.parquet(dir.getCanonicalPath)
-      checkAnswer(read.parquet(dir.getCanonicalPath), data.collect().toSeq)
+      checkAnswer(sqlContext.read.parquet(dir.getCanonicalPath), data.collect().toSeq)
     }
   }
 
@@ -236,7 +232,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     def checkCompressionCodec(codec: CompressionCodecName): Unit = {
       withSQLConf(SQLConf.PARQUET_COMPRESSION -> codec.name()) {
         withParquetFile(data) { path =>
-          assertResult(conf.parquetCompressionCodec.toUpperCase) {
+          assertResult(sqlContext.conf.parquetCompressionCodec.toUpperCase) {
             compressionCodecFor(path)
           }
         }
@@ -244,7 +240,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     }
 
     // Checks default compression codec
-    checkCompressionCodec(CompressionCodecName.fromConf(conf.parquetCompressionCodec))
+    checkCompressionCodec(CompressionCodecName.fromConf(sqlContext.conf.parquetCompressionCodec))
 
     checkCompressionCodec(CompressionCodecName.UNCOMPRESSED)
     checkCompressionCodec(CompressionCodecName.GZIP)
@@ -283,7 +279,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     withTempDir { dir =>
       val path = new Path(dir.toURI.toString, "part-r-0.parquet")
       makeRawParquetFile(path)
-      checkAnswer(read.parquet(path.toString), (0 until 10).map { i =>
+      checkAnswer(sqlContext.read.parquet(path.toString), (0 until 10).map { i =>
         Row(i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble)
       })
     }
@@ -312,7 +308,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     withParquetFile((1 to 10).map(i => (i, i.toString))) { file =>
       val newData = (11 to 20).map(i => (i, i.toString))
       newData.toDF().write.format("parquet").mode(SaveMode.Overwrite).save(file)
-      checkAnswer(read.parquet(file), newData.map(Row.fromTuple))
+      checkAnswer(sqlContext.read.parquet(file), newData.map(Row.fromTuple))
     }
   }
 
@@ -321,7 +317,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     withParquetFile(data) { file =>
       val newData = (11 to 20).map(i => (i, i.toString))
       newData.toDF().write.format("parquet").mode(SaveMode.Ignore).save(file)
-      checkAnswer(read.parquet(file), data.map(Row.fromTuple))
+      checkAnswer(sqlContext.read.parquet(file), data.map(Row.fromTuple))
     }
   }
 
@@ -341,7 +337,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     withParquetFile(data) { file =>
       val newData = (11 to 20).map(i => (i, i.toString))
       newData.toDF().write.format("parquet").mode(SaveMode.Append).save(file)
-      checkAnswer(read.parquet(file), (data ++ newData).map(Row.fromTuple))
+      checkAnswer(sqlContext.read.parquet(file), (data ++ newData).map(Row.fromTuple))
     }
   }
 
@@ -369,11 +365,11 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
       val path = new Path(location.getCanonicalPath)
 
       ParquetFileWriter.writeMetadataFile(
-        sparkContext.hadoopConfiguration,
+        sqlContext.sparkContext.hadoopConfiguration,
         path,
         new Footer(path, new ParquetMetadata(fileMetadata, Nil)) :: Nil)
 
-      assertResult(read.parquet(path.toString).schema) {
+      assertResult(sqlContext.read.parquet(path.toString).schema) {
         StructType(
           StructField("a", BooleanType, nullable = false) ::
           StructField("b", IntegerType, nullable = false) ::
@@ -406,7 +402,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
 }
 
 class ParquetDataSourceOnIOSuite extends ParquetIOSuiteBase with BeforeAndAfterAll {
-  val originalConf = sqlContext.conf.parquetUseDataSourceApi
+  private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
     sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
@@ -430,7 +426,7 @@ class ParquetDataSourceOnIOSuite extends ParquetIOSuiteBase with BeforeAndAfterA
 }
 
 class ParquetDataSourceOffIOSuite extends ParquetIOSuiteBase with BeforeAndAfterAll {
-  val originalConf = sqlContext.conf.parquetUseDataSourceApi
+  private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
     sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 3b29979452ad9..8979a0a210a42 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.spark.sql.parquet
 
 import java.io.File
@@ -28,7 +29,6 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.catalyst.expressions.Literal
 import org.apache.spark.sql.sources.PartitioningUtils._
 import org.apache.spark.sql.sources.{LogicalRelation, Partition, PartitionSpec}
-import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Column, QueryTest, Row, SQLContext}
 
@@ -39,10 +39,10 @@ case class ParquetData(intField: Int, stringField: String)
 case class ParquetDataWithKey(intField: Int, pi: Int, stringField: String, ps: String)
 
 class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
-  override val sqlContext: SQLContext = TestSQLContext
 
-  import sqlContext._
+  override lazy val sqlContext: SQLContext = org.apache.spark.sql.test.TestSQLContext
   import sqlContext.implicits._
+  import sqlContext.sql
 
   val defaultPartitionName = "__HIVE_DEFAULT_PARTITION__"
 
@@ -190,8 +190,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       // Introduce _temporary dir to the base dir the robustness of the schema discovery process.
       new File(base.getCanonicalPath, "_temporary").mkdir()
 
-      println("load the partitioned table")
-      read.parquet(base.getCanonicalPath).registerTempTable("t")
+      sqlContext.read.parquet(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -238,7 +237,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      read.parquet(base.getCanonicalPath).registerTempTable("t")
+      sqlContext.read.parquet(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -286,7 +285,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      val parquetRelation = read.format("org.apache.spark.sql.parquet").load(base.getCanonicalPath)
+      val parquetRelation = sqlContext.read.format("parquet").load(base.getCanonicalPath)
       parquetRelation.registerTempTable("t")
 
       withTempTable("t") {
@@ -326,7 +325,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      val parquetRelation = read.format("org.apache.spark.sql.parquet").load(base.getCanonicalPath)
+      val parquetRelation = sqlContext.read.format("parquet").load(base.getCanonicalPath)
       parquetRelation.registerTempTable("t")
 
       withTempTable("t") {
@@ -358,7 +357,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         (1 to 10).map(i => (i, i.toString)).toDF("intField", "stringField"),
         makePartitionDir(base, defaultPartitionName, "pi" -> 2))
 
-      read.format("org.apache.spark.sql.parquet").load(base.getCanonicalPath).registerTempTable("t")
+      sqlContext.read.format("parquet").load(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -371,7 +370,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
   test("SPARK-7749 Non-partitioned table should have empty partition spec") {
     withTempPath { dir =>
       (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath)
-      val queryExecution = read.parquet(dir.getCanonicalPath).queryExecution
+      val queryExecution = sqlContext.read.parquet(dir.getCanonicalPath).queryExecution
       queryExecution.analyzed.collectFirst {
         case LogicalRelation(relation: ParquetRelation2) =>
           assert(relation.partitionSpec === PartitionSpec.emptySpec)
@@ -385,7 +384,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
     withTempPath { dir =>
       val df = Seq("/", "[]", "?").zipWithIndex.map(_.swap).toDF("i", "s")
       df.write.format("parquet").partitionBy("s").save(dir.getCanonicalPath)
-      checkAnswer(read.parquet(dir.getCanonicalPath), df.collect())
+      checkAnswer(sqlContext.read.parquet(dir.getCanonicalPath), df.collect())
     }
   }
 
@@ -425,12 +424,12 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
     }
 
     val schema = StructType(partitionColumns :+ StructField(s"i", StringType))
-    val df = createDataFrame(sparkContext.parallelize(row :: Nil), schema)
+    val df = sqlContext.createDataFrame(sqlContext.sparkContext.parallelize(row :: Nil), schema)
 
     withTempPath { dir =>
       df.write.format("parquet").partitionBy(partitionColumns.map(_.name): _*).save(dir.toString)
       val fields = schema.map(f => Column(f.name).cast(f.dataType))
-      checkAnswer(read.load(dir.toString).select(fields: _*), row)
+      checkAnswer(sqlContext.read.load(dir.toString).select(fields: _*), row)
     }
   }
 
@@ -446,7 +445,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       Files.touch(new File(s"${dir.getCanonicalPath}/b=1", ".DS_Store"))
       Files.createParentDirs(new File(s"${dir.getCanonicalPath}/b=1/c=1/.foo/bar"))
 
-      checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df)
+      checkAnswer(sqlContext.read.format("parquet").load(dir.getCanonicalPath), df)
     }
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
index 304936fb2be8e..de0107a361815 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -22,14 +22,14 @@ import org.scalatest.BeforeAndAfterAll
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{SQLConf, QueryTest}
 import org.apache.spark.sql.catalyst.expressions.Row
-import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.test.TestSQLContext._
 
 /**
  * A test suite that tests various Parquet queries.
  */
 class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
-  val sqlContext = TestSQLContext
+  lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
+  import sqlContext.implicits._
+  import sqlContext.sql
 
   test("simple select queries") {
     withParquetTable((0 until 10).map(i => (i, i.toString)), "t") {
@@ -40,22 +40,22 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
 
   test("appending") {
     val data = (0 until 10).map(i => (i, i.toString))
-    createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp")
+    sqlContext.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp")
     withParquetTable(data, "t") {
       sql("INSERT INTO TABLE t SELECT * FROM tmp")
-      checkAnswer(table("t"), (data ++ data).map(Row.fromTuple))
+      checkAnswer(sqlContext.table("t"), (data ++ data).map(Row.fromTuple))
     }
-    catalog.unregisterTable(Seq("tmp"))
+    sqlContext.catalog.unregisterTable(Seq("tmp"))
   }
 
   test("overwriting") {
     val data = (0 until 10).map(i => (i, i.toString))
-    createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp")
+    sqlContext.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp")
     withParquetTable(data, "t") {
       sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp")
-      checkAnswer(table("t"), data.map(Row.fromTuple))
+      checkAnswer(sqlContext.table("t"), data.map(Row.fromTuple))
     }
-    catalog.unregisterTable(Seq("tmp"))
+    sqlContext.catalog.unregisterTable(Seq("tmp"))
   }
 
   test("self-join") {
@@ -118,7 +118,7 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
     val schema = StructType(List(StructField("d", DecimalType(18, 0), false),
       StructField("time", TimestampType, false)).toArray)
     withTempPath { file =>
-      val df = sqlContext.createDataFrame(sparkContext.parallelize(data), schema)
+      val df = sqlContext.createDataFrame(sqlContext.sparkContext.parallelize(data), schema)
       df.write.parquet(file.getCanonicalPath)
       val df2 = sqlContext.read.parquet(file.getCanonicalPath)
       checkAnswer(df2, df.collect().toSeq)
@@ -127,7 +127,7 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
 }
 
 class ParquetDataSourceOnQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll {
-  val originalConf = sqlContext.conf.parquetUseDataSourceApi
+  private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
     sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
@@ -139,7 +139,7 @@ class ParquetDataSourceOnQuerySuite extends ParquetQuerySuiteBase with BeforeAnd
 }
 
 class ParquetDataSourceOffQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll {
-  val originalConf = sqlContext.conf.parquetUseDataSourceApi
+  private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
     sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index 8b1745124b8e1..171a656f0e01e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -24,11 +24,10 @@ import org.apache.parquet.schema.MessageTypeParser
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.ScalaReflection
-import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.types._
 
 class ParquetSchemaSuite extends SparkFunSuite with ParquetTest {
-  val sqlContext = TestSQLContext
+  lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
 
   /**
    * Checks whether the reflected Parquet message type for product type `T` conforms `messageType`.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala
index 516ba373f41d2..eb15a1609f1d0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala
@@ -33,8 +33,6 @@ import org.apache.spark.sql.{DataFrame, SaveMode}
  * Especially, `Tuple1.apply` can be used to easily wrap a single type/value.
  */
 private[sql] trait ParquetTest extends SQLTestUtils {
-  import sqlContext.implicits.{localSeqToDataFrameHolder, rddToDataFrameHolder}
-  import sqlContext.sparkContext
 
   /**
    * Writes `data` to a Parquet file, which is then passed to `f` and will be deleted after `f`
@@ -44,7 +42,7 @@ private[sql] trait ParquetTest extends SQLTestUtils {
       (data: Seq[T])
       (f: String => Unit): Unit = {
     withTempPath { file =>
-      sparkContext.parallelize(data).toDF().write.parquet(file.getCanonicalPath)
+      sqlContext.createDataFrame(data).write.parquet(file.getCanonicalPath)
       f(file.getCanonicalPath)
     }
   }
@@ -75,7 +73,7 @@ private[sql] trait ParquetTest extends SQLTestUtils {
 
   protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
       data: Seq[T], path: File): Unit = {
-    data.toDF().write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
+    sqlContext.createDataFrame(data).write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
   }
 
   protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
index 17a8b0cca09df..ac4a00a6f3dac 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
@@ -25,11 +25,9 @@ import org.apache.spark.sql.SQLContext
 import org.apache.spark.util.Utils
 
 trait SQLTestUtils {
-  val sqlContext: SQLContext
+  def sqlContext: SQLContext
 
-  import sqlContext.{conf, sparkContext}
-
-  protected def configuration = sparkContext.hadoopConfiguration
+  protected def configuration = sqlContext.sparkContext.hadoopConfiguration
 
   /**
    * Sets all SQL configurations specified in `pairs`, calls `f`, and then restore all SQL
@@ -39,12 +37,12 @@ trait SQLTestUtils {
    */
   protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = {
     val (keys, values) = pairs.unzip
-    val currentValues = keys.map(key => Try(conf.getConf(key)).toOption)
-    (keys, values).zipped.foreach(conf.setConf)
+    val currentValues = keys.map(key => Try(sqlContext.conf.getConf(key)).toOption)
+    (keys, values).zipped.foreach(sqlContext.conf.setConf)
     try f finally {
       keys.zip(currentValues).foreach {
-        case (key, Some(value)) => conf.setConf(key, value)
-        case (key, None) => conf.unsetConf(key)
+        case (key, Some(value)) => sqlContext.conf.setConf(key, value)
+        case (key, None) => sqlContext.conf.unsetConf(key)
       }
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index 57c23fe77f8b5..b384fb39f3d66 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -52,9 +52,6 @@ case class Contact(name: String, phone: String)
 case class Person(name: String, age: Int, contacts: Seq[Contact])
 
 class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
-  override val sqlContext = TestHive
-
-  import TestHive.read
 
   def getTempFilePath(prefix: String, suffix: String = ""): File = {
     val tempFile = File.createTempFile(prefix, suffix)
@@ -69,7 +66,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
 
     withOrcFile(data) { file =>
       checkAnswer(
-        read.format("orc").load(file),
+        sqlContext.read.format("orc").load(file),
         data.toDF().collect())
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
index 750f0b04aaa87..5daf691aa8c53 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
@@ -22,13 +22,11 @@ import java.io.File
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
 
-import org.apache.spark.sql.hive.HiveContext
-import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql._
 
 private[sql] trait OrcTest extends SQLTestUtils {
-  protected def hiveContext = sqlContext.asInstanceOf[HiveContext]
+  lazy val sqlContext = org.apache.spark.sql.hive.test.TestHive
 
   import sqlContext.sparkContext
   import sqlContext.implicits._
@@ -53,7 +51,7 @@ private[sql] trait OrcTest extends SQLTestUtils {
   protected def withOrcDataFrame[T <: Product: ClassTag: TypeTag]
       (data: Seq[T])
       (f: DataFrame => Unit): Unit = {
-    withOrcFile(data)(path => f(hiveContext.read.format("orc").load(path)))
+    withOrcFile(data)(path => f(sqlContext.read.format("orc").load(path)))
   }
 
   /**
@@ -65,7 +63,7 @@ private[sql] trait OrcTest extends SQLTestUtils {
       (data: Seq[T], tableName: String)
       (f: => Unit): Unit = {
     withOrcDataFrame(data) { df =>
-      hiveContext.registerDataFrameAsTable(df, tableName)
+      sqlContext.registerDataFrameAsTable(df, tableName)
       withTempTable(tableName)(f)
     }
   }

From eb19d3f75cbd002f7e72ce02017a8de67f562792 Mon Sep 17 00:00:00 2001
From: Dong Wang <dong@databricks.com>
Date: Fri, 5 Jun 2015 17:41:12 -0700
Subject: [PATCH 385/525] [SPARK-6964] [SQL] Support Cancellation in the Thrift
 Server

Support runInBackground in SparkExecuteStatementOperation, and add cancellation

Author: Dong Wang <dong@databricks.com>

Closes #6207 from dongwang218/SPARK-6964-jdbc-cancel and squashes the following commits:

687c113 [Dong Wang] fix 100 characters
7bfa2a7 [Dong Wang] fix merge
380480f [Dong Wang] fix for liancheng's comments
eb3e385 [Dong Wang] small nit
341885b [Dong Wang] small fix
3d8ebf8 [Dong Wang] add spark.sql.hive.thriftServer.async flag
04142c3 [Dong Wang] set SQLSession for async execution
184ec35 [Dong Wang] keep hive conf
819ae03 [Dong Wang] [SPARK-6964][SQL][WIP] Support Cancellation in the Thrift Server
---
 .../org/apache/spark/sql/SQLContext.scala     |   5 +
 .../SparkExecuteStatementOperation.scala      | 164 ++++++++++++++++--
 .../server/SparkSQLOperationManager.scala     |   7 +-
 .../HiveThriftServer2Suites.scala             |  42 ++++-
 .../apache/spark/sql/hive/HiveContext.scala   |   6 +
 5 files changed, 208 insertions(+), 16 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 0aab7fa8709b8..ddb54025baa24 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -916,6 +916,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
     tlSession.remove()
   }
 
+  protected[sql] def setSession(session: SQLSession): Unit = {
+    detachSession()
+    tlSession.set(session)
+  }
+
   protected[sql] class SQLSession {
     // Note that this is a lazy val so we can override the default value in subclasses.
     protected[sql] lazy val conf: SQLConf = new SQLConf
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
index c0d1266212cdd..e071103df925c 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
@@ -17,11 +17,23 @@
 
 package org.apache.spark.sql.hive.thriftserver
 
+import java.security.PrivilegedExceptionAction
 import java.sql.{Date, Timestamp}
+import java.util.concurrent.RejectedExecutionException
 import java.util.{Map => JMap, UUID}
 
+import scala.collection.JavaConversions._
+import scala.collection.mutable.{ArrayBuffer, Map => SMap}
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.metastore.api.FieldSchema
 import org.apache.hive.service.cli._
+import org.apache.hadoop.hive.ql.metadata.Hive
+import org.apache.hadoop.hive.ql.metadata.HiveException
+import org.apache.hadoop.hive.ql.session.SessionState
+import org.apache.hadoop.hive.shims.ShimLoader
+import org.apache.hadoop.security.UserGroupInformation
 import org.apache.hive.service.cli.operation.ExecuteStatementOperation
 import org.apache.hive.service.cli.session.HiveSession
 
@@ -31,8 +43,6 @@ import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{DataFrame, Row => SparkRow, SQLConf}
 
-import scala.collection.JavaConversions._
-import scala.collection.mutable.{ArrayBuffer, Map => SMap}
 
 private[hive] class SparkExecuteStatementOperation(
     parentSession: HiveSession,
@@ -40,17 +50,19 @@ private[hive] class SparkExecuteStatementOperation(
     confOverlay: JMap[String, String],
     runInBackground: Boolean = true)
     (hiveContext: HiveContext, sessionToActivePool: SMap[SessionHandle, String])
-  // NOTE: `runInBackground` is set to `false` intentionally to disable asynchronous execution
-  extends ExecuteStatementOperation(parentSession, statement, confOverlay, false)
+  extends ExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)
   with Logging {
 
   private var result: DataFrame = _
   private var iter: Iterator[SparkRow] = _
   private var dataTypes: Array[DataType] = _
+  private var statementId: String = _
 
   def close(): Unit = {
     // RDDs will be cleaned automatically upon garbage collection.
-    logDebug("CLOSING")
+    hiveContext.sparkContext.clearJobGroup()
+    logDebug(s"CLOSING $statementId")
+    cleanup(OperationState.CLOSED)
   }
 
   def addNonNullColumnValue(from: SparkRow, to: ArrayBuffer[Any], ordinal: Int) {
@@ -114,10 +126,10 @@ private[hive] class SparkExecuteStatementOperation(
   }
 
   def getResultSetSchema: TableSchema = {
-    logInfo(s"Result Schema: ${result.queryExecution.analyzed.output}")
-    if (result.queryExecution.analyzed.output.size == 0) {
+    if (result == null || result.queryExecution.analyzed.output.size == 0) {
       new TableSchema(new FieldSchema("Result", "string", "") :: Nil)
     } else {
+      logInfo(s"Result Schema: ${result.queryExecution.analyzed.output}")
       val schema = result.queryExecution.analyzed.output.map { attr =>
         new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
       }
@@ -125,9 +137,73 @@ private[hive] class SparkExecuteStatementOperation(
     }
   }
 
-  def run(): Unit = {
-    val statementId = UUID.randomUUID().toString
-    logInfo(s"Running query '$statement'")
+  override def run(): Unit = {
+    setState(OperationState.PENDING)
+    setHasResultSet(true) // avoid no resultset for async run
+
+    if (!runInBackground) {
+      runInternal()
+    } else {
+      val parentSessionState = SessionState.get()
+      val hiveConf = getConfigForOperation()
+      val sparkServiceUGI = ShimLoader.getHadoopShims.getUGIForConf(hiveConf)
+      val sessionHive = getCurrentHive()
+      val currentSqlSession = hiveContext.currentSession
+
+      // Runnable impl to call runInternal asynchronously,
+      // from a different thread
+      val backgroundOperation = new Runnable() {
+
+        override def run(): Unit = {
+          val doAsAction = new PrivilegedExceptionAction[Object]() {
+            override def run(): Object = {
+
+              // User information is part of the metastore client member in Hive
+              hiveContext.setSession(currentSqlSession)
+              Hive.set(sessionHive)
+              SessionState.setCurrentSessionState(parentSessionState)
+              try {
+                runInternal()
+              } catch {
+                case e: HiveSQLException =>
+                  setOperationException(e)
+                  log.error("Error running hive query: ", e)
+              }
+              return null
+            }
+          }
+
+          try {
+            ShimLoader.getHadoopShims().doAs(sparkServiceUGI, doAsAction)
+          } catch {
+            case e: Exception =>
+              setOperationException(new HiveSQLException(e))
+              logError("Error running hive query as user : " +
+                sparkServiceUGI.getShortUserName(), e)
+          }
+        }
+      }
+      try {
+        // This submit blocks if no background threads are available to run this operation
+        val backgroundHandle =
+          getParentSession().getSessionManager().submitBackgroundOperation(backgroundOperation)
+        setBackgroundHandle(backgroundHandle)
+      } catch {
+        case rejected: RejectedExecutionException =>
+          setState(OperationState.ERROR)
+          throw new HiveSQLException("The background threadpool cannot accept" +
+            " new task for execution, please retry the operation", rejected)
+        case NonFatal(e) =>
+          logError(s"Error executing query in background", e)
+          setState(OperationState.ERROR)
+          throw e
+      }
+    }
+  }
+
+  private def runInternal(): Unit = {
+    statementId = UUID.randomUUID().toString
+    logInfo(s"Running query '$statement' with $statementId")
     setState(OperationState.RUNNING)
     HiveThriftServer2.listener.onStatementStart(
       statementId,
@@ -159,18 +235,82 @@ private[hive] class SparkExecuteStatementOperation(
         }
       }
       dataTypes = result.queryExecution.analyzed.output.map(_.dataType).toArray
-      setHasResultSet(true)
     } catch {
+      case e: HiveSQLException =>
+        if (getStatus().getState() == OperationState.CANCELED) {
+          return
+        } else {
+          setState(OperationState.ERROR);
+          throw e
+        }
       // Actually do need to catch Throwable as some failures don't inherit from Exception and
       // HiveServer will silently swallow them.
       case e: Throwable =>
+        val currentState = getStatus().getState()
+        logError(s"Error executing query, currentState $currentState, ", e)
         setState(OperationState.ERROR)
         HiveThriftServer2.listener.onStatementError(
           statementId, e.getMessage, e.getStackTraceString)
-        logError("Error executing query:", e)
         throw new HiveSQLException(e.toString)
     }
     setState(OperationState.FINISHED)
     HiveThriftServer2.listener.onStatementFinish(statementId)
   }
+
+  override def cancel(): Unit = {
+    logInfo(s"Cancel '$statement' with $statementId")
+    if (statementId != null) {
+      hiveContext.sparkContext.cancelJobGroup(statementId)
+    }
+    cleanup(OperationState.CANCELED)
+  }
+
+  private def cleanup(state: OperationState) {
+    setState(state)
+    if (runInBackground) {
+      val backgroundHandle = getBackgroundHandle()
+      if (backgroundHandle != null) {
+        backgroundHandle.cancel(true)
+      }
+    }
+  }
+
+  /**
+   * If there are query specific settings to overlay, then create a copy of config
+   * There are two cases we need to clone the session config that's being passed to hive driver
+   * 1. Async query -
+   *    If the client changes a config setting, that shouldn't reflect in the execution
+   *    already underway
+   * 2. confOverlay -
+   *    The query specific settings should only be applied to the query config and not session
+   * @return new configuration
+   * @throws HiveSQLException
+   */
+  private def getConfigForOperation(): HiveConf = {
+    var sqlOperationConf = getParentSession().getHiveConf()
+    if (!getConfOverlay().isEmpty() || runInBackground) {
+      // clone the partent session config for this query
+      sqlOperationConf = new HiveConf(sqlOperationConf)
+
+      // apply overlay query specific settings, if any
+      getConfOverlay().foreach { case (k, v) =>
+        try {
+          sqlOperationConf.verifyAndSet(k, v)
+        } catch {
+          case e: IllegalArgumentException =>
+            throw new HiveSQLException("Error applying statement specific settings", e)
+        }
+      }
+    }
+    return sqlOperationConf
+  }
+
+  private def getCurrentHive(): Hive = {
+    try {
+      return Hive.get()
+    } catch {
+      case e: HiveException =>
+        throw new HiveSQLException("Failed to get current Hive object", e);
+    }
+  }
 }
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala
index 9c0bf02391e0e..c8031ed0f3437 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala
@@ -44,9 +44,12 @@ private[thriftserver] class SparkSQLOperationManager(hiveContext: HiveContext)
       confOverlay: JMap[String, String],
       async: Boolean): ExecuteStatementOperation = synchronized {
 
-    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay)(
-      hiveContext, sessionToActivePool)
+    val runInBackground = async && hiveContext.hiveThriftServerAsync
+    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
+      runInBackground)(hiveContext, sessionToActivePool)
     handleToOperation.put(operation.getHandle, operation)
+    logDebug(s"Created Operation for $statement with session=$parentSession, " +
+      s"runInBackground=$runInBackground")
     operation
   }
 }
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
index f57c7083ea504..178bd1f5cb164 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
@@ -19,11 +19,13 @@ package org.apache.spark.sql.hive.thriftserver
 
 import java.io.File
 import java.net.URL
-import java.sql.{Date, DriverManager, Statement}
+import java.nio.charset.StandardCharsets
+import java.sql.{Date, DriverManager, SQLException, Statement}
 
 import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.duration._
-import scala.concurrent.{Await, Promise}
+import scala.concurrent.{Await, Promise, future}
+import scala.concurrent.ExecutionContext.Implicits.global
 import scala.sys.process.{Process, ProcessLogger}
 import scala.util.{Random, Try}
 
@@ -338,6 +340,42 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest {
       }
     )
   }
+
+  test("test jdbc cancel") {
+    withJdbcStatement { statement =>
+      val queries = Seq(
+        "DROP TABLE IF EXISTS test_map",
+        "CREATE TABLE test_map(key INT, value STRING)",
+        s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map")
+
+      queries.foreach(statement.execute)
+
+      val largeJoin = "SELECT COUNT(*) FROM test_map " +
+        List.fill(10)("join test_map").mkString(" ")
+      val f = future { Thread.sleep(100); statement.cancel(); }
+      val e = intercept[SQLException] {
+        statement.executeQuery(largeJoin)
+      }
+      assert(e.getMessage contains "cancelled")
+      Await.result(f, 3.minute)
+
+      // cancel is a noop
+      statement.executeQuery("SET spark.sql.hive.thriftServer.async=false")
+      val sf = future { Thread.sleep(100); statement.cancel(); }
+      val smallJoin = "SELECT COUNT(*) FROM test_map " +
+        List.fill(4)("join test_map").mkString(" ")
+      val rs1 = statement.executeQuery(smallJoin)
+      Await.result(sf, 3.minute)
+      rs1.next()
+      assert(rs1.getInt(1) === math.pow(5, 5))
+      rs1.close()
+
+      val rs2 = statement.executeQuery("SELECT COUNT(*) FROM test_map")
+      rs2.next()
+      assert(rs2.getInt(1) === 5)
+      rs2.close()
+    }
+  }
 }
 
 class HiveThriftHttpServerSuite extends HiveThriftJdbcTest {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 800f51c5e2e86..b8f294c262af7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -144,6 +144,12 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
     getConf("spark.sql.hive.metastore.barrierPrefixes", "")
       .split(",").filterNot(_ == "")
 
+  /*
+   * hive thrift server use background spark sql thread pool to execute sql queries
+   */
+  protected[hive] def hiveThriftServerAsync: Boolean =
+    getConf("spark.sql.hive.thriftServer.async", "true").toBoolean
+
   @transient
   protected[sql] lazy val substitutor = new VariableSubstitution()
 

From a71be0a36de94b3962c09f871845d745047a78e6 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 5 Jun 2015 23:15:10 -0700
Subject: [PATCH 386/525] [SPARK-8114][SQL] Remove some wildcard import on
 TestSQLContext._ round 3.

Author: Reynold Xin <rxin@databricks.com>

Closes #6677 from rxin/test-wildcard and squashes the following commits:

8a17b33 [Reynold Xin] Fixed line length.
6663813 [Reynold Xin] [SPARK-8114][SQL] Remove some wildcard import on TestSQLContext._ round 3.
---
 .../execution/SparkSqlSerializer2Suite.scala  | 43 ++++++++---------
 .../sources/CreateTableAsSelectSuite.scala    | 14 +++---
 .../spark/sql/sources/DDLTestSuite.scala      | 24 +++++-----
 .../spark/sql/sources/DataSourceTest.scala    | 12 +++--
 .../spark/sql/sources/FilteredScanSuite.scala |  2 +-
 .../spark/sql/sources/InsertSuite.scala       | 29 ++++++------
 .../spark/sql/sources/PrunedScanSuite.scala   |  5 +-
 .../spark/sql/sources/SaveLoadSuite.scala     | 35 ++++++++------
 .../spark/sql/sources/TableScanSuite.scala    | 10 ++--
 .../spark/sql/hive/QueryPartitionSuite.scala  | 16 +++----
 .../spark/sql/hive/SerializationSuite.scala   |  3 +-
 .../spark/sql/hive/StatisticsSuite.scala      | 37 ++++++++-------
 .../org/apache/spark/sql/hive/UDFSuite.scala  | 18 ++++----
 .../sql/sources/hadoopFsRelationSuites.scala  | 46 +++++++++----------
 14 files changed, 156 insertions(+), 138 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
index 6ca5390cde23e..8631e247c6c05 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
 
 import java.sql.{Timestamp, Date}
 
+import org.apache.spark.sql.test.TestSQLContext
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.rdd.ShuffledRDD
@@ -26,7 +27,6 @@ import org.apache.spark.serializer.Serializer
 import org.apache.spark.{ShuffleDependency, SparkFunSuite}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.test.TestSQLContext._
 import org.apache.spark.sql.{MyDenseVectorUDT, QueryTest}
 
 class SparkSqlSerializer2DataTypeSuite extends SparkFunSuite {
@@ -74,11 +74,13 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll
   var numShufflePartitions: Int = _
   var useSerializer2: Boolean = _
 
+  protected lazy val ctx = TestSQLContext
+
   override def beforeAll(): Unit = {
-    numShufflePartitions = conf.numShufflePartitions
-    useSerializer2 = conf.useSqlSerializer2
+    numShufflePartitions = ctx.conf.numShufflePartitions
+    useSerializer2 = ctx.conf.useSqlSerializer2
 
-    sql("set spark.sql.useSerializer2=true")
+    ctx.sql("set spark.sql.useSerializer2=true")
 
     val supportedTypes =
       Seq(StringType, BinaryType, NullType, BooleanType,
@@ -94,7 +96,7 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll
 
     // Create a RDD with all data types supported by SparkSqlSerializer2.
     val rdd =
-      sparkContext.parallelize((1 to 1000), 10).map { i =>
+      ctx.sparkContext.parallelize((1 to 1000), 10).map { i =>
         Row(
           s"str${i}: test serializer2.",
           s"binary${i}: test serializer2.".getBytes("UTF-8"),
@@ -112,15 +114,15 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll
           new Timestamp(i))
       }
 
-    createDataFrame(rdd, schema).registerTempTable("shuffle")
+    ctx.createDataFrame(rdd, schema).registerTempTable("shuffle")
 
     super.beforeAll()
   }
 
   override def afterAll(): Unit = {
-    dropTempTable("shuffle")
-    sql(s"set spark.sql.shuffle.partitions=$numShufflePartitions")
-    sql(s"set spark.sql.useSerializer2=$useSerializer2")
+    ctx.dropTempTable("shuffle")
+    ctx.sql(s"set spark.sql.shuffle.partitions=$numShufflePartitions")
+    ctx.sql(s"set spark.sql.useSerializer2=$useSerializer2")
     super.afterAll()
   }
 
@@ -141,16 +143,16 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll
   }
 
   test("key schema and value schema are not nulls") {
-    val df = sql(s"SELECT DISTINCT ${allColumns} FROM shuffle")
+    val df = ctx.sql(s"SELECT DISTINCT ${allColumns} FROM shuffle")
     checkSerializer(df.queryExecution.executedPlan, serializerClass)
     checkAnswer(
       df,
-      table("shuffle").collect())
+      ctx.table("shuffle").collect())
   }
 
   test("key schema is null") {
     val aggregations = allColumns.split(",").map(c => s"COUNT($c)").mkString(",")
-    val df = sql(s"SELECT $aggregations FROM shuffle")
+    val df = ctx.sql(s"SELECT $aggregations FROM shuffle")
     checkSerializer(df.queryExecution.executedPlan, serializerClass)
     checkAnswer(
       df,
@@ -158,15 +160,14 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll
   }
 
   test("value schema is null") {
-    val df = sql(s"SELECT col0 FROM shuffle ORDER BY col0")
+    val df = ctx.sql(s"SELECT col0 FROM shuffle ORDER BY col0")
     checkSerializer(df.queryExecution.executedPlan, serializerClass)
-    assert(
-      df.map(r => r.getString(0)).collect().toSeq ===
-      table("shuffle").select("col0").map(r => r.getString(0)).collect().sorted.toSeq)
+    assert(df.map(r => r.getString(0)).collect().toSeq ===
+      ctx.table("shuffle").select("col0").map(r => r.getString(0)).collect().sorted.toSeq)
   }
 
   test("no map output field") {
-    val df = sql(s"SELECT 1 + 1 FROM shuffle")
+    val df = ctx.sql(s"SELECT 1 + 1 FROM shuffle")
     checkSerializer(df.queryExecution.executedPlan, classOf[SparkSqlSerializer])
   }
 }
@@ -177,8 +178,8 @@ class SparkSqlSerializer2SortShuffleSuite extends SparkSqlSerializer2Suite {
     super.beforeAll()
     // Sort merge will not be triggered.
     val bypassMergeThreshold =
-      sparkContext.conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
-    sql(s"set spark.sql.shuffle.partitions=${bypassMergeThreshold-1}")
+      ctx.sparkContext.conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
+    ctx.sql(s"set spark.sql.shuffle.partitions=${bypassMergeThreshold-1}")
   }
 }
 
@@ -189,7 +190,7 @@ class SparkSqlSerializer2SortMergeShuffleSuite extends SparkSqlSerializer2Suite
     super.beforeAll()
     // To trigger the sort merge.
     val bypassMergeThreshold =
-      sparkContext.conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
-    sql(s"set spark.sql.shuffle.partitions=${bypassMergeThreshold + 1}")
+      ctx.sparkContext.conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
+    ctx.sql(s"set spark.sql.shuffle.partitions=${bypassMergeThreshold + 1}")
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
index d2d1011b8e917..a71088430bfd5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
@@ -26,18 +26,20 @@ import org.apache.spark.util.Utils
 
 class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll {
 
-  import caseInsensitiveContext._
+  import caseInsensitiveContext.sql
+
+  private lazy val sparkContext = caseInsensitiveContext.sparkContext
 
   var path: File = null
 
   override def beforeAll(): Unit = {
     path = Utils.createTempDir()
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    read.json(rdd).registerTempTable("jt")
+    caseInsensitiveContext.read.json(rdd).registerTempTable("jt")
   }
 
   override def afterAll(): Unit = {
-    dropTempTable("jt")
+    caseInsensitiveContext.dropTempTable("jt")
   }
 
   after {
@@ -59,7 +61,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll {
       sql("SELECT a, b FROM jsonTable"),
       sql("SELECT a, b FROM jt").collect())
 
-    dropTempTable("jsonTable")
+    caseInsensitiveContext.dropTempTable("jsonTable")
   }
 
   test("CREATE TEMPORARY TABLE AS SELECT based on the file without write permission") {
@@ -129,7 +131,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll {
       sql("SELECT * FROM jsonTable"),
       sql("SELECT a * 4 FROM jt").collect())
 
-    dropTempTable("jsonTable")
+    caseInsensitiveContext.dropTempTable("jsonTable")
     // Explicitly delete the data.
     if (path.exists()) Utils.deleteRecursively(path)
 
@@ -147,7 +149,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll {
       sql("SELECT * FROM jsonTable"),
       sql("SELECT b FROM jt").collect())
 
-    dropTempTable("jsonTable")
+    caseInsensitiveContext.dropTempTable("jsonTable")
   }
 
   test("CREATE TEMPORARY TABLE AS SELECT with IF NOT EXISTS is not allowed") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
index 5c3467158a01b..51d22b6a1378a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
@@ -63,19 +63,18 @@ case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlCo
 }
 
 class DDLTestSuite extends DataSourceTest {
-  import caseInsensitiveContext._
 
   before {
-      sql(
-          """
-          |CREATE TEMPORARY TABLE ddlPeople
-          |USING org.apache.spark.sql.sources.DDLScanSource
-          |OPTIONS (
-          |  From '1',
-          |  To '10',
-          |  Table 'test1'
-          |)
-          """.stripMargin)
+    caseInsensitiveContext.sql(
+      """
+      |CREATE TEMPORARY TABLE ddlPeople
+      |USING org.apache.spark.sql.sources.DDLScanSource
+      |OPTIONS (
+      |  From '1',
+      |  To '10',
+      |  Table 'test1'
+      |)
+      """.stripMargin)
   }
 
   sqlTest(
@@ -100,7 +99,8 @@ class DDLTestSuite extends DataSourceTest {
       ))
 
   test("SPARK-7686 DescribeCommand should have correct physical plan output attributes") {
-    val attributes = sql("describe ddlPeople").queryExecution.executedPlan.output
+    val attributes = caseInsensitiveContext.sql("describe ddlPeople")
+      .queryExecution.executedPlan.output
     assert(attributes.map(_.name) === Seq("col_name", "data_type", "comment"))
     assert(attributes.map(_.dataType).toSet === Set(StringType))
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala
index 24ed665c67d2e..3f77960d09246 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala
@@ -17,14 +17,18 @@
 
 package org.apache.spark.sql.sources
 
+import org.scalatest.BeforeAndAfter
+
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.CatalystConf
 import org.apache.spark.sql.test.TestSQLContext
-import org.scalatest.BeforeAndAfter
+
 
 abstract class DataSourceTest extends QueryTest with BeforeAndAfter {
   // We want to test some edge cases.
-  implicit val caseInsensitiveContext = new SQLContext(TestSQLContext.sparkContext)
+  protected implicit lazy val caseInsensitiveContext = {
+    val ctx = new SQLContext(TestSQLContext.sparkContext)
+    ctx.setConf(SQLConf.CASE_SENSITIVE, "false")
+    ctx
+  }
 
-  caseInsensitiveContext.setConf(SQLConf.CASE_SENSITIVE, "false")
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala
index db94b1f3e8926..81b3a0f0c5b3a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala
@@ -97,7 +97,7 @@ object FiltersPushed {
 
 class FilteredScanSuite extends DataSourceTest {
 
-  import caseInsensitiveContext._
+  import caseInsensitiveContext.sql
 
   before {
     sql(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index 6f375ef36237d..0b7c46c482c88 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -26,14 +26,16 @@ import org.apache.spark.util.Utils
 
 class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
 
-  import caseInsensitiveContext._
+  import caseInsensitiveContext.sql
+
+  private lazy val sparkContext = caseInsensitiveContext.sparkContext
 
   var path: File = null
 
   override def beforeAll: Unit = {
     path = Utils.createTempDir()
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    read.json(rdd).registerTempTable("jt")
+    caseInsensitiveContext.read.json(rdd).registerTempTable("jt")
     sql(
       s"""
         |CREATE TEMPORARY TABLE jsonTable (a int, b string)
@@ -45,8 +47,8 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
   }
 
   override def afterAll: Unit = {
-    dropTempTable("jsonTable")
-    dropTempTable("jt")
+    caseInsensitiveContext.dropTempTable("jsonTable")
+    caseInsensitiveContext.dropTempTable("jt")
     Utils.deleteRecursively(path)
   }
 
@@ -109,7 +111,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
 
     // Writing the table to less part files.
     val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""), 5)
-    read.json(rdd1).registerTempTable("jt1")
+    caseInsensitiveContext.read.json(rdd1).registerTempTable("jt1")
     sql(
       s"""
          |INSERT OVERWRITE TABLE jsonTable SELECT a, b FROM jt1
@@ -121,7 +123,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
 
     // Writing the table to more part files.
     val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""), 10)
-    read.json(rdd2).registerTempTable("jt2")
+    caseInsensitiveContext.read.json(rdd2).registerTempTable("jt2")
     sql(
       s"""
          |INSERT OVERWRITE TABLE jsonTable SELECT a, b FROM jt2
@@ -140,8 +142,8 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
       (1 to 10).map(i => Row(i * 10, s"str$i"))
     )
 
-    dropTempTable("jt1")
-    dropTempTable("jt2")
+    caseInsensitiveContext.dropTempTable("jt1")
+    caseInsensitiveContext.dropTempTable("jt2")
   }
 
   test("INSERT INTO not supported for JSONRelation for now") {
@@ -154,13 +156,14 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
   }
 
   test("save directly to the path of a JSON table") {
-    table("jt").selectExpr("a * 5 as a", "b").write.mode(SaveMode.Overwrite).json(path.toString)
+    caseInsensitiveContext.table("jt").selectExpr("a * 5 as a", "b")
+      .write.mode(SaveMode.Overwrite).json(path.toString)
     checkAnswer(
       sql("SELECT a, b FROM jsonTable"),
       (1 to 10).map(i => Row(i * 5, s"str$i"))
     )
 
-    table("jt").write.mode(SaveMode.Overwrite).json(path.toString)
+    caseInsensitiveContext.table("jt").write.mode(SaveMode.Overwrite).json(path.toString)
     checkAnswer(
       sql("SELECT a, b FROM jsonTable"),
       (1 to 10).map(i => Row(i, s"str$i"))
@@ -181,7 +184,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
 
   test("Caching")  {
     // Cached Query Execution
-    cacheTable("jsonTable")
+    caseInsensitiveContext.cacheTable("jsonTable")
     assertCached(sql("SELECT * FROM jsonTable"))
     checkAnswer(
       sql("SELECT * FROM jsonTable"),
@@ -220,7 +223,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
       sql("SELECT a * 2, b FROM jt").collect())
 
     // Verify uncaching
-    uncacheTable("jsonTable")
+    caseInsensitiveContext.uncacheTable("jsonTable")
     assertCached(sql("SELECT * FROM jsonTable"), 0)
   }
 
@@ -251,6 +254,6 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll {
       "It is not allowed to insert into a table that is not an InsertableRelation."
     )
 
-    dropTempTable("oneToTen")
+    caseInsensitiveContext.dropTempTable("oneToTen")
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala
index c2bc52e2120c1..257526feab945 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala
@@ -52,10 +52,9 @@ case class SimplePrunedScan(from: Int, to: Int)(@transient val sqlContext: SQLCo
 }
 
 class PrunedScanSuite extends DataSourceTest {
-  import caseInsensitiveContext._
 
   before {
-    sql(
+    caseInsensitiveContext.sql(
       """
         |CREATE TEMPORARY TABLE oneToTenPruned
         |USING org.apache.spark.sql.sources.PrunedScanSource
@@ -115,7 +114,7 @@ class PrunedScanSuite extends DataSourceTest {
 
   def testPruning(sqlString: String, expectedColumns: String*): Unit = {
     test(s"Columns output ${expectedColumns.mkString(",")}: $sqlString") {
-      val queryExecution = sql(sqlString).queryExecution
+      val queryExecution = caseInsensitiveContext.sql(sqlString).queryExecution
       val rawPlan = queryExecution.executedPlan.collect {
         case p: execution.PhysicalRDD => p
       } match {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
index 274c652dd14d6..b032515a9d28c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
@@ -27,7 +27,9 @@ import org.apache.spark.util.Utils
 
 class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll {
 
-  import caseInsensitiveContext._
+  import caseInsensitiveContext.sql
+
+  private lazy val sparkContext = caseInsensitiveContext.sparkContext
 
   var originalDefaultSource: String = null
 
@@ -36,60 +38,63 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll {
   var df: DataFrame = null
 
   override def beforeAll(): Unit = {
-    originalDefaultSource = conf.defaultDataSourceName
+    originalDefaultSource = caseInsensitiveContext.conf.defaultDataSourceName
 
     path = Utils.createTempDir()
     path.delete()
 
     val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
-    df = read.json(rdd)
+    df = caseInsensitiveContext.read.json(rdd)
     df.registerTempTable("jsonTable")
   }
 
   override def afterAll(): Unit = {
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource)
+    caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource)
   }
 
   after {
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource)
+    caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource)
     Utils.deleteRecursively(path)
   }
 
   def checkLoad(): Unit = {
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
-    checkAnswer(read.load(path.toString), df.collect())
+    caseInsensitiveContext.conf.setConf(
+      SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
+    checkAnswer(caseInsensitiveContext.read.load(path.toString), df.collect())
 
     // Test if we can pick up the data source name passed in load.
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
-    checkAnswer(read.format("json").load(path.toString), df.collect())
-    checkAnswer(read.format("json").load(path.toString), df.collect())
+    caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
+    checkAnswer(caseInsensitiveContext.read.format("json").load(path.toString), df.collect())
+    checkAnswer(caseInsensitiveContext.read.format("json").load(path.toString), df.collect())
     val schema = StructType(StructField("b", StringType, true) :: Nil)
     checkAnswer(
-      read.format("json").schema(schema).load(path.toString),
+      caseInsensitiveContext.read.format("json").schema(schema).load(path.toString),
       sql("SELECT b FROM jsonTable").collect())
   }
 
   test("save with path and load") {
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
+    caseInsensitiveContext.conf.setConf(
+      SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
     df.write.save(path.toString)
     checkLoad()
   }
 
   test("save with string mode and path, and load") {
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
+    caseInsensitiveContext.conf.setConf(
+      SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json")
     path.createNewFile()
     df.write.mode("overwrite").save(path.toString)
     checkLoad()
   }
 
   test("save with path and datasource, and load") {
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
+    caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
     df.write.json(path.toString)
     checkLoad()
   }
 
   test("save with data source and options, and load") {
-    conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
+    caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name")
     df.write.mode(SaveMode.ErrorIfExists).json(path.toString)
     checkLoad()
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
index 77af04a491742..5d4ecd810862c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
@@ -88,9 +88,9 @@ case class AllDataTypesScan(
 }
 
 class TableScanSuite extends DataSourceTest {
-  import caseInsensitiveContext._
+  import caseInsensitiveContext.sql
 
-  var tableWithSchemaExpected = (1 to 10).map { i =>
+  private lazy val tableWithSchemaExpected = (1 to 10).map { i =>
     Row(
       s"str_$i",
       s"str_$i",
@@ -215,7 +215,7 @@ class TableScanSuite extends DataSourceTest {
       Nil
     )
 
-    assert(expectedSchema == table("tableWithSchema").schema)
+    assert(expectedSchema == caseInsensitiveContext.table("tableWithSchema").schema)
 
     checkAnswer(
       sql(
@@ -270,7 +270,7 @@ class TableScanSuite extends DataSourceTest {
 
   test("Caching")  {
     // Cached Query Execution
-    cacheTable("oneToTen")
+    caseInsensitiveContext.cacheTable("oneToTen")
     assertCached(sql("SELECT * FROM oneToTen"))
     checkAnswer(
       sql("SELECT * FROM oneToTen"),
@@ -297,7 +297,7 @@ class TableScanSuite extends DataSourceTest {
       (2 to 10).map(i => Row(i, i - 1)).toSeq)
 
     // Verify uncaching
-    uncacheTable("oneToTen")
+    caseInsensitiveContext.uncacheTable("oneToTen")
     assertCached(sql("SELECT * FROM oneToTen"), 0)
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
index 4990092df6a99..017bc2adc103b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
@@ -20,16 +20,17 @@ package org.apache.spark.sql.hive
 import com.google.common.io.Files
 
 import org.apache.spark.sql.{QueryTest, _}
-import org.apache.spark.sql.hive.test.TestHive
-import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.util.Utils
 
 
 class QueryPartitionSuite extends QueryTest {
-  import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+  private lazy val ctx = org.apache.spark.sql.hive.test.TestHive
+  import ctx.implicits._
+  import ctx.sql
 
   test("SPARK-5068: query data when path doesn't exist"){
-    val testData = TestHive.sparkContext.parallelize(
+    val testData = ctx.sparkContext.parallelize(
       (1 to 10).map(i => TestData(i, i.toString))).toDF()
     testData.registerTempTable("testData")
 
@@ -48,8 +49,8 @@ class QueryPartitionSuite extends QueryTest {
 
     // test for the exist path
     checkAnswer(sql("select key,value from table_with_partition"),
-      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect
-        ++ testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect)
+      testData.toDF.collect ++ testData.toDF.collect
+        ++ testData.toDF.collect ++ testData.toDF.collect)
 
     // delete the path of one partition
     tmpDir.listFiles
@@ -58,8 +59,7 @@ class QueryPartitionSuite extends QueryTest {
 
     // test for after delete the path
     checkAnswer(sql("select key,value from table_with_partition"),
-      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect
-        ++ testData.toSchemaRDD.collect)
+      testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect)
 
     sql("DROP TABLE table_with_partition")
     sql("DROP TABLE createAndInsertTest")
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala
index a492ecf203d17..93dcb10f7a296 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala
@@ -19,12 +19,11 @@ package org.apache.spark.sql.hive
 
 import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.JavaSerializer
-import org.apache.spark.sql.hive.test.TestHive
 
 class SerializationSuite extends SparkFunSuite {
 
   test("[SPARK-5840] HiveContext should be serializable") {
-    val hiveContext = TestHive
+    val hiveContext = org.apache.spark.sql.hive.test.TestHive
     hiveContext.hiveconf
     val serializer = new JavaSerializer(new SparkConf()).newInstance()
     val bytes = serializer.serialize(hiveContext)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index e16e530555aee..78c94e6490e36 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -23,13 +23,18 @@ import scala.reflect.ClassTag
 
 import org.apache.spark.sql.{Row, SQLConf, QueryTest}
 import org.apache.spark.sql.execution.joins._
-import org.apache.spark.sql.hive.test.TestHive
-import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.execution._
 
 class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
-  TestHive.reset()
-  TestHive.cacheTables = false
+
+  private lazy val ctx: HiveContext = {
+    val ctx = org.apache.spark.sql.hive.test.TestHive
+    ctx.reset()
+    ctx.cacheTables = false
+    ctx
+  }
+
+  import ctx.sql
 
   test("parse analyze commands") {
     def assertAnalyzeCommand(analyzeCommand: String, c: Class[_]) {
@@ -72,7 +77,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
 
   test("analyze MetastoreRelations") {
     def queryTotalSize(tableName: String): BigInt =
-      catalog.lookupRelation(Seq(tableName)).statistics.sizeInBytes
+      ctx.catalog.lookupRelation(Seq(tableName)).statistics.sizeInBytes
 
     // Non-partitioned table
     sql("CREATE TABLE analyzeTable (key STRING, value STRING)").collect()
@@ -106,7 +111,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
         |SELECT * FROM src
       """.stripMargin).collect()
 
-    assert(queryTotalSize("analyzeTable_part") === conf.defaultSizeInBytes)
+    assert(queryTotalSize("analyzeTable_part") === ctx.conf.defaultSizeInBytes)
 
     sql("ANALYZE TABLE analyzeTable_part COMPUTE STATISTICS noscan")
 
@@ -117,9 +122,9 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
     // Try to analyze a temp table
     sql("""SELECT * FROM src""").registerTempTable("tempTable")
     intercept[UnsupportedOperationException] {
-      analyze("tempTable")
+      ctx.analyze("tempTable")
     }
-    catalog.unregisterTable(Seq("tempTable"))
+    ctx.catalog.unregisterTable(Seq("tempTable"))
   }
 
   test("estimates the size of a test MetastoreRelation") {
@@ -147,8 +152,8 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
       val sizes = df.queryExecution.analyzed.collect {
         case r if ct.runtimeClass.isAssignableFrom(r.getClass) => r.statistics.sizeInBytes
       }
-      assert(sizes.size === 2 && sizes(0) <= conf.autoBroadcastJoinThreshold
-        && sizes(1) <= conf.autoBroadcastJoinThreshold,
+      assert(sizes.size === 2 && sizes(0) <= ctx.conf.autoBroadcastJoinThreshold
+        && sizes(1) <= ctx.conf.autoBroadcastJoinThreshold,
         s"query should contain two relations, each of which has size smaller than autoConvertSize")
 
       // Using `sparkPlan` because for relevant patterns in HashJoin to be
@@ -159,8 +164,8 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
 
       checkAnswer(df, expectedAnswer) // check correctness of output
 
-      TestHive.conf.settings.synchronized {
-        val tmp = conf.autoBroadcastJoinThreshold
+      ctx.conf.settings.synchronized {
+        val tmp = ctx.conf.autoBroadcastJoinThreshold
 
         sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1""")
         df = sql(query)
@@ -203,8 +208,8 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
         .isAssignableFrom(r.getClass) =>
         r.statistics.sizeInBytes
     }
-    assert(sizes.size === 2 && sizes(1) <= conf.autoBroadcastJoinThreshold
-      && sizes(0) <= conf.autoBroadcastJoinThreshold,
+    assert(sizes.size === 2 && sizes(1) <= ctx.conf.autoBroadcastJoinThreshold
+      && sizes(0) <= ctx.conf.autoBroadcastJoinThreshold,
       s"query should contain two relations, each of which has size smaller than autoConvertSize")
 
     // Using `sparkPlan` because for relevant patterns in HashJoin to be
@@ -217,8 +222,8 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
 
     checkAnswer(df, answer) // check correctness of output
 
-    TestHive.conf.settings.synchronized {
-      val tmp = conf.autoBroadcastJoinThreshold
+    ctx.conf.settings.synchronized {
+      val tmp = ctx.conf.autoBroadcastJoinThreshold
 
       sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1")
       df = sql(leftSemiJoinQuery)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
index 8245047626d57..4056dee777574 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
@@ -17,20 +17,20 @@
 
 package org.apache.spark.sql.hive
 
-/* Implicits */
-
 import org.apache.spark.sql.QueryTest
-import org.apache.spark.sql.hive.test.TestHive._
 
 case class FunctionResult(f1: String, f2: String)
 
 class UDFSuite extends QueryTest {
+
+  private lazy val ctx = org.apache.spark.sql.hive.test.TestHive
+
   test("UDF case insensitive") {
-    udf.register("random0", () => { Math.random() })
-    udf.register("RANDOM1", () => { Math.random() })
-    udf.register("strlenScala", (_: String).length + (_: Int))
-    assert(sql("SELECT RANDOM0() FROM src LIMIT 1").head().getDouble(0) >= 0.0)
-    assert(sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0) >= 0.0)
-    assert(sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0) === 5)
+    ctx.udf.register("random0", () => { Math.random() })
+    ctx.udf.register("RANDOM1", () => { Math.random() })
+    ctx.udf.register("strlenScala", (_: String).length + (_: Int))
+    assert(ctx.sql("SELECT RANDOM0() FROM src LIMIT 1").head().getDouble(0) >= 0.0)
+    assert(ctx.sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0) >= 0.0)
+    assert(ctx.sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0) === 5)
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index 74095426741e3..8787663a98f8f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -30,9 +30,9 @@ import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 
 abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
-  override val sqlContext: SQLContext = TestHive
+  override lazy val sqlContext: SQLContext = TestHive
 
-  import sqlContext._
+  import sqlContext.sql
   import sqlContext.implicits._
 
   val dataSourceName = classOf[SimpleTextSource].getCanonicalName
@@ -43,19 +43,19 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         StructField("a", IntegerType, nullable = false),
         StructField("b", StringType, nullable = false)))
 
-  val testDF = (1 to 3).map(i => (i, s"val_$i")).toDF("a", "b")
+  lazy val testDF = (1 to 3).map(i => (i, s"val_$i")).toDF("a", "b")
 
-  val partitionedTestDF1 = (for {
+  lazy val partitionedTestDF1 = (for {
     i <- 1 to 3
     p2 <- Seq("foo", "bar")
   } yield (i, s"val_$i", 1, p2)).toDF("a", "b", "p1", "p2")
 
-  val partitionedTestDF2 = (for {
+  lazy val partitionedTestDF2 = (for {
     i <- 1 to 3
     p2 <- Seq("foo", "bar")
   } yield (i, s"val_$i", 2, p2)).toDF("a", "b", "p1", "p2")
 
-  val partitionedTestDF = partitionedTestDF1.unionAll(partitionedTestDF2)
+  lazy val partitionedTestDF = partitionedTestDF1.unionAll(partitionedTestDF2)
 
   def checkQueries(df: DataFrame): Unit = {
     // Selects everything
@@ -103,7 +103,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       testDF.write.mode(SaveMode.Overwrite).format(dataSourceName).save(file.getCanonicalPath)
 
       checkAnswer(
-        read.format(dataSourceName)
+        sqlContext.read.format(dataSourceName)
           .option("path", file.getCanonicalPath)
           .option("dataSchema", dataSchema.json)
           .load(),
@@ -117,7 +117,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       testDF.write.mode(SaveMode.Append).format(dataSourceName).save(file.getCanonicalPath)
 
       checkAnswer(
-        read.format(dataSourceName)
+        sqlContext.read.format(dataSourceName)
           .option("dataSchema", dataSchema.json)
           .load(file.getCanonicalPath).orderBy("a"),
         testDF.unionAll(testDF).orderBy("a").collect())
@@ -151,7 +151,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         .save(file.getCanonicalPath)
 
       checkQueries(
-        read.format(dataSourceName)
+        sqlContext.read.format(dataSourceName)
           .option("dataSchema", dataSchema.json)
           .load(file.getCanonicalPath))
     }
@@ -172,7 +172,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         .save(file.getCanonicalPath)
 
       checkAnswer(
-        read.format(dataSourceName)
+        sqlContext.read.format(dataSourceName)
           .option("dataSchema", dataSchema.json)
           .load(file.getCanonicalPath),
         partitionedTestDF.collect())
@@ -194,7 +194,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         .save(file.getCanonicalPath)
 
       checkAnswer(
-        read.format(dataSourceName)
+        sqlContext.read.format(dataSourceName)
           .option("dataSchema", dataSchema.json)
           .load(file.getCanonicalPath),
         partitionedTestDF.unionAll(partitionedTestDF).collect())
@@ -216,7 +216,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         .save(file.getCanonicalPath)
 
       checkAnswer(
-        read.format(dataSourceName)
+        sqlContext.read.format(dataSourceName)
           .option("dataSchema", dataSchema.json)
           .load(file.getCanonicalPath),
         partitionedTestDF.collect())
@@ -252,7 +252,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       .saveAsTable("t")
 
     withTable("t") {
-      checkAnswer(table("t"), testDF.collect())
+      checkAnswer(sqlContext.table("t"), testDF.collect())
     }
   }
 
@@ -261,7 +261,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
     testDF.write.format(dataSourceName).mode(SaveMode.Append).saveAsTable("t")
 
     withTable("t") {
-      checkAnswer(table("t"), testDF.unionAll(testDF).orderBy("a").collect())
+      checkAnswer(sqlContext.table("t"), testDF.unionAll(testDF).orderBy("a").collect())
     }
   }
 
@@ -280,7 +280,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
 
     withTempTable("t") {
       testDF.write.format(dataSourceName).mode(SaveMode.Ignore).saveAsTable("t")
-      assert(table("t").collect().isEmpty)
+      assert(sqlContext.table("t").collect().isEmpty)
     }
   }
 
@@ -291,7 +291,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       .saveAsTable("t")
 
     withTable("t") {
-      checkQueries(table("t"))
+      checkQueries(sqlContext.table("t"))
     }
   }
 
@@ -311,7 +311,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       .saveAsTable("t")
 
     withTable("t") {
-      checkAnswer(table("t"), partitionedTestDF.collect())
+      checkAnswer(sqlContext.table("t"), partitionedTestDF.collect())
     }
   }
 
@@ -331,7 +331,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       .saveAsTable("t")
 
     withTable("t") {
-      checkAnswer(table("t"), partitionedTestDF.unionAll(partitionedTestDF).collect())
+      checkAnswer(sqlContext.table("t"), partitionedTestDF.unionAll(partitionedTestDF).collect())
     }
   }
 
@@ -351,7 +351,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       .saveAsTable("t")
 
     withTable("t") {
-      checkAnswer(table("t"), partitionedTestDF.collect())
+      checkAnswer(sqlContext.table("t"), partitionedTestDF.collect())
     }
   }
 
@@ -400,7 +400,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         .partitionBy("p1", "p2")
         .saveAsTable("t")
 
-      assert(table("t").collect().isEmpty)
+      assert(sqlContext.table("t").collect().isEmpty)
     }
   }
 
@@ -412,7 +412,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         .partitionBy("p1", "p2")
         .save(file.getCanonicalPath)
 
-      val df = read
+      val df = sqlContext.read
         .format(dataSourceName)
         .option("dataSchema", dataSchema.json)
         .load(s"${file.getCanonicalPath}/p1=*/p2=???")
@@ -452,7 +452,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
         .saveAsTable("t")
 
       withTempTable("t") {
-        checkAnswer(table("t"), input.collect())
+        checkAnswer(sqlContext.table("t"), input.collect())
       }
     }
   }
@@ -467,7 +467,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
       .saveAsTable("t")
 
     withTable("t") {
-      checkAnswer(table("t"), df.select('b, 'c, 'a).collect())
+      checkAnswer(sqlContext.table("t"), df.select('b, 'c, 'a).collect())
     }
   }
 }

From a8077e5cfc48bdb9f0641d62fe6c01cc8c4f1694 Mon Sep 17 00:00:00 2001
From: Xu Tingjun <xutingjun@huawei.com>
Date: Sat, 6 Jun 2015 09:53:53 +0100
Subject: [PATCH 387/525] [SPARK-6973] remove skipped stage ID from completed
 set on the allJobsPage

Though totalStages = allStages - skippedStages is understandable. But consider the problem [SPARK-6973], I think totalStages = allStages is more reasonable. Like "2/1 (2 failed) (1 skipped)", this item also shows the skipped num, it also will be understandable.

Author: Xu Tingjun <xutingjun@huawei.com>
Author: Xutingjun <xutingjun@huawei.com>
Author: meiyoula <1039320815@qq.com>

Closes #5550 from XuTingjun/allJobsPage and squashes the following commits:

a742541 [Xu Tingjun] delete the loop
40ce94b [Xutingjun] remove stage id from completed set if it retries again
6459238 [meiyoula] delete space
9e23c71 [Xu Tingjun] recover numSkippedStages
b987ea7 [Xutingjun] delete skkiped stages from completed set
47525c6 [Xu Tingjun] modify total stages/tasks on the allJobsPage
---
 .../org/apache/spark/ui/jobs/JobProgressListener.scala     | 7 ++++++-
 core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala  | 3 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
index 1d31fce4c697b..730f9806e518e 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
@@ -282,7 +282,9 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging {
     ) {
       jobData.numActiveStages -= 1
       if (stage.failureReason.isEmpty) {
-        jobData.completedStageIndices.add(stage.stageId)
+        if (!stage.submissionTime.isEmpty) {
+          jobData.completedStageIndices.add(stage.stageId)
+        }
       } else {
         jobData.numFailedStages += 1
       }
@@ -315,6 +317,9 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging {
       jobData <- jobIdToData.get(jobId)
     ) {
       jobData.numActiveStages += 1
+
+      // If a stage retries again, it should be removed from completedStageIndices set
+      jobData.completedStageIndices.remove(stage.stageId)
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala
index 3d96113aa5fe9..f008d40180611 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala
@@ -22,6 +22,7 @@ import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo}
 import org.apache.spark.util.collection.OpenHashSet
 
+import scala.collection.mutable
 import scala.collection.mutable.HashMap
 
 private[spark] object UIData {
@@ -63,7 +64,7 @@ private[spark] object UIData {
     /* Stages */
     var numActiveStages: Int = 0,
     // This needs to be a set instead of a simple count to prevent double-counting of rerun stages:
-    var completedStageIndices: OpenHashSet[Int] = new OpenHashSet[Int](),
+    var completedStageIndices: mutable.HashSet[Int] = new mutable.HashSet[Int](),
     var numSkippedStages: Int = 0,
     var numFailedStages: Int = 0
   )

From 16fc49617e1dfcbe9122b224f7f63b7bfddb36ce Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sat, 6 Jun 2015 17:23:12 +0800
Subject: [PATCH 388/525] [SPARK-8079] [SQL] Makes InsertIntoHadoopFsRelation
 job/task abortion more robust

As described in SPARK-8079, when writing a DataFrame to a `HadoopFsRelation`, if `HadoopFsRelation.prepareForWriteJob` throws exception, an unexpected NPE will be thrown during job abortion. (This issue doesn't bring much damage since the job is failing anyway.)

This PR makes the job/task abortion logic in `InsertIntoHadoopFsRelation` more robust to avoid such confusing exceptions.

Author: Cheng Lian <lian@databricks.com>

Closes #6612 from liancheng/spark-8079 and squashes the following commits:

87cd81e [Cheng Lian] Addresses @rxin's comment
1864c75 [Cheng Lian] Addresses review comments
9e6dbb3 [Cheng Lian] Makes InsertIntoHadoopFsRelation job/task abortion more robust
---
 .../apache/spark/sql/sources/commands.scala   | 93 ++++++++++++-------
 .../sql/sources/hadoopFsRelationSuites.scala  | 15 +++
 2 files changed, 76 insertions(+), 32 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index e9932c09107db..bd3aad6631748 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
-import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext, SaveMode}
@@ -127,8 +127,11 @@ private[sql] case class InsertIntoHadoopFsRelation(
     val needsConversion = relation.needConversion
     val dataSchema = relation.dataSchema
 
+    // This call shouldn't be put into the `try` block below because it only initializes and
+    // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called.
+    writerContainer.driverSideSetup()
+
     try {
-      writerContainer.driverSideSetup()
       df.sqlContext.sparkContext.runJob(df.queryExecution.executedPlan.execute(), writeRows _)
       writerContainer.commitJob()
       relation.refresh()
@@ -139,9 +142,10 @@ private[sql] case class InsertIntoHadoopFsRelation(
     }
 
     def writeRows(taskContext: TaskContext, iterator: Iterator[Row]): Unit = {
-      writerContainer.executorSideSetup(taskContext)
-
+      // If anything below fails, we should abort the task.
       try {
+        writerContainer.executorSideSetup(taskContext)
+
         if (needsConversion) {
           val converter = CatalystTypeConverters.createToScalaConverter(dataSchema)
           while (iterator.hasNext) {
@@ -154,6 +158,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
             writerContainer.outputWriterForRow(row).write(row)
           }
         }
+
         writerContainer.commitTask()
       } catch { case cause: Throwable =>
         logError("Aborting task.", cause)
@@ -191,8 +196,11 @@ private[sql] case class InsertIntoHadoopFsRelation(
     val (partitionOutput, dataOutput) = output.partition(a => partitionColumns.contains(a.name))
     val codegenEnabled = df.sqlContext.conf.codegenEnabled
 
+    // This call shouldn't be put into the `try` block below because it only initializes and
+    // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called.
+    writerContainer.driverSideSetup()
+
     try {
-      writerContainer.driverSideSetup()
       df.sqlContext.sparkContext.runJob(df.queryExecution.executedPlan.execute(), writeRows _)
       writerContainer.commitJob()
       relation.refresh()
@@ -203,32 +211,39 @@ private[sql] case class InsertIntoHadoopFsRelation(
     }
 
     def writeRows(taskContext: TaskContext, iterator: Iterator[Row]): Unit = {
-      writerContainer.executorSideSetup(taskContext)
-
-      val partitionProj = newProjection(codegenEnabled, partitionOutput, output)
-      val dataProj = newProjection(codegenEnabled, dataOutput, output)
-
-      if (needsConversion) {
-        val converter = CatalystTypeConverters.createToScalaConverter(dataSchema)
-        while (iterator.hasNext) {
-          val row = iterator.next()
-          val partitionPart = partitionProj(row)
-          val dataPart = dataProj(row)
-          val convertedDataPart = converter(dataPart).asInstanceOf[Row]
-          writerContainer.outputWriterForRow(partitionPart).write(convertedDataPart)
-        }
-      } else {
-        val partitionSchema = StructType.fromAttributes(partitionOutput)
-        val converter = CatalystTypeConverters.createToScalaConverter(partitionSchema)
-        while (iterator.hasNext) {
-          val row = iterator.next()
-          val partitionPart = converter(partitionProj(row)).asInstanceOf[Row]
-          val dataPart = dataProj(row)
-          writerContainer.outputWriterForRow(partitionPart).write(dataPart)
+      // If anything below fails, we should abort the task.
+      try {
+        writerContainer.executorSideSetup(taskContext)
+
+        val partitionProj = newProjection(codegenEnabled, partitionOutput, output)
+        val dataProj = newProjection(codegenEnabled, dataOutput, output)
+
+        if (needsConversion) {
+          val converter = CatalystTypeConverters.createToScalaConverter(dataSchema)
+          while (iterator.hasNext) {
+            val row = iterator.next()
+            val partitionPart = partitionProj(row)
+            val dataPart = dataProj(row)
+            val convertedDataPart = converter(dataPart).asInstanceOf[Row]
+            writerContainer.outputWriterForRow(partitionPart).write(convertedDataPart)
+          }
+        } else {
+          val partitionSchema = StructType.fromAttributes(partitionOutput)
+          val converter = CatalystTypeConverters.createToScalaConverter(partitionSchema)
+          while (iterator.hasNext) {
+            val row = iterator.next()
+            val partitionPart = converter(partitionProj(row)).asInstanceOf[Row]
+            val dataPart = dataProj(row)
+            writerContainer.outputWriterForRow(partitionPart).write(dataPart)
+          }
         }
-      }
 
-      writerContainer.commitTask()
+        writerContainer.commitTask()
+      } catch { case cause: Throwable =>
+        logError("Aborting task.", cause)
+        writerContainer.abortTask()
+        throw new SparkException("Task failed while writing rows.", cause)
+      }
     }
   }
 
@@ -283,7 +298,12 @@ private[sql] abstract class BaseWriterContainer(
     setupIDs(0, 0, 0)
     setupConf()
     taskAttemptContext = newTaskAttemptContext(serializableConf.value, taskAttemptId)
+
+    // This preparation must happen before initializing output format and output committer, since
+    // their initialization involves the job configuration, which can be potentially decorated in
+    // `relation.prepareJobForWrite`.
     outputWriterFactory = relation.prepareJobForWrite(job)
+
     outputFormatClass = job.getOutputFormatClass
     outputCommitter = newOutputCommitter(taskAttemptContext)
     outputCommitter.setupJob(jobContext)
@@ -359,7 +379,9 @@ private[sql] abstract class BaseWriterContainer(
   }
 
   def abortTask(): Unit = {
-    outputCommitter.abortTask(taskAttemptContext)
+    if (outputCommitter != null) {
+      outputCommitter.abortTask(taskAttemptContext)
+    }
     logError(s"Task attempt $taskAttemptId aborted.")
   }
 
@@ -369,7 +391,9 @@ private[sql] abstract class BaseWriterContainer(
   }
 
   def abortJob(): Unit = {
-    outputCommitter.abortJob(jobContext, JobStatus.State.FAILED)
+    if (outputCommitter != null) {
+      outputCommitter.abortJob(jobContext, JobStatus.State.FAILED)
+    }
     logError(s"Job $jobId aborted.")
   }
 }
@@ -390,6 +414,7 @@ private[sql] class DefaultWriterContainer(
 
   override def commitTask(): Unit = {
     try {
+      assert(writer != null, "OutputWriter instance should have been initialized")
       writer.close()
       super.commitTask()
     } catch {
@@ -401,7 +426,9 @@ private[sql] class DefaultWriterContainer(
 
   override def abortTask(): Unit = {
     try {
-      writer.close()
+      if (writer != null) {
+        writer.close()
+      }
     } finally {
       super.abortTask()
     }
@@ -445,6 +472,7 @@ private[sql] class DynamicPartitionWriterContainer(
   override def commitTask(): Unit = {
     try {
       outputWriters.values.foreach(_.close())
+      outputWriters.clear()
       super.commitTask()
     } catch { case cause: Throwable =>
       super.abortTask()
@@ -455,6 +483,7 @@ private[sql] class DynamicPartitionWriterContainer(
   override def abortTask(): Unit = {
     try {
       outputWriters.values.foreach(_.close())
+      outputWriters.clear()
     } finally {
       super.abortTask()
     }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index 8787663a98f8f..76469d7a3d6a5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -594,4 +594,19 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
       checkAnswer(read.format("parquet").load(path), df)
     }
   }
+
+  test("SPARK-8079: Avoid NPE thrown from BaseWriterContainer.abortJob") {
+    withTempPath { dir =>
+      intercept[AnalysisException] {
+        // Parquet doesn't allow field names with spaces.  Here we are intentionally making an
+        // exception thrown from the `ParquetRelation2.prepareForWriteJob()` method to trigger
+        // the bug.  Please refer to spark-8079 for more details.
+        range(1, 10)
+          .withColumnRenamed("id", "a b")
+          .write
+          .format("parquet")
+          .save(dir.getCanonicalPath)
+      }
+    }
+  }
 }

From 5aa804f3c6485670937a658ce8207c2317c6a506 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Sat, 6 Jun 2015 14:52:14 -0700
Subject: [PATCH 389/525] [SPARK-7639] [PYSPARK] [MLLIB] Python API for
 KernelDensity

Python API for KernelDensity

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6387 from MechCoder/spark-7639 and squashes the following commits:

17abc62 [MechCoder] add tests
2de6540 [MechCoder] style tests
bf4acc0 [MechCoder] Added doctests
84359d5 [MechCoder] [SPARK-7639] Python API for KernelDensity
---
 .../mllib/api/python/PythonMLLibAPI.scala     | 12 +++-
 python/pyspark/mllib/stat/KernelDensity.py    | 61 +++++++++++++++++++
 python/pyspark/mllib/stat/__init__.py         |  3 +-
 python/run-tests                              |  1 +
 4 files changed, 75 insertions(+), 2 deletions(-)
 create mode 100644 python/pyspark/mllib/stat/KernelDensity.py

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 16f3131796709..8f66bc808a007 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -43,7 +43,8 @@ import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.stat.correlation.CorrelationNames
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.stat.test.ChiSqTestResult
-import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
+import org.apache.spark.mllib.stat.{
+  KernelDensity, MultivariateStatisticalSummary, Statistics}
 import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy, Strategy}
 import org.apache.spark.mllib.tree.impurity._
 import org.apache.spark.mllib.tree.loss.Losses
@@ -945,6 +946,15 @@ private[python] class PythonMLLibAPI extends Serializable {
       r => (r.getSeq(0).toArray[Any], r.getSeq(1).toArray[Any])))
   }
 
+  /**
+   * Java stub for the estimate method of KernelDensity
+   */
+  def estimateKernelDensity(
+      sample: JavaRDD[Double],
+      bandwidth: Double, points: java.util.ArrayList[Double]): Array[Double] = {
+    return new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
+      points.asScala.toArray)
+  }
 
 }
 
diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py
new file mode 100644
index 0000000000000..7da921976d4d2
--- /dev/null
+++ b/python/pyspark/mllib/stat/KernelDensity.py
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+
+if sys.version > '3':
+    xrange = range
+
+import numpy as np
+
+from pyspark.mllib.common import callMLlibFunc
+from pyspark.rdd import RDD
+
+
+class KernelDensity(object):
+    """
+    .. note:: Experimental
+
+    Estimate probability density at required points given a RDD of samples
+    from the population.
+
+    >>> kd = KernelDensity()
+    >>> sample = sc.parallelize([0.0, 1.0])
+    >>> kd.setSample(sample)
+    >>> kd.estimate([0.0, 1.0])
+    array([ 0.12938758,  0.12938758])
+    """
+    def __init__(self):
+        self._bandwidth = 1.0
+        self._sample = None
+
+    def setBandwidth(self, bandwidth):
+        """Set bandwidth of each sample. Defaults to 1.0"""
+        self._bandwidth = bandwidth
+
+    def setSample(self, sample):
+        """Set sample points from the population. Should be a RDD"""
+        if not isinstance(sample, RDD):
+            raise TypeError("samples should be a RDD, received %s" % type(sample))
+        self._sample = sample
+
+    def estimate(self, points):
+        """Estimate the probability density at points"""
+        points = list(points)
+        densities = callMLlibFunc(
+            "estimateKernelDensity", self._sample, self._bandwidth, points)
+        return np.asarray(densities)
diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py
index e3e128513e0d7..c8a721d3fe41c 100644
--- a/python/pyspark/mllib/stat/__init__.py
+++ b/python/pyspark/mllib/stat/__init__.py
@@ -22,6 +22,7 @@
 from pyspark.mllib.stat._statistics import *
 from pyspark.mllib.stat.distribution import MultivariateGaussian
 from pyspark.mllib.stat.test import ChiSqTestResult
+from pyspark.mllib.stat.KernelDensity import KernelDensity
 
 __all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult",
-           "MultivariateGaussian"]
+           "MultivariateGaussian", "KernelDensity"]
diff --git a/python/run-tests b/python/run-tests
index 17dda3eadac0c..4468fdb3f267e 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -93,6 +93,7 @@ function run_mllib_tests() {
     run_test "pyspark.mllib.recommendation"
     run_test "pyspark.mllib.regression"
     run_test "pyspark.mllib.stat._statistics"
+    run_test "pyspark.mllib.stat.KernelDensity"
     run_test "pyspark.mllib.tree"
     run_test "pyspark.mllib.util"
     run_test "pyspark.mllib.tests"

From 18c4fcebbeecc3b26476a728bc9db62f5c0a6f87 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Sat, 6 Jun 2015 21:08:36 -0700
Subject: [PATCH 390/525] [SPARK-7169] [CORE] Allow metrics system to be
 configured through SparkConf.

Author: Marcelo Vanzin <vanzin@cloudera.com>
Author: Jacek Lewandowski <lewandowski.jacek@gmail.com>

Closes #6560 from vanzin/SPARK-7169 and squashes the following commits:

737266f [Marcelo Vanzin] Feedback.
702d5a3 [Marcelo Vanzin] Scalastyle.
ce66e7e [Marcelo Vanzin] Remove metrics config handling from SparkConf.
439938a [Jacek Lewandowski] SPARK-7169: Metrics can be additionally configured from Spark configuration
---
 .../apache/spark/metrics/MetricsConfig.scala  | 55 ++++++++-----
 .../apache/spark/metrics/MetricsSystem.scala  |  3 +-
 .../spark/metrics/MetricsConfigSuite.scala    | 82 ++++++++++++++++++-
 3 files changed, 115 insertions(+), 25 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala
index 8edf493780687..d7495551ad233 100644
--- a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala
@@ -23,10 +23,10 @@ import java.util.Properties
 import scala.collection.mutable
 import scala.util.matching.Regex
 
-import org.apache.spark.Logging
 import org.apache.spark.util.Utils
+import org.apache.spark.{Logging, SparkConf}
 
-private[spark] class MetricsConfig(val configFile: Option[String]) extends Logging {
+private[spark] class MetricsConfig(conf: SparkConf) extends Logging {
 
   private val DEFAULT_PREFIX = "*"
   private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r
@@ -46,23 +46,14 @@ private[spark] class MetricsConfig(val configFile: Option[String]) extends Loggi
     // Add default properties in case there's no properties file
     setDefaultProperties(properties)
 
-    // If spark.metrics.conf is not set, try to get file in class path
-    val isOpt: Option[InputStream] = configFile.map(new FileInputStream(_)).orElse {
-      try {
-        Option(Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME))
-      } catch {
-        case e: Exception =>
-          logError("Error loading default configuration file", e)
-          None
-      }
-    }
+    loadPropertiesFromFile(conf.getOption("spark.metrics.conf"))
 
-    isOpt.foreach { is =>
-      try {
-        properties.load(is)
-      } finally {
-        is.close()
-      }
+    // Also look for the properties in provided Spark configuration
+    val prefix = "spark.metrics.conf."
+    conf.getAll.foreach {
+      case (k, v) if k.startsWith(prefix) =>
+        properties.setProperty(k.substring(prefix.length()), v)
+      case _ =>
     }
 
     propertyCategories = subProperties(properties, INSTANCE_REGEX)
@@ -97,5 +88,31 @@ private[spark] class MetricsConfig(val configFile: Option[String]) extends Loggi
       case None => propertyCategories.getOrElse(DEFAULT_PREFIX, new Properties)
     }
   }
-}
 
+  /**
+   * Loads configuration from a config file. If no config file is provided, try to get file
+   * in class path.
+   */
+  private[this] def loadPropertiesFromFile(path: Option[String]): Unit = {
+    var is: InputStream = null
+    try {
+      is = path match {
+        case Some(f) => new FileInputStream(f)
+        case None => Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME)
+      }
+
+      if (is != null) {
+        properties.load(is)
+      }
+    } catch {
+      case e: Exception =>
+        val file = path.getOrElse(DEFAULT_METRICS_CONF_FILENAME)
+        logError(s"Error loading configuration file $file", e)
+    } finally {
+      if (is != null) {
+        is.close()
+      }
+    }
+  }
+
+}
diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
index 9150ad35712a1..ed5131c79fdc5 100644
--- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
@@ -70,8 +70,7 @@ private[spark] class MetricsSystem private (
     securityMgr: SecurityManager)
   extends Logging {
 
-  private[this] val confFile = conf.get("spark.metrics.conf", null)
-  private[this] val metricsConfig = new MetricsConfig(Option(confFile))
+  private[this] val metricsConfig = new MetricsConfig(conf)
 
   private val sinks = new mutable.ArrayBuffer[Sink]
   private val sources = new mutable.ArrayBuffer[Source]
diff --git a/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala b/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala
index a901a069d9bfe..41f2ff725a17b 100644
--- a/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala
+++ b/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.metrics
 
+import org.apache.spark.SparkConf
+
 import org.scalatest.BeforeAndAfter
 
 import org.apache.spark.SparkFunSuite
@@ -29,7 +31,9 @@ class MetricsConfigSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("MetricsConfig with default properties") {
-    val conf = new MetricsConfig(None)
+    val sparkConf = new SparkConf(loadDefaults = false)
+    sparkConf.set("spark.metrics.conf", "dummy-file")
+    val conf = new MetricsConfig(sparkConf)
     conf.initialize()
 
     assert(conf.properties.size() === 4)
@@ -42,8 +46,41 @@ class MetricsConfigSuite extends SparkFunSuite with BeforeAndAfter {
     assert(property.getProperty("sink.servlet.path") === "/metrics/json")
   }
 
-  test("MetricsConfig with properties set") {
-    val conf = new MetricsConfig(Option(filePath))
+  test("MetricsConfig with properties set from a file") {
+    val sparkConf = new SparkConf(loadDefaults = false)
+    sparkConf.set("spark.metrics.conf", filePath)
+    val conf = new MetricsConfig(sparkConf)
+    conf.initialize()
+
+    val masterProp = conf.getInstance("master")
+    assert(masterProp.size() === 5)
+    assert(masterProp.getProperty("sink.console.period") === "20")
+    assert(masterProp.getProperty("sink.console.unit") === "minutes")
+    assert(masterProp.getProperty("source.jvm.class") ===
+      "org.apache.spark.metrics.source.JvmSource")
+    assert(masterProp.getProperty("sink.servlet.class") ===
+      "org.apache.spark.metrics.sink.MetricsServlet")
+    assert(masterProp.getProperty("sink.servlet.path") === "/metrics/master/json")
+
+    val workerProp = conf.getInstance("worker")
+    assert(workerProp.size() === 5)
+    assert(workerProp.getProperty("sink.console.period") === "10")
+    assert(workerProp.getProperty("sink.console.unit") === "seconds")
+    assert(workerProp.getProperty("source.jvm.class") ===
+      "org.apache.spark.metrics.source.JvmSource")
+    assert(workerProp.getProperty("sink.servlet.class") ===
+      "org.apache.spark.metrics.sink.MetricsServlet")
+    assert(workerProp.getProperty("sink.servlet.path") === "/metrics/json")
+  }
+
+  test("MetricsConfig with properties set from a Spark configuration") {
+    val sparkConf = new SparkConf(loadDefaults = false)
+    setMetricsProperty(sparkConf, "*.sink.console.period", "10")
+    setMetricsProperty(sparkConf, "*.sink.console.unit", "seconds")
+    setMetricsProperty(sparkConf, "*.source.jvm.class", "org.apache.spark.metrics.source.JvmSource")
+    setMetricsProperty(sparkConf, "master.sink.console.period", "20")
+    setMetricsProperty(sparkConf, "master.sink.console.unit", "minutes")
+    val conf = new MetricsConfig(sparkConf)
     conf.initialize()
 
     val masterProp = conf.getInstance("master")
@@ -67,8 +104,40 @@ class MetricsConfigSuite extends SparkFunSuite with BeforeAndAfter {
     assert(workerProp.getProperty("sink.servlet.path") === "/metrics/json")
   }
 
+  test("MetricsConfig with properties set from a file and a Spark configuration") {
+    val sparkConf = new SparkConf(loadDefaults = false)
+    setMetricsProperty(sparkConf, "*.sink.console.period", "10")
+    setMetricsProperty(sparkConf, "*.sink.console.unit", "seconds")
+    setMetricsProperty(sparkConf, "*.source.jvm.class", "org.apache.spark.SomeOtherSource")
+    setMetricsProperty(sparkConf, "master.sink.console.period", "50")
+    setMetricsProperty(sparkConf, "master.sink.console.unit", "seconds")
+    sparkConf.set("spark.metrics.conf", filePath)
+    val conf = new MetricsConfig(sparkConf)
+    conf.initialize()
+
+    val masterProp = conf.getInstance("master")
+    assert(masterProp.size() === 5)
+    assert(masterProp.getProperty("sink.console.period") === "50")
+    assert(masterProp.getProperty("sink.console.unit") === "seconds")
+    assert(masterProp.getProperty("source.jvm.class") === "org.apache.spark.SomeOtherSource")
+    assert(masterProp.getProperty("sink.servlet.class") ===
+      "org.apache.spark.metrics.sink.MetricsServlet")
+    assert(masterProp.getProperty("sink.servlet.path") === "/metrics/master/json")
+
+    val workerProp = conf.getInstance("worker")
+    assert(workerProp.size() === 5)
+    assert(workerProp.getProperty("sink.console.period") === "10")
+    assert(workerProp.getProperty("sink.console.unit") === "seconds")
+    assert(workerProp.getProperty("source.jvm.class") === "org.apache.spark.SomeOtherSource")
+    assert(workerProp.getProperty("sink.servlet.class") ===
+      "org.apache.spark.metrics.sink.MetricsServlet")
+    assert(workerProp.getProperty("sink.servlet.path") === "/metrics/json")
+  }
+
   test("MetricsConfig with subProperties") {
-    val conf = new MetricsConfig(Option(filePath))
+    val sparkConf = new SparkConf(loadDefaults = false)
+    sparkConf.set("spark.metrics.conf", filePath)
+    val conf = new MetricsConfig(sparkConf)
     conf.initialize()
 
     val propCategories = conf.propertyCategories
@@ -90,4 +159,9 @@ class MetricsConfigSuite extends SparkFunSuite with BeforeAndAfter {
     val servletProps = sinkProps("servlet")
     assert(servletProps.size() === 2)
   }
+
+  private def setMetricsProperty(conf: SparkConf, name: String, value: String): Unit = {
+    conf.set(s"spark.metrics.conf.$name", value)
+  }
+
 }

From ed2cc3ee890694ca0c1fa0bbc7186c8b80da3fab Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Sat, 6 Jun 2015 21:09:56 -0700
Subject: [PATCH 391/525] [SPARK-8136] [YARN] Fix flakiness in
 YarnClusterSuite.

Instead of actually downloading the logs, just verify that the logs link is actually
a URL and is in the expected format.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #6680 from harishreedharan/simplify-am-log-tests and squashes the following commits:

3183aeb [Hari Shreedharan] Remove check for hostname which can fail on machines with several hostnames. Removed some unused imports.
50d69a7 [Hari Shreedharan] [SPARK-8136][YARN] Fix flakiness in YarnClusterSuite.
---
 .../spark/deploy/yarn/YarnClusterSuite.scala     | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
index bc42e12dfafd7..93d587d0cb36a 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
@@ -18,12 +18,12 @@
 package org.apache.spark.deploy.yarn
 
 import java.io.{File, FileOutputStream, OutputStreamWriter}
+import java.net.URL
 import java.util.Properties
 import java.util.concurrent.TimeUnit
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable
-import scala.io.Source
 
 import com.google.common.base.Charsets.UTF_8
 import com.google.common.io.ByteStreams
@@ -344,18 +344,20 @@ private object YarnClusterDriver extends Logging with Matchers {
       assert(info.logUrlMap.nonEmpty)
     }
 
-    // If we are running in yarn-cluster mode, verify that driver logs are downloadable.
+    // If we are running in yarn-cluster mode, verify that driver logs links and present and are
+    // in the expected format.
     if (conf.get("spark.master") == "yarn-cluster") {
       assert(listener.driverLogs.nonEmpty)
       val driverLogs = listener.driverLogs.get
       assert(driverLogs.size === 2)
       assert(driverLogs.containsKey("stderr"))
       assert(driverLogs.containsKey("stdout"))
-      val stderr = driverLogs("stderr") // YARN puts everything in stderr.
-      val lines = Source.fromURL(stderr).getLines()
-      // Look for a line that contains YarnClusterSchedulerBackend, since that is guaranteed in
-      // cluster mode.
-      assert(lines.exists(_.contains("YarnClusterSchedulerBackend")))
+      val urlStr = driverLogs("stderr")
+      // Ensure that this is a valid URL, else this will throw an exception
+      new URL(urlStr)
+      val containerId = YarnSparkHadoopUtil.get.getContainerId
+      val user = Utils.getCurrentUserName()
+      assert(urlStr.endsWith(s"/node/containerlogs/$containerId/$user/stderr?start=0"))
     }
   }
 

From 3285a51121397bfd2e62dbee8e1f0fa7c72512a7 Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Sat, 6 Jun 2015 21:13:26 -0700
Subject: [PATCH 392/525] =?UTF-8?q?[SPARK-7955]=20[CORE]=20Ensure=20execut?=
 =?UTF-8?q?ors=20with=20cached=20RDD=20blocks=20are=20not=20re=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…moved if dynamic allocation is enabled.

This is a work in progress. This patch ensures that an executor that has cached RDD blocks are not removed,
but makes no attempt to find another executor to remove. This is meant to get some feedback on the current
approach, and if it makes sense then I will look at choosing another executor to remove. No testing has been done either.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #6508 from harishreedharan/dymanic-caching and squashes the following commits:

dddf1eb [Hari Shreedharan] Minor configuration description update.
10130e2 [Hari Shreedharan] Fix compile issue.
5417b53 [Hari Shreedharan] Add documentation for new config. Remove block from cachedBlocks when it is dropped.
875916a [Hari Shreedharan] Make some code more readable.
39940ca [Hari Shreedharan] Handle the case where the executor has not yet registered.
90ad711 [Hari Shreedharan] Remove unused imports and unused methods.
063985c [Hari Shreedharan] Send correct message instead of recursively calling same method.
ec2fd7e [Hari Shreedharan] Add file missed in last commit
5d10fad [Hari Shreedharan] Update cached blocks status using local info, rather than doing an RPC.
193af4c [Hari Shreedharan] WIP. Use local state rather than via RPC.
ae932ff [Hari Shreedharan] Fix config param name.
272969d [Hari Shreedharan] Fix seconds to millis bug.
5a1993f [Hari Shreedharan] Add timeout for cache executors. Ignore broadcast blocks while checking if there are cached blocks.
57fefc2 [Hari Shreedharan] [SPARK-7955][Core] Ensure executors with cached RDD blocks are not removed if dynamic allocation is enabled.
---
 .../spark/ExecutorAllocationManager.scala     | 21 ++++++++++--
 .../spark/storage/BlockManagerMaster.scala    |  8 +++++
 .../storage/BlockManagerMasterEndpoint.scala  | 33 +++++++++++++++++--
 .../spark/storage/BlockManagerMessages.scala  |  3 +-
 docs/configuration.md                         |  9 +++++
 5 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
index 9939103bb0903..49329423dca76 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
@@ -101,6 +101,9 @@ private[spark] class ExecutorAllocationManager(
   private val executorIdleTimeoutS = conf.getTimeAsSeconds(
     "spark.dynamicAllocation.executorIdleTimeout", "60s")
 
+  private val cachedExecutorIdleTimeoutS = conf.getTimeAsSeconds(
+    "spark.dynamicAllocation.cachedExecutorIdleTimeout", s"${2 * executorIdleTimeoutS}s")
+
   // During testing, the methods to actually kill and add executors are mocked out
   private val testing = conf.getBoolean("spark.dynamicAllocation.testing", false)
 
@@ -459,9 +462,23 @@ private[spark] class ExecutorAllocationManager(
   private def onExecutorIdle(executorId: String): Unit = synchronized {
     if (executorIds.contains(executorId)) {
       if (!removeTimes.contains(executorId) && !executorsPendingToRemove.contains(executorId)) {
+        // Note that it is not necessary to query the executors since all the cached
+        // blocks we are concerned with are reported to the driver. Note that this
+        // does not include broadcast blocks.
+        val hasCachedBlocks = SparkEnv.get.blockManager.master.hasCachedBlocks(executorId)
+        val now = clock.getTimeMillis()
+        val timeout = {
+          if (hasCachedBlocks) {
+            // Use a different timeout if the executor has cached blocks.
+            now + cachedExecutorIdleTimeoutS * 1000
+          } else {
+            now + executorIdleTimeoutS * 1000
+          }
+        }
+        val realTimeout = if (timeout <= 0) Long.MaxValue else timeout // overflow
+        removeTimes(executorId) = realTimeout
         logDebug(s"Starting idle timer for $executorId because there are no more tasks " +
-          s"scheduled to run on the executor (to expire in $executorIdleTimeoutS seconds)")
-        removeTimes(executorId) = clock.getTimeMillis + executorIdleTimeoutS * 1000
+          s"scheduled to run on the executor (to expire in ${(realTimeout - now)/1000} seconds)")
       }
     } else {
       logWarning(s"Attempted to mark unknown executor $executorId idle")
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala
index abcad9438bf28..7cdae22b0e253 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala
@@ -202,6 +202,14 @@ class BlockManagerMaster(
     Await.result(future, timeout)
   }
 
+  /**
+   * Find out if the executor has cached blocks. This method does not consider broadcast blocks,
+   * since they are not reported the master.
+   */
+  def hasCachedBlocks(executorId: String): Boolean = {
+    driverEndpoint.askWithRetry[Boolean](HasCachedBlocks(executorId))
+  }
+
   /** Stop the driver endpoint, called only on the Spark driver node */
   def stop() {
     if (driverEndpoint != null && isDriver) {
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
index 2cd8c5297b741..68ed9096731c5 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
@@ -19,6 +19,7 @@ package org.apache.spark.storage
 
 import java.util.{HashMap => JHashMap}
 
+import scala.collection.immutable.HashSet
 import scala.collection.mutable
 import scala.collection.JavaConversions._
 import scala.concurrent.{ExecutionContext, Future}
@@ -112,6 +113,17 @@ class BlockManagerMasterEndpoint(
     case BlockManagerHeartbeat(blockManagerId) =>
       context.reply(heartbeatReceived(blockManagerId))
 
+    case HasCachedBlocks(executorId) =>
+      blockManagerIdByExecutor.get(executorId) match {
+        case Some(bm) =>
+          if (blockManagerInfo.contains(bm)) {
+            val bmInfo = blockManagerInfo(bm)
+            context.reply(bmInfo.cachedBlocks.nonEmpty)
+          } else {
+            context.reply(false)
+          }
+        case None => context.reply(false)
+      }
   }
 
   private def removeRdd(rddId: Int): Future[Seq[Int]] = {
@@ -418,6 +430,9 @@ private[spark] class BlockManagerInfo(
   // Mapping from block id to its status.
   private val _blocks = new JHashMap[BlockId, BlockStatus]
 
+  // Cached blocks held by this BlockManager. This does not include broadcast blocks.
+  private val _cachedBlocks = new mutable.HashSet[BlockId]
+
   def getStatus(blockId: BlockId): Option[BlockStatus] = Option(_blocks.get(blockId))
 
   def updateLastSeenMs() {
@@ -451,27 +466,35 @@ private[spark] class BlockManagerInfo(
        * and the diskSize here indicates the data size in or dropped to disk.
        * They can be both larger than 0, when a block is dropped from memory to disk.
        * Therefore, a safe way to set BlockStatus is to set its info in accurate modes. */
+      var blockStatus: BlockStatus = null
       if (storageLevel.useMemory) {
-        _blocks.put(blockId, BlockStatus(storageLevel, memSize, 0, 0))
+        blockStatus = BlockStatus(storageLevel, memSize, 0, 0)
+        _blocks.put(blockId, blockStatus)
         _remainingMem -= memSize
         logInfo("Added %s in memory on %s (size: %s, free: %s)".format(
           blockId, blockManagerId.hostPort, Utils.bytesToString(memSize),
           Utils.bytesToString(_remainingMem)))
       }
       if (storageLevel.useDisk) {
-        _blocks.put(blockId, BlockStatus(storageLevel, 0, diskSize, 0))
+        blockStatus = BlockStatus(storageLevel, 0, diskSize, 0)
+        _blocks.put(blockId, blockStatus)
         logInfo("Added %s on disk on %s (size: %s)".format(
           blockId, blockManagerId.hostPort, Utils.bytesToString(diskSize)))
       }
       if (storageLevel.useOffHeap) {
-        _blocks.put(blockId, BlockStatus(storageLevel, 0, 0, externalBlockStoreSize))
+        blockStatus = BlockStatus(storageLevel, 0, 0, externalBlockStoreSize)
+        _blocks.put(blockId, blockStatus)
         logInfo("Added %s on ExternalBlockStore on %s (size: %s)".format(
           blockId, blockManagerId.hostPort, Utils.bytesToString(externalBlockStoreSize)))
       }
+      if (!blockId.isBroadcast && blockStatus.isCached) {
+        _cachedBlocks += blockId
+      }
     } else if (_blocks.containsKey(blockId)) {
       // If isValid is not true, drop the block.
       val blockStatus: BlockStatus = _blocks.get(blockId)
       _blocks.remove(blockId)
+      _cachedBlocks -= blockId
       if (blockStatus.storageLevel.useMemory) {
         logInfo("Removed %s on %s in memory (size: %s, free: %s)".format(
           blockId, blockManagerId.hostPort, Utils.bytesToString(blockStatus.memSize),
@@ -494,6 +517,7 @@ private[spark] class BlockManagerInfo(
       _remainingMem += _blocks.get(blockId).memSize
       _blocks.remove(blockId)
     }
+    _cachedBlocks -= blockId
   }
 
   def remainingMem: Long = _remainingMem
@@ -502,6 +526,9 @@ private[spark] class BlockManagerInfo(
 
   def blocks: JHashMap[BlockId, BlockStatus] = _blocks
 
+  // This does not include broadcast blocks.
+  def cachedBlocks: collection.Set[BlockId] = _cachedBlocks
+
   override def toString: String = "BlockManagerInfo " + timeMs + " " + _remainingMem
 
   def clear() {
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
index 1683576067fe8..376e9eb48843d 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
@@ -42,7 +42,6 @@ private[spark] object BlockManagerMessages {
   case class RemoveBroadcast(broadcastId: Long, removeFromDriver: Boolean = true)
     extends ToBlockManagerSlave
 
-
   //////////////////////////////////////////////////////////////////////////////////
   // Messages from slaves to the master.
   //////////////////////////////////////////////////////////////////////////////////
@@ -108,4 +107,6 @@ private[spark] object BlockManagerMessages {
     extends ToBlockManagerMaster
 
   case class BlockManagerHeartbeat(blockManagerId: BlockManagerId) extends ToBlockManagerMaster
+
+  case class HasCachedBlocks(executorId: String) extends ToBlockManagerMaster
 }
diff --git a/docs/configuration.md b/docs/configuration.md
index 3a48da4592dd9..9667cebe0b87c 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1201,6 +1201,15 @@ Apart from these, the following properties are also available, and may be useful
     <a href="job-scheduling.html#resource-allocation-policy">description</a>.
   </td>
 </tr>
+<tr>
+  <td><code>spark.dynamicAllocation.cachedExecutorIdleTimeout</code></td>
+  <td>2 * executorIdleTimeout</td>
+  <td>
+    If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration,
+    the executor will be removed. For more details, see this
+    <a href="job-scheduling.html#resource-allocation-policy">description</a>.
+  </td>
+</tr>
 <tr>
   <td><code>spark.dynamicAllocation.initialExecutors</code></td>
   <td><code>spark.dynamicAllocation.minExecutors</code></td>

From 901a552c5e973262fddbf70ee2d4078c948bc668 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Sat, 6 Jun 2015 22:59:31 -0700
Subject: [PATCH 393/525] [SPARK-8004][SQL] Enclose column names by JDBC
 Dialect

JIRA: https://issues.apache.org/jira/browse/SPARK-8004

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6577 from viirya/enclose_jdbc_columns and squashes the following commits:

614606a [Liang-Chi Hsieh] For comment.
bc50182 [Liang-Chi Hsieh] Enclose column names by JDBC Dialect.
---
 .../scala/org/apache/spark/sql/jdbc/JDBCRDD.scala   |  4 +++-
 .../org/apache/spark/sql/jdbc/JdbcDialects.scala    | 13 +++++++++++++
 .../scala/org/apache/spark/sql/jdbc/JDBCSuite.scala | 11 +++++++++++
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 40b604d710dce..2930f7bb4cae1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -211,12 +211,14 @@ private[sql] object JDBCRDD extends Logging {
       requiredColumns: Array[String],
       filters: Array[Filter],
       parts: Array[Partition]): RDD[Row] = {
+    val dialect = JdbcDialects.get(url)
+    val enclosedColumns = requiredColumns.map(dialect.columnEnclosing(_))
     new JDBCRDD(
       sc,
       getConnector(driver, url, properties),
       pruneSchema(schema, requiredColumns),
       fqTable,
-      requiredColumns,
+      enclosedColumns,
       filters,
       parts,
       properties)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
index 6a169e106b968..04052f80f5e78 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
@@ -80,6 +80,15 @@ abstract class JdbcDialect {
    * @return The new JdbcType if there is an override for this DataType
    */
   def getJDBCType(dt: DataType): Option[JdbcType] = None
+
+  /**
+   * Enclose column name
+   * @param colName The coulmn name
+   * @return Enclosed column name
+   */
+  def columnEnclosing(colName: String): String = {
+    s""""$colName""""
+  }
 }
 
 /**
@@ -208,4 +217,8 @@ case object MySQLDialect extends JdbcDialect {
       Some(BooleanType)
     } else None
   }
+
+  override def columnEnclosing(colName: String): String = {
+    s"`$colName`"
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 7931854db27c1..a228543953536 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -410,6 +410,17 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
     assert(JdbcDialects.get("test.invalid") == NoopDialect)
   }
 
+  test("Enclosing column names by jdbc dialect") {
+    val MySQL = JdbcDialects.get("jdbc:mysql://127.0.0.1/db")
+    val Postgres = JdbcDialects.get("jdbc:postgresql://127.0.0.1/db")
+
+    val columns = Seq("abc", "key")
+    val MySQLColumns = columns.map(MySQL.columnEnclosing(_))
+    val PostgresColumns = columns.map(Postgres.columnEnclosing(_))
+    assert(MySQLColumns === Seq("`abc`", "`key`"))
+    assert(PostgresColumns === Seq(""""abc"""", """"key""""))
+  }
+
   test("Dialect unregister") {
     JdbcDialects.registerDialect(testH2Dialect)
     JdbcDialects.unregisterDialect(testH2Dialect)

From 081db9479abc559b26d115298fbcdc109858cad3 Mon Sep 17 00:00:00 2001
From: 979969786 <q79969786@gmail.com>
Date: Sat, 6 Jun 2015 23:15:27 -0700
Subject: [PATCH 394/525] [SPARK-8145] [WEBUI] Trigger a double click on the
 span to show full job description.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using the Spark SQL, Jobs tab and Stages tab display only part of SQL. I change it to  display full SQL by double-click on the description span

before：
![before](https://cloud.githubusercontent.com/assets/5399861/8022257/9f8e0a22-0cf8-11e5-98c8-da4d7a615e7e.png)

after double click on the description span：
![after](https://cloud.githubusercontent.com/assets/5399861/8022261/dac08d4a-0cf8-11e5-8fe7-74c96c6ce933.png)

Author: 979969786 <q79969786@gmail.com>

Closes #6646 from 979969786/master and squashes the following commits:

b5ba20e [979969786] Trigger a double click on the span to show full job description.
---
 .../org/apache/spark/ui/static/additional-metrics.js      | 5 +++++
 .../main/resources/org/apache/spark/ui/static/webui.css   | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js b/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js
index 013db8df9b363..0b450dc76bc38 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js
@@ -50,4 +50,9 @@ $(function() {
     $("span.additional-metric-title").click(function() {
         $(this).parent().find('input[type="checkbox"]').trigger('click');
     });
+
+    // Trigger a double click on the span to show full job description.
+    $(".description-input").dblclick(function() {
+        $(this).removeClass("description-input").addClass("description-input-full");
+    });
 });
diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css
index e7c1d475d4e52..b1cef47042247 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/webui.css
+++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css
@@ -135,6 +135,14 @@ pre {
   display: block;
 }
 
+.description-input-full {
+  overflow: hidden;
+  text-overflow: ellipsis;
+  width: 100%;
+  white-space: normal;
+  display: block;
+}
+
 .stacktrace-details {
   max-height: 300px;
   overflow-y: auto;

From 26d07f1ece4174788b0bcdc338a14d0bbc0e3602 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Sun, 7 Jun 2015 15:33:48 +0800
Subject: [PATCH 395/525] [SPARK-8141] [SQL] Precompute datatypes for partition
 columns and reuse it

JIRA: https://issues.apache.org/jira/browse/SPARK-8141

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6687 from viirya/reuse_partition_column_types and squashes the following commits:

dab0688 [Liang-Chi Hsieh] Reuse partitionColumnTypes.
---
 .../main/scala/org/apache/spark/sql/sources/interfaces.scala   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index f5bd2d2941ca0..25887ba9a15b0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -435,8 +435,9 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
           // partition values.
           userDefinedPartitionColumns.map { partitionSchema =>
             val spec = discoverPartitions()
+            val partitionColumnTypes = spec.partitionColumns.map(_.dataType)
             val castedPartitions = spec.partitions.map { case p @ Partition(values, path) =>
-              val literals = values.toSeq.zip(spec.partitionColumns.map(_.dataType)).map {
+              val literals = values.toSeq.zip(partitionColumnTypes).map {
                 case (value, dataType) => Literal.create(value, dataType)
               }
               val castedValues = partitionSchema.zip(literals).map { case (field, literal) =>

From 0ac47083f7ef5fca9847bca2f0490719e1ccf50a Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 7 Jun 2015 01:21:02 -0700
Subject: [PATCH 396/525] [SPARK-8146] DataFrame Python API: Alias replace in
 df.na

Author: Reynold Xin <rxin@databricks.com>

Closes #6688 from rxin/df-alias-replace and squashes the following commits:

774c19c [Reynold Xin] [SPARK-8146] DataFrame Python API: Alias replace in DataFrameNaFunctions.
---
 python/pyspark/sql/dataframe.py | 47 +++++++++++++++------------------
 python/pyspark/sql/window.py    |  1 -
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 902504df5b11b..2d8c59518b35a 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -909,8 +909,7 @@ def dropDuplicates(self, subset=None):
     @since("1.3.1")
     def dropna(self, how='any', thresh=None, subset=None):
         """Returns a new :class:`DataFrame` omitting rows with null values.
-
-        This is an alias for ``na.drop()``.
+        :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.
 
         :param how: 'any' or 'all'.
             If 'any', drop a row if it contains any nulls.
@@ -920,13 +919,6 @@ def dropna(self, how='any', thresh=None, subset=None):
             This overwrites the `how` parameter.
         :param subset: optional list of column names to consider.
 
-        >>> df4.dropna().show()
-        +---+------+-----+
-        |age|height| name|
-        +---+------+-----+
-        | 10|    80|Alice|
-        +---+------+-----+
-
         >>> df4.na.drop().show()
         +---+------+-----+
         |age|height| name|
@@ -952,6 +944,7 @@ def dropna(self, how='any', thresh=None, subset=None):
     @since("1.3.1")
     def fillna(self, value, subset=None):
         """Replace null values, alias for ``na.fill()``.
+        :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.
 
         :param value: int, long, float, string, or dict.
             Value to replace null values with.
@@ -963,7 +956,7 @@ def fillna(self, value, subset=None):
             For example, if `value` is a string, and subset contains a non-string column,
             then the non-string column is simply ignored.
 
-        >>> df4.fillna(50).show()
+        >>> df4.na.fill(50).show()
         +---+------+-----+
         |age|height| name|
         +---+------+-----+
@@ -973,16 +966,6 @@ def fillna(self, value, subset=None):
         | 50|    50| null|
         +---+------+-----+
 
-        >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
-        +---+------+-------+
-        |age|height|   name|
-        +---+------+-------+
-        | 10|    80|  Alice|
-        |  5|  null|    Bob|
-        | 50|  null|    Tom|
-        | 50|  null|unknown|
-        +---+------+-------+
-
         >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
         +---+------+-------+
         |age|height|   name|
@@ -1014,6 +997,8 @@ def fillna(self, value, subset=None):
     @since(1.4)
     def replace(self, to_replace, value, subset=None):
         """Returns a new :class:`DataFrame` replacing a value with another value.
+        :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are
+        aliases of each other.
 
         :param to_replace: int, long, float, string, or list.
             Value to be replaced.
@@ -1029,7 +1014,7 @@ def replace(self, to_replace, value, subset=None):
             For example, if `value` is a string, and subset contains a non-string column,
             then the non-string column is simply ignored.
 
-        >>> df4.replace(10, 20).show()
+        >>> df4.na.replace(10, 20).show()
         +----+------+-----+
         | age|height| name|
         +----+------+-----+
@@ -1039,7 +1024,7 @@ def replace(self, to_replace, value, subset=None):
         |null|  null| null|
         +----+------+-----+
 
-        >>> df4.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+        >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
         +----+------+----+
         | age|height|name|
         +----+------+----+
@@ -1090,9 +1075,9 @@ def replace(self, to_replace, value, subset=None):
     @since(1.4)
     def corr(self, col1, col2, method=None):
         """
-        Calculates the correlation of two columns of a DataFrame as a double value. Currently only
-        supports the Pearson Correlation Coefficient.
-        :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases.
+        Calculates the correlation of two columns of a DataFrame as a double value.
+        Currently only supports the Pearson Correlation Coefficient.
+        :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.
 
         :param col1: The name of the first column
         :param col2: The name of the second column
@@ -1241,7 +1226,10 @@ def toPandas(self):
         import pandas as pd
         return pd.DataFrame.from_records(self.collect(), columns=self.columns)
 
+    ##########################################################################################
     # Pandas compatibility
+    ##########################################################################################
+
     groupby = groupBy
     drop_duplicates = dropDuplicates
 
@@ -1261,6 +1249,8 @@ def _to_scala_map(sc, jm):
 
 class DataFrameNaFunctions(object):
     """Functionality for working with missing data in :class:`DataFrame`.
+
+    .. versionadded:: 1.4
     """
 
     def __init__(self, df):
@@ -1276,9 +1266,16 @@ def fill(self, value, subset=None):
 
     fill.__doc__ = DataFrame.fillna.__doc__
 
+    def replace(self, to_replace, value, subset=None):
+        return self.df.replace(to_replace, value, subset)
+
+    replace.__doc__ = DataFrame.replace.__doc__
+
 
 class DataFrameStatFunctions(object):
     """Functionality for statistic functions with :class:`DataFrame`.
+
+    .. versionadded:: 1.4
     """
 
     def __init__(self, df):
diff --git a/python/pyspark/sql/window.py b/python/pyspark/sql/window.py
index 0a0e006bdf83a..c74745c726a0c 100644
--- a/python/pyspark/sql/window.py
+++ b/python/pyspark/sql/window.py
@@ -32,7 +32,6 @@ def _to_java_cols(cols):
 
 
 class Window(object):
-
     """
     Utility functions for defining window in DataFrames.
 

From 8c321d66d79716f03b6f9c8ad5cedd75e8bfe631 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sun, 7 Jun 2015 16:59:55 +0800
Subject: [PATCH 397/525] [SPARK-8118] [SQL] Mutes noisy Parquet log output
 reappeared after upgrading Parquet to 1.7.0

Author: Cheng Lian <lian@databricks.com>

Closes #6670 from liancheng/spark-8118 and squashes the following commits:

b6e85a6 [Cheng Lian] Suppresses unnecesary ParquetRecordReader log message (PARQUET-220)
385603c [Cheng Lian] Mutes noisy Parquet log output reappeared after upgrading Parquet to 1.7.0
---
 .../spark/sql/parquet/ParquetRelation.scala   | 35 +++++++++++--------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
index 09088ee91106c..704cf56f38265 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -18,20 +18,21 @@
 package org.apache.spark.sql.parquet
 
 import java.io.IOException
-import java.util.logging.Level
+import java.util.logging.{Level, Logger => JLogger}
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.fs.permission.FsAction
-import org.apache.spark.sql.types.{StructType, DataType}
-import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat}
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat, ParquetRecordReader}
 import org.apache.parquet.schema.MessageType
+import org.apache.parquet.{Log => ParquetLog}
 
-import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, UnresolvedException}
-import org.apache.spark.sql.catalyst.expressions.{AttributeMap, Attribute}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, SQLContext}
 
 /**
  * Relation that consists of data stored in a Parquet columnar format.
@@ -94,33 +95,37 @@ private[sql] case class ParquetRelation(
 private[sql] object ParquetRelation {
 
   def enableLogForwarding() {
-    // Note: the parquet.Log class has a static initializer that
-    // sets the java.util.logging Logger for "parquet". This
+    // Note: the org.apache.parquet.Log class has a static initializer that
+    // sets the java.util.logging Logger for "org.apache.parquet". This
     // checks first to see if there's any handlers already set
     // and if not it creates them. If this method executes prior
     // to that class being loaded then:
     //  1) there's no handlers installed so there's none to
     // remove. But when it IS finally loaded the desired affect
     // of removing them is circumvented.
-    //  2) The parquet.Log static initializer calls setUseParentHanders(false)
+    //  2) The parquet.Log static initializer calls setUseParentHandlers(false)
     // undoing the attempt to override the logging here.
     //
     // Therefore we need to force the class to be loaded.
     // This should really be resolved by Parquet.
-    Class.forName(classOf[org.apache.parquet.Log].getName)
+    Class.forName(classOf[ParquetLog].getName)
 
     // Note: Logger.getLogger("parquet") has a default logger
     // that appends to Console which needs to be cleared.
-    val parquetLogger = java.util.logging.Logger.getLogger("parquet")
+    val parquetLogger = JLogger.getLogger(classOf[ParquetLog].getPackage.getName)
     parquetLogger.getHandlers.foreach(parquetLogger.removeHandler)
-    // TODO(witgo): Need to set the log level ?
-    // if(parquetLogger.getLevel != null) parquetLogger.setLevel(null)
-    if (!parquetLogger.getUseParentHandlers) parquetLogger.setUseParentHandlers(true)
+    parquetLogger.setUseParentHandlers(true)
 
-    // Disables WARN log message in ParquetOutputCommitter.
+    // Disables a WARN log message in ParquetOutputCommitter.  We first ensure that
+    // ParquetOutputCommitter is loaded and the static LOG field gets initialized.
     // See https://issues.apache.org/jira/browse/SPARK-5968 for details
     Class.forName(classOf[ParquetOutputCommitter].getName)
-    java.util.logging.Logger.getLogger(classOf[ParquetOutputCommitter].getName).setLevel(Level.OFF)
+    JLogger.getLogger(classOf[ParquetOutputCommitter].getName).setLevel(Level.OFF)
+
+    // Similar as above, disables a unnecessary WARN log message in ParquetRecordReader.
+    // See https://issues.apache.org/jira/browse/PARQUET-220 for details
+    Class.forName(classOf[ParquetRecordReader[_]].getName)
+    JLogger.getLogger(classOf[ParquetRecordReader[_]].getName).setLevel(Level.OFF)
   }
 
   // The element type for the RDDs that this relation maps to.

From ca8dafcc9fa661f05da9c98104d987716aa5f5eb Mon Sep 17 00:00:00 2001
From: Konstantin Shaposhnikov <Konstantin.Shaposhnikov@sc.com>
Date: Sun, 7 Jun 2015 13:41:00 +0100
Subject: [PATCH 398/525] [SPARK-7042] [BUILD] use the standard akka artifacts
 with hadoop-2.x

Both akka 2.3.x and hadoop-2.x use protobuf 2.5 so only hadoop-1 build needs
custom 2.3.4-spark akka version that shades protobuf-2.5

This change also updates akka version (for hadoop-2.x profiles only) to the
latest 2.3.11 as akka-zeromq_2.11 is not available for akka 2.3.4.

This partially fixes SPARK-7042 (for hadoop-2.x builds)

Author: Konstantin Shaposhnikov <Konstantin.Shaposhnikov@sc.com>

Closes #6492 from kostya-sh/SPARK-7042 and squashes the following commits:

dc195b0 [Konstantin Shaposhnikov] [SPARK-7042] [BUILD] use the standard akka artifacts with hadoop-2.x
---
 pom.xml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pom.xml b/pom.xml
index e28d4b9fc2b17..e65448e4b2325 100644
--- a/pom.xml
+++ b/pom.xml
@@ -114,8 +114,8 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-    <akka.group>org.spark-project.akka</akka.group>
-    <akka.version>2.3.4-spark</akka.version>
+    <akka.group>com.typesafe.akka</akka.group>
+    <akka.version>2.3.11</akka.version>
     <java.version>1.6</java.version>
     <sbt.project.name>spark</sbt.project.name>
     <mesos.version>0.21.1</mesos.version>
@@ -1670,6 +1670,8 @@
         <hbase.version>0.98.7-hadoop1</hbase.version>
         <avro.mapred.classifier>hadoop1</avro.mapred.classifier>
         <codehaus.jackson.version>1.8.8</codehaus.jackson.version>
+        <akka.group>org.spark-project.akka</akka.group>
+        <akka.version>2.3.4-spark</akka.version>
       </properties>
     </profile>
 

From 835f1380d95a345208b682492f0735155e61a824 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Sun, 7 Jun 2015 15:30:37 +0100
Subject: [PATCH 399/525] [DOC] [TYPO] Fix typo in standalone deploy scripts
 description

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #6691 from yijieshen/patch-2 and squashes the following commits:

b40a4b0 [Yijie Shen] [DOC][TYPO] Fix typo in standalone deploy scripts description
---
 docs/spark-standalone.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index 0eed9adacf123..12d7d6e159bea 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -77,7 +77,7 @@ Note, the master machine accesses each of the worker machines via ssh. By defaul
 If you do not have a password-less setup, you can set the environment variable SPARK_SSH_FOREGROUND and serially provide a password for each worker.
 
 
-Once you've set up this file, you can launch or stop your cluster with the following shell scripts, based on Hadoop's deploy scripts, and available in `SPARK_HOME/bin`:
+Once you've set up this file, you can launch or stop your cluster with the following shell scripts, based on Hadoop's deploy scripts, and available in `SPARK_HOME/sbin`:
 
 - `sbin/start-master.sh` - Starts a master instance on the machine the script is executed on.
 - `sbin/start-slaves.sh` - Starts a slave instance on each machine specified in the `conf/slaves` file.

From d6d601a07b17069d41eb4114bd5f7ab2c106720d Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 7 Jun 2015 10:52:02 -0700
Subject: [PATCH 400/525] [SPARK-8004][SQL] Quote identifier in JDBC data
 source.

This is a follow-up patch to #6577 to replace columnEnclosing to quoteIdentifier.

I also did some minor cleanup to the JdbcDialect file.

Author: Reynold Xin <rxin@databricks.com>

Closes #6689 from rxin/jdbc-quote and squashes the following commits:

bad365f [Reynold Xin] Fixed test compilation...
e39e14e [Reynold Xin] Fixed compilation.
db9a8e0 [Reynold Xin] [SPARK-8004][SQL] Quote identifier in JDBC data source.
---
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   |  4 +--
 .../apache/spark/sql/jdbc/JdbcDialects.scala  | 34 +++++++++----------
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala |  6 ++--
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 2930f7bb4cae1..db68b9c86db1b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -212,13 +212,13 @@ private[sql] object JDBCRDD extends Logging {
       filters: Array[Filter],
       parts: Array[Partition]): RDD[Row] = {
     val dialect = JdbcDialects.get(url)
-    val enclosedColumns = requiredColumns.map(dialect.columnEnclosing(_))
+    val quotedColumns = requiredColumns.map(colName => dialect.quoteIdentifier(colName))
     new JDBCRDD(
       sc,
       getConnector(driver, url, properties),
       pruneSchema(schema, requiredColumns),
       fqTable,
-      enclosedColumns,
+      quotedColumns,
       filters,
       parts,
       properties)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
index 04052f80f5e78..8849fc2f1f0ef 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
@@ -17,11 +17,11 @@
 
 package org.apache.spark.sql.jdbc
 
+import java.sql.Types
+
 import org.apache.spark.sql.types._
 import org.apache.spark.annotation.DeveloperApi
 
-import java.sql.Types
-
 /**
  * :: DeveloperApi ::
  * A database type definition coupled with the jdbc type needed to send null
@@ -82,11 +82,10 @@ abstract class JdbcDialect {
   def getJDBCType(dt: DataType): Option[JdbcType] = None
 
   /**
-   * Enclose column name
-   * @param colName The coulmn name
-   * @return Enclosed column name
+   * Quotes the identifier. This is used to put quotes around the identifier in case the column
+   * name is a reserved keyword, or in case it contains characters that require quotes (e.g. space).
    */
-  def columnEnclosing(colName: String): String = {
+  def quoteIdentifier(colName: String): String = {
     s""""$colName""""
   }
 }
@@ -150,18 +149,19 @@ object JdbcDialects {
 @DeveloperApi
 class AggregatedDialect(dialects: List[JdbcDialect]) extends JdbcDialect {
 
-  require(!dialects.isEmpty)
+  require(dialects.nonEmpty)
 
-  def canHandle(url : String): Boolean =
+  override def canHandle(url : String): Boolean =
     dialects.map(_.canHandle(url)).reduce(_ && _)
 
   override def getCatalystType(
-      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
-    dialects.map(_.getCatalystType(sqlType, typeName, size, md)).flatten.headOption
-
-  override def getJDBCType(dt: DataType): Option[JdbcType] =
-    dialects.map(_.getJDBCType(dt)).flatten.headOption
+      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
+    dialects.flatMap(_.getCatalystType(sqlType, typeName, size, md)).headOption
+  }
 
+  override def getJDBCType(dt: DataType): Option[JdbcType] = {
+    dialects.flatMap(_.getJDBCType(dt)).headOption
+  }
 }
 
 /**
@@ -170,7 +170,7 @@ class AggregatedDialect(dialects: List[JdbcDialect]) extends JdbcDialect {
  */
 @DeveloperApi
 case object NoopDialect extends JdbcDialect {
-  def canHandle(url : String): Boolean = true
+  override def canHandle(url : String): Boolean = true
 }
 
 /**
@@ -179,7 +179,7 @@ case object NoopDialect extends JdbcDialect {
  */
 @DeveloperApi
 case object PostgresDialect extends JdbcDialect {
-  def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")
+  override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")
   override def getCatalystType(
       sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
     if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
@@ -205,7 +205,7 @@ case object PostgresDialect extends JdbcDialect {
  */
 @DeveloperApi
 case object MySQLDialect extends JdbcDialect {
-  def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")
+  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")
   override def getCatalystType(
       sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
     if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
@@ -218,7 +218,7 @@ case object MySQLDialect extends JdbcDialect {
     } else None
   }
 
-  override def columnEnclosing(colName: String): String = {
+  override def quoteIdentifier(colName: String): String = {
     s"`$colName`"
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index a228543953536..49d348c3ed21b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -410,13 +410,13 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
     assert(JdbcDialects.get("test.invalid") == NoopDialect)
   }
 
-  test("Enclosing column names by jdbc dialect") {
+  test("quote column names by jdbc dialect") {
     val MySQL = JdbcDialects.get("jdbc:mysql://127.0.0.1/db")
     val Postgres = JdbcDialects.get("jdbc:postgresql://127.0.0.1/db")
 
     val columns = Seq("abc", "key")
-    val MySQLColumns = columns.map(MySQL.columnEnclosing(_))
-    val PostgresColumns = columns.map(Postgres.columnEnclosing(_))
+    val MySQLColumns = columns.map(MySQL.quoteIdentifier(_))
+    val PostgresColumns = columns.map(Postgres.quoteIdentifier(_))
     assert(MySQLColumns === Seq("`abc`", "`key`"))
     assert(PostgresColumns === Seq(""""abc"""", """"key""""))
   }

From db81b9d89f62f18a3c4c2d9bced8486bfdea54a2 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sun, 7 Jun 2015 11:07:19 -0700
Subject: [PATCH 401/525] [SPARK-7952][SQL] use internal Decimal instead of
 java.math.BigDecimal

This PR fixes a bug introduced in https://github.com/apache/spark/pull/6505.
Decimal literal's value is not `java.math.BigDecimal`, but Spark SQL internal type: `Decimal`.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6574 from cloud-fan/fix and squashes the following commits:

b0e3549 [Wenchen Fan] rename to BooleanEquality
1987b37 [Wenchen Fan] use Decimal instead of java.math.BigDecimal
f93c420 [Wenchen Fan] compare literal
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 40 +++++++++----------
 .../analysis/HiveTypeCoercionSuite.scala      | 24 ++++++++++-
 2 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 9b8a08a88dcb0..a42ffce0d26fa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -87,7 +87,7 @@ trait HiveTypeCoercion {
     WidenTypes ::
     PromoteStrings ::
     DecimalPrecision ::
-    BooleanEqualization ::
+    BooleanEquality ::
     StringToIntegralCasts ::
     FunctionArgumentConversion ::
     CaseWhenCoercion ::
@@ -479,9 +479,9 @@ trait HiveTypeCoercion {
   /**
    * Changes numeric values to booleans so that expressions like true = 1 can be evaluated.
    */
-  object BooleanEqualization extends Rule[LogicalPlan] {
-    private val trueValues = Seq(1.toByte, 1.toShort, 1, 1L, new java.math.BigDecimal(1))
-    private val falseValues = Seq(0.toByte, 0.toShort, 0, 0L, new java.math.BigDecimal(0))
+  object BooleanEquality extends Rule[LogicalPlan] {
+    private val trueValues = Seq(1.toByte, 1.toShort, 1, 1L, Decimal(1))
+    private val falseValues = Seq(0.toByte, 0.toShort, 0, 0L, Decimal(0))
 
     private def buildCaseKeyWhen(booleanExpr: Expression, numericExpr: Expression) = {
       CaseKeyWhen(numericExpr, Seq(
@@ -512,22 +512,22 @@ trait HiveTypeCoercion {
       // all other cases are considered as false.
 
       // We may simplify the expression if one side is literal numeric values
-      case EqualTo(left @ BooleanType(), Literal(value, _: NumericType))
-        if trueValues.contains(value) => left
-      case EqualTo(left @ BooleanType(), Literal(value, _: NumericType))
-        if falseValues.contains(value) => Not(left)
-      case EqualTo(Literal(value, _: NumericType), right @ BooleanType())
-        if trueValues.contains(value) => right
-      case EqualTo(Literal(value, _: NumericType), right @ BooleanType())
-        if falseValues.contains(value) => Not(right)
-      case EqualNullSafe(left @ BooleanType(), Literal(value, _: NumericType))
-        if trueValues.contains(value) => And(IsNotNull(left), left)
-      case EqualNullSafe(left @ BooleanType(), Literal(value, _: NumericType))
-        if falseValues.contains(value) => And(IsNotNull(left), Not(left))
-      case EqualNullSafe(Literal(value, _: NumericType), right @ BooleanType())
-        if trueValues.contains(value) => And(IsNotNull(right), right)
-      case EqualNullSafe(Literal(value, _: NumericType), right @ BooleanType())
-        if falseValues.contains(value) => And(IsNotNull(right), Not(right))
+      case EqualTo(bool @ BooleanType(), Literal(value, _: NumericType))
+        if trueValues.contains(value) => bool
+      case EqualTo(bool @ BooleanType(), Literal(value, _: NumericType))
+        if falseValues.contains(value) => Not(bool)
+      case EqualTo(Literal(value, _: NumericType), bool @ BooleanType())
+        if trueValues.contains(value) => bool
+      case EqualTo(Literal(value, _: NumericType), bool @ BooleanType())
+        if falseValues.contains(value) => Not(bool)
+      case EqualNullSafe(bool @ BooleanType(), Literal(value, _: NumericType))
+        if trueValues.contains(value) => And(IsNotNull(bool), bool)
+      case EqualNullSafe(bool @ BooleanType(), Literal(value, _: NumericType))
+        if falseValues.contains(value) => And(IsNotNull(bool), Not(bool))
+      case EqualNullSafe(Literal(value, _: NumericType), bool @ BooleanType())
+        if trueValues.contains(value) => And(IsNotNull(bool), bool)
+      case EqualNullSafe(Literal(value, _: NumericType), bool @ BooleanType())
+        if falseValues.contains(value) => And(IsNotNull(bool), Not(bool))
 
       case EqualTo(left @ BooleanType(), right @ NumericType()) =>
         transform(left , right)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 0df446636ea89..9977f7af00f6b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -147,7 +147,8 @@ class HiveTypeCoercionSuite extends PlanTest {
   }
 
   test("type coercion simplification for equal to") {
-    val be = new HiveTypeCoercion {}.BooleanEqualization
+    val be = new HiveTypeCoercion {}.BooleanEquality
+
     ruleTest(be,
       EqualTo(Literal(true), Literal(1)),
       Literal(true)
@@ -164,5 +165,26 @@ class HiveTypeCoercionSuite extends PlanTest {
       EqualNullSafe(Literal(true), Literal(0)),
       And(IsNotNull(Literal(true)), Not(Literal(true)))
     )
+
+    ruleTest(be,
+      EqualTo(Literal(true), Literal(1L)),
+      Literal(true)
+    )
+    ruleTest(be,
+      EqualTo(Literal(new java.math.BigDecimal(1)), Literal(true)),
+      Literal(true)
+    )
+    ruleTest(be,
+      EqualTo(Literal(BigDecimal(0)), Literal(true)),
+      Not(Literal(true))
+    )
+    ruleTest(be,
+      EqualTo(Literal(Decimal(1)), Literal(true)),
+      Literal(true)
+    )
+    ruleTest(be,
+      EqualTo(Literal.create(Decimal(1), DecimalType(8, 0)), Literal(true)),
+      Literal(true)
+    )
   }
 }

From e84815dc333a69368a48e0152f02934980768a14 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Sun, 7 Jun 2015 20:18:13 +0100
Subject: [PATCH 402/525] [SPARK-7733] [CORE] [BUILD] Update build, code to use
 Java 7 for 1.5.0+

Update build to use Java 7, and remove some comments and special-case support for Java 6.

Author: Sean Owen <sowen@cloudera.com>

Closes #6265 from srowen/SPARK-7733 and squashes the following commits:

59bda4e [Sean Owen] Update build to use Java 7, and remove some comments and special-case support for Java 6
---
 bin/spark-class                                | 18 ------------------
 .../spark/util/MutableURLClassLoader.scala     |  4 ++--
 .../scala/org/apache/spark/util/Utils.scala    |  3 +--
 .../spark/util/collection/SorterSuite.scala    |  3 ---
 docs/building-spark.md                         |  6 +-----
 docs/index.md                                  |  2 +-
 docs/programming-guide.md                      |  2 +-
 make-distribution.sh                           | 16 ----------------
 pom.xml                                        |  2 +-
 .../apache/spark/unsafe/PlatformDependent.java |  3 +--
 10 files changed, 8 insertions(+), 51 deletions(-)

diff --git a/bin/spark-class b/bin/spark-class
index 7bb1afe4b44f5..2b59e5df5736f 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -58,24 +58,6 @@ fi
 
 SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"
 
-# Verify that versions of java used to build the jars and run Spark are compatible
-if [ -n "$JAVA_HOME" ]; then
-  JAR_CMD="$JAVA_HOME/bin/jar"
-else
-  JAR_CMD="jar"
-fi
-
-if [ $(command -v "$JAR_CMD") ] ; then
-  jar_error_check=$("$JAR_CMD" -tf "$SPARK_ASSEMBLY_JAR" nonexistent/class/path 2>&1)
-  if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
-    echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2
-    echo "This is likely because Spark was compiled with Java 7 and run " 1>&2
-    echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2
-    echo "or build Spark with Java 6." 1>&2
-    exit 1
-  fi
-fi
-
 LAUNCH_CLASSPATH="$SPARK_ASSEMBLY_JAR"
 
 # Add the launcher build dir to the classpath if requested.
diff --git a/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala b/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala
index 1e0ba5c28754a..169489df6c1ea 100644
--- a/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala
+++ b/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala
@@ -52,8 +52,8 @@ private[spark] class ChildFirstURLClassLoader(urls: Array[URL], parent: ClassLoa
    * Used to implement fine-grained class loading locks similar to what is done by Java 7. This
    * prevents deadlock issues when using non-hierarchical class loaders.
    *
-   * Note that due to Java 6 compatibility (and some issues with implementing class loaders in
-   * Scala), Java 7's `ClassLoader.registerAsParallelCapable` method is not called.
+   * Note that due to some issues with implementing class loaders in
+   * Scala, Java 7's `ClassLoader.registerAsParallelCapable` method is not called.
    */
   private val locks = new ConcurrentHashMap[String, Object]()
 
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 5f132410540fd..153ece6224a6d 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -1295,8 +1295,7 @@ private[spark] object Utils extends Logging {
       } catch {
         case t: Throwable =>
           if (originalThrowable != null) {
-            // We could do originalThrowable.addSuppressed(t), but it's
-            // not available in JDK 1.6.
+            originalThrowable.addSuppressed(t)
             logWarning(s"Suppressing exception in finally: " + t.getMessage, t)
             throw originalThrowable
           } else {
diff --git a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
index 72fd6daba8de0..b2f5d9009ee5d 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
@@ -103,9 +103,6 @@ class SorterSuite extends SparkFunSuite {
    * has the keys and values alternating. The basic Java sorts work only on the keys, so the
    * real Java solution is to make Tuple2s to store the keys and values and sort an array of
    * those, while the Sorter approach can work directly on the input data format.
-   *
-   * Note that the Java implementation varies tremendously between Java 6 and Java 7, when
-   * the Java sort changed from merge sort to TimSort.
    */
   ignore("Sorter benchmark for key-value pairs") {
     val numElements = 25000000 // 25 mil
diff --git a/docs/building-spark.md b/docs/building-spark.md
index 78cb9086f95e8..2128fdffecc05 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -7,11 +7,7 @@ redirect_from: "building-with-maven.html"
 * This will become a table of contents (this text will be scraped).
 {:toc}
 
-Building Spark using Maven requires Maven 3.0.4 or newer and Java 6+.
-
-**Note:** Building Spark with Java 7 or later can create JAR files that may not be
-readable with early versions of Java 6, due to the large number of files in the JAR
-archive. Build with Java 6 if this is an issue for your deployment.
+Building Spark using Maven requires Maven 3.0.4 or newer and Java 7+.
 
 # Building with `build/mvn`
 
diff --git a/docs/index.md b/docs/index.md
index fac071da81e60..7939657915fc9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -20,7 +20,7 @@ Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS). It's easy
 locally on one machine --- all you need is to have `java` installed on your system `PATH`,
 or the `JAVA_HOME` environment variable pointing to a Java installation.
 
-Spark runs on Java 6+, Python 2.6+ and R 3.1+. For the Scala API, Spark {{site.SPARK_VERSION}} uses
+Spark runs on Java 7+, Python 2.6+ and R 3.1+. For the Scala API, Spark {{site.SPARK_VERSION}} uses
 Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version 
 ({{site.SCALA_BINARY_VERSION}}.x).
 
diff --git a/docs/programming-guide.md b/docs/programming-guide.md
index 10f474f237bfa..d5ff416fe89a4 100644
--- a/docs/programming-guide.md
+++ b/docs/programming-guide.md
@@ -54,7 +54,7 @@ import org.apache.spark.SparkConf
 
 <div data-lang="java"  markdown="1">
 
-Spark {{site.SPARK_VERSION}} works with Java 6 and higher. If you are using Java 8, Spark supports
+Spark {{site.SPARK_VERSION}} works with Java 7 and higher. If you are using Java 8, Spark supports
 [lambda expressions](http://docs.oracle.com/javase/tutorial/java/javaOO/lambdaexpressions.html)
 for concisely writing functions, otherwise you can use the classes in the
 [org.apache.spark.api.java.function](api/java/index.html?org/apache/spark/api/java/function/package-summary.html) package.
diff --git a/make-distribution.sh b/make-distribution.sh
index a2b0c431fb4d0..9f063da3a16c0 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -141,22 +141,6 @@ SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hi
     # because we use "set -o pipefail"
     echo -n)
 
-JAVA_CMD="$JAVA_HOME"/bin/java
-JAVA_VERSION=$("$JAVA_CMD" -version 2>&1)
-if [[ ! "$JAVA_VERSION" =~ "1.6" && -z "$SKIP_JAVA_TEST" ]]; then
-  echo "***NOTE***: JAVA_HOME is not set to a JDK 6 installation. The resulting"
-  echo "            distribution may not work well with PySpark and will not run"
-  echo "            with Java 6 (See SPARK-1703 and SPARK-1911)."
-  echo "            This test can be disabled by adding --skip-java-test."
-  echo "Output from 'java -version' was:"
-  echo "$JAVA_VERSION"
-  read -p "Would you like to continue anyways? [y,n]: " -r
-  if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
-    echo "Okay, exiting."
-    exit 1
-  fi
-fi
-
 if [ "$NAME" == "none" ]; then
   NAME=$SPARK_HADOOP_VERSION
 fi
diff --git a/pom.xml b/pom.xml
index e65448e4b2325..67b6375f576d3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -116,7 +116,7 @@
     <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
     <akka.group>com.typesafe.akka</akka.group>
     <akka.version>2.3.11</akka.version>
-    <java.version>1.6</java.version>
+    <java.version>1.7</java.version>
     <sbt.project.name>spark</sbt.project.name>
     <mesos.version>0.21.1</mesos.version>
     <mesos.classifier>shaded-protobuf</mesos.classifier>
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/PlatformDependent.java b/unsafe/src/main/java/org/apache/spark/unsafe/PlatformDependent.java
index 24b2892098059..192c6714b2406 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/PlatformDependent.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/PlatformDependent.java
@@ -25,8 +25,7 @@ public final class PlatformDependent {
 
   /**
    * Facade in front of {@link sun.misc.Unsafe}, used to avoid directly exposing Unsafe outside of
-   * this package. This also lets us aovid accidental use of deprecated methods or methods that
-   * aren't present in Java 6.
+   * this package. This also lets us avoid accidental use of deprecated methods.
    */
   public static final class UNSAFE {
 

From b127ff8a0c5fb704da574d101a2d0e27ac5f463a Mon Sep 17 00:00:00 2001
From: cody koeninger <cody@koeninger.org>
Date: Sun, 7 Jun 2015 21:42:45 +0100
Subject: [PATCH 403/525] [SPARK-2808] [STREAMING] [KAFKA] cleanup tests from

see if requiring producer acks eliminates the need for waitUntilLeaderOffset calls in tests

Author: cody koeninger <cody@koeninger.org>

Closes #5921 from koeninger/kafka-0.8.2-test-cleanup and squashes the following commits:

1e89dc8 [cody koeninger] Merge branch 'master' into kafka-0.8.2-test-cleanup
4662828 [cody koeninger] [Streaming][Kafka] filter mima issue for removal of method from private test class
af1e083 [cody koeninger] Merge branch 'master' into kafka-0.8.2-test-cleanup
4298ac2 [cody koeninger] [Streaming][Kafka] update comment to trigger jenkins attempt
1274afb [cody koeninger] [Streaming][Kafka] see if requiring producer acks eliminates the need for waitUntilLeaderOffset calls in tests
---
 .../spark/streaming/kafka/KafkaTestUtils.scala  | 17 ++---------------
 .../streaming/kafka/JavaKafkaRDDSuite.java      |  3 ---
 .../spark/streaming/kafka/KafkaRDDSuite.scala   |  4 ----
 project/MimaExcludes.scala                      |  3 +++
 python/pyspark/streaming/tests.py               |  5 -----
 5 files changed, 5 insertions(+), 27 deletions(-)

diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala
index 6dc4e9517d5a4..b608b75952721 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala
@@ -195,6 +195,8 @@ private class KafkaTestUtils extends Logging {
     val props = new Properties()
     props.put("metadata.broker.list", brokerAddress)
     props.put("serializer.class", classOf[StringEncoder].getName)
+    // wait for all in-sync replicas to ack sends
+    props.put("request.required.acks", "-1")
     props
   }
 
@@ -229,21 +231,6 @@ private class KafkaTestUtils extends Logging {
     tryAgain(1)
   }
 
-  /** Wait until the leader offset for the given topic/partition equals the specified offset */
-  def waitUntilLeaderOffset(
-      topic: String,
-      partition: Int,
-      offset: Long): Unit = {
-    eventually(Time(10000), Time(100)) {
-      val kc = new KafkaCluster(Map("metadata.broker.list" -> brokerAddress))
-      val tp = TopicAndPartition(topic, partition)
-      val llo = kc.getLatestLeaderOffsets(Set(tp)).right.get.apply(tp).offset
-      assert(
-        llo == offset,
-        s"$topic $partition $offset not reached after timeout")
-    }
-  }
-
   private def waitUntilMetadataIsPropagated(topic: String, partition: Int): Unit = {
     def isPropagated = server.apis.metadataCache.getPartitionInfo(topic, partition) match {
       case Some(partitionState) =>
diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java
index 5cf379635354f..a9dc6e50613ca 100644
--- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java
+++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java
@@ -72,9 +72,6 @@ public void testKafkaRDD() throws InterruptedException {
     HashMap<String, String> kafkaParams = new HashMap<String, String>();
     kafkaParams.put("metadata.broker.list", kafkaTestUtils.brokerAddress());
 
-    kafkaTestUtils.waitUntilLeaderOffset(topic1, 0, topic1data.length);
-    kafkaTestUtils.waitUntilLeaderOffset(topic2, 0, topic2data.length);
-
     OffsetRange[] offsetRanges = {
       OffsetRange.create(topic1, 0, 0, 1),
       OffsetRange.create(topic2, 0, 0, 1)
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
index 054487269a935..d5baf5fd89994 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala
@@ -61,8 +61,6 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll {
     val kafkaParams = Map("metadata.broker.list" -> kafkaTestUtils.brokerAddress,
       "group.id" -> s"test-consumer-${Random.nextInt}")
 
-    kafkaTestUtils.waitUntilLeaderOffset(topic, 0, messages.size)
-
     val offsetRanges = Array(OffsetRange(topic, 0, 0, messages.size))
 
     val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
@@ -86,7 +84,6 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll {
     // this is the "lots of messages" case
     kafkaTestUtils.sendMessages(topic, sent)
     val sentCount = sent.values.sum
-    kafkaTestUtils.waitUntilLeaderOffset(topic, 0, sentCount)
 
     // rdd defined from leaders after sending messages, should get the number sent
     val rdd = getRdd(kc, Set(topic))
@@ -113,7 +110,6 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll {
     val sentOnlyOne = Map("d" -> 1)
 
     kafkaTestUtils.sendMessages(topic, sentOnlyOne)
-    kafkaTestUtils.waitUntilLeaderOffset(topic, 0, sentCount + 1)
 
     assert(rdd2.isDefined)
     assert(rdd2.get.count === 0, "got messages when there shouldn't be any")
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 73e4bfd78e577..8a93ca2999510 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -47,6 +47,9 @@ object MimaExcludes {
             // Mima false positive (was a private[spark] class)
             ProblemFilters.exclude[MissingClassProblem](
               "org.apache.spark.util.collection.PairIterator"),
+            // Removing a testing method from a private class
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.streaming.kafka.KafkaTestUtils.waitUntilLeaderOffset"),
             // SQL execution is considered private.
             excludePackage("org.apache.spark.sql.execution")
           )
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index 46cb18b2e8ef9..57049beea4dba 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -615,7 +615,6 @@ def test_kafka_stream(self):
 
         self._kafkaTestUtils.createTopic(topic)
         self._kafkaTestUtils.sendMessages(topic, sendData)
-        self._kafkaTestUtils.waitUntilLeaderOffset(topic, 0, sum(sendData.values()))
 
         stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                          "test-streaming-consumer", {topic: 1},
@@ -631,7 +630,6 @@ def test_kafka_direct_stream(self):
 
         self._kafkaTestUtils.createTopic(topic)
         self._kafkaTestUtils.sendMessages(topic, sendData)
-        self._kafkaTestUtils.waitUntilLeaderOffset(topic, 0, sum(sendData.values()))
 
         stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
         self._validateStreamResult(sendData, stream)
@@ -646,7 +644,6 @@ def test_kafka_direct_stream_from_offset(self):
 
         self._kafkaTestUtils.createTopic(topic)
         self._kafkaTestUtils.sendMessages(topic, sendData)
-        self._kafkaTestUtils.waitUntilLeaderOffset(topic, 0, sum(sendData.values()))
 
         stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
         self._validateStreamResult(sendData, stream)
@@ -661,7 +658,6 @@ def test_kafka_rdd(self):
 
         self._kafkaTestUtils.createTopic(topic)
         self._kafkaTestUtils.sendMessages(topic, sendData)
-        self._kafkaTestUtils.waitUntilLeaderOffset(topic, 0, sum(sendData.values()))
         rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
         self._validateRddResult(sendData, rdd)
 
@@ -677,7 +673,6 @@ def test_kafka_rdd_with_leaders(self):
 
         self._kafkaTestUtils.createTopic(topic)
         self._kafkaTestUtils.sendMessages(topic, sendData)
-        self._kafkaTestUtils.waitUntilLeaderOffset(topic, 0, sum(sendData.values()))
         rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
         self._validateRddResult(sendData, rdd)
 

From 5e7b6b67bed9cd0d8c7d4e78df666b807e8f7ef2 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Sun, 7 Jun 2015 14:11:20 -0700
Subject: [PATCH 404/525] [SPARK-8117] [SQL] Push codegen implementation into
 each Expression

This PR move codegen implementation of expressions into Expression class itself, make it easy to manage.

It introduces two APIs in Expression:
```
def gen(ctx: CodeGenContext): GeneratedExpressionCode
def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code
```

gen(ctx) will call genSource(ctx, ev) to generate Java source code for the current expression. A expression needs to override genSource().

Here are the types:
```
type Term String
type Code String

/**
 * Java source for evaluating an [[Expression]] given a [[Row]] of input.
 */
case class GeneratedExpressionCode(var code: Code,
                               nullTerm: Term,
                               primitiveTerm: Term,
                               objectTerm: Term)
/**
 * A context for codegen, which is used to bookkeeping the expressions those are not supported
 * by codegen, then they are evaluated directly. The unsupported expression is appended at the
 * end of `references`, the position of it is kept in the code, used to access and evaluate it.
 */
class CodeGenContext {
  /**
   * Holding all the expressions those do not support codegen, will be evaluated directly.
   */
  val references: Seq[Expression] = new mutable.ArrayBuffer[Expression]()
}
```

This is basically #6660, but fixed style violation and compilation failure.

Author: Davies Liu <davies@databricks.com>
Author: Reynold Xin <rxin@databricks.com>

Closes #6690 from rxin/codegen and squashes the following commits:

e1368c2 [Reynold Xin] Fixed tests.
73db80e [Reynold Xin] Fixed compilation failure.
19d6435 [Reynold Xin] Fixed style violation.
9adaeaf [Davies Liu] address comments
f42c732 [Davies Liu] improve coverage and tests
bad6828 [Davies Liu] address comments
e03edaa [Davies Liu] consts fold
86fac2c [Davies Liu] fix style
02262c9 [Davies Liu] address comments
b5d3617 [Davies Liu] Merge pull request #5 from rxin/codegen
48c454f [Reynold Xin] Some code gen update.
2344bc0 [Davies Liu] fix test
12ff88a [Davies Liu] fix build
c5fb514 [Davies Liu] rename
8c6d82d [Davies Liu] update docs
b145047 [Davies Liu] fix style
e57959d [Davies Liu] add type alias
3ff25f8 [Davies Liu] refactor
593d617 [Davies Liu] pushing codegen into Expression
---
 .../catalyst/expressions/BoundAttribute.scala |   9 +
 .../spark/sql/catalyst/expressions/Cast.scala |  42 +
 .../sql/catalyst/expressions/Expression.scala | 100 +++
 .../sql/catalyst/expressions/arithmetic.scala | 161 +++-
 .../expressions/codegen/CodeGenerator.scala   | 750 ++++--------------
 .../codegen/GenerateMutableProjection.scala   |   6 +-
 .../codegen/GenerateOrdering.scala            |  20 +-
 .../codegen/GeneratePredicate.scala           |   4 +-
 .../codegen/GenerateProjection.scala          |  26 +-
 .../expressions/codegen/package.scala         |   3 +
 .../expressions/decimalFunctions.scala        |  19 +
 .../sql/catalyst/expressions/literals.scala   |  54 ++
 .../expressions/mathfuncs/binary.scala        |  24 +-
 .../expressions/mathfuncs/unary.scala         |  30 +-
 .../expressions/namedExpressions.scala        |   6 +-
 .../catalyst/expressions/nullFunctions.scala  |  55 ++
 .../sql/catalyst/expressions/predicates.scala | 192 ++++-
 .../spark/sql/catalyst/expressions/sets.scala |  54 +-
 .../expressions/stringOperations.scala        |  18 +
 .../ExpressionEvaluationSuite.scala           |  87 +-
 .../GeneratedEvaluationSuite.scala            |  27 +-
 .../GeneratedMutableEvaluationSuite.scala     |  61 --
 .../ParquetPartitionDiscoverySuite.scala      |   6 +-
 23 files changed, 1036 insertions(+), 718 deletions(-)
 delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 1ffc95c676f6f..005de3166095f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.errors.attachTree
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.catalyst.trees
 
@@ -41,6 +42,14 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
   override def qualifiers: Seq[String] = throw new UnsupportedOperationException
 
   override def exprId: ExprId = throw new UnsupportedOperationException
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    s"""
+        boolean ${ev.isNull} = i.isNullAt($ordinal);
+        ${ctx.javaType(dataType)} ${ev.primitive} = ${ev.isNull} ?
+            ${ctx.defaultValue(dataType)} : (${ctx.getColumn(dataType, ordinal)});
+    """
+  }
 }
 
 object BindReferences extends Logging {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 21adac144112e..5f76a512679a4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -21,6 +21,7 @@ import java.sql.{Date, Timestamp}
 import java.text.{DateFormat, SimpleDateFormat}
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 
@@ -433,6 +434,47 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     val evaluated = child.eval(input)
     if (evaluated == null) null else cast(evaluated)
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    // TODO(cg): Add support for more data types.
+    (child.dataType, dataType) match {
+
+      case (BinaryType, StringType) =>
+        defineCodeGen (ctx, ev, c =>
+          s"new ${ctx.stringType}().set($c)")
+      case (DateType, StringType) =>
+        defineCodeGen(ctx, ev, c =>
+          s"""new ${ctx.stringType}().set(
+                org.apache.spark.sql.catalyst.util.DateUtils.toString($c))""")
+      // Special handling required for timestamps in hive test cases since the toString function
+      // does not match the expected output.
+      case (TimestampType, StringType) =>
+        super.genCode(ctx, ev)
+      case (_, StringType) =>
+        defineCodeGen(ctx, ev, c => s"new ${ctx.stringType}().set(String.valueOf($c))")
+
+      // fallback for DecimalType, this must be before other numeric types
+      case (_, dt: DecimalType) =>
+        super.genCode(ctx, ev)
+
+      case (BooleanType, dt: NumericType) =>
+        defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})($c ? 1 : 0)")
+      case (dt: DecimalType, BooleanType) =>
+        defineCodeGen(ctx, ev, c => s"$c.isZero()")
+      case (dt: NumericType, BooleanType) =>
+        defineCodeGen(ctx, ev, c => s"$c != 0")
+
+      case (_: DecimalType, IntegerType) =>
+        defineCodeGen(ctx, ev, c => s"($c).toInt()")
+      case (_: DecimalType, dt: NumericType) =>
+        defineCodeGen(ctx, ev, c => s"($c).to${ctx.boxedType(dt)}()")
+      case (_: NumericType, dt: NumericType) =>
+        defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})($c)")
+
+      case other =>
+        super.genCode(ctx, ev)
+    }
+  }
 }
 
 object Cast {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index b2b9d1a5e1581..0ed576b3d5870 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedAttribute}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext, Term}
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types._
@@ -51,6 +52,44 @@ abstract class Expression extends TreeNode[Expression] {
   /** Returns the result of evaluating this expression on a given input Row */
   def eval(input: Row = null): Any
 
+  /**
+   * Returns an [[GeneratedExpressionCode]], which contains Java source code that
+   * can be used to generate the result of evaluating the expression on an input row.
+   *
+   * @param ctx a [[CodeGenContext]]
+   * @return [[GeneratedExpressionCode]]
+   */
+  def gen(ctx: CodeGenContext): GeneratedExpressionCode = {
+    val isNull = ctx.freshName("isNull")
+    val primitive = ctx.freshName("primitive")
+    val ve = GeneratedExpressionCode("", isNull, primitive)
+    ve.code = genCode(ctx, ve)
+    ve
+  }
+
+  /**
+   * Returns Java source code that can be compiled to evaluate this expression.
+   * The default behavior is to call the eval method of the expression. Concrete expression
+   * implementations should override this to do actual code generation.
+   *
+   * @param ctx a [[CodeGenContext]]
+   * @param ev an [[GeneratedExpressionCode]] with unique terms.
+   * @return Java source code
+   */
+  protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    ctx.references += this
+    val objectTerm = ctx.freshName("obj")
+    s"""
+      /* expression: ${this} */
+      Object ${objectTerm} = expressions[${ctx.references.size - 1}].eval(i);
+      boolean ${ev.isNull} = ${objectTerm} == null;
+      ${ctx.javaType(this.dataType)} ${ev.primitive} = ${ctx.defaultValue(this.dataType)};
+      if (!${ev.isNull}) {
+        ${ev.primitive} = (${ctx.boxedType(this.dataType)})${objectTerm};
+      }
+    """
+  }
+
   /**
    * Returns `true` if this expression and all its children have been resolved to a specific schema
    * and input data types checking passed, and `false` if it still contains any unresolved
@@ -116,6 +155,41 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
   override def nullable: Boolean = left.nullable || right.nullable
 
   override def toString: String = s"($left $symbol $right)"
+
+  /**
+   * Short hand for generating binary evaluation code, which depends on two sub-evaluations of
+   * the same type.  If either of the sub-expressions is null, the result of this computation
+   * is assumed to be null.
+   *
+   * @param f accepts two variable names and returns Java code to compute the output.
+   */
+  protected def defineCodeGen(
+      ctx: CodeGenContext,
+      ev: GeneratedExpressionCode,
+      f: (Term, Term) => Code): String = {
+    // TODO: Right now some timestamp tests fail if we enforce this...
+    if (left.dataType != right.dataType) {
+      // log.warn(s"${left.dataType} != ${right.dataType}")
+    }
+
+    val eval1 = left.gen(ctx)
+    val eval2 = right.gen(ctx)
+    val resultCode = f(eval1.primitive, eval2.primitive)
+
+    s"""
+      ${eval1.code}
+      boolean ${ev.isNull} = ${eval1.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${eval2.code}
+        if(!${eval2.isNull}) {
+          ${ev.primitive} = $resultCode;
+        } else {
+          ${ev.isNull} = true;
+        }
+      }
+    """
+  }
 }
 
 private[sql] object BinaryExpression {
@@ -128,6 +202,32 @@ abstract class LeafExpression extends Expression with trees.LeafNode[Expression]
 
 abstract class UnaryExpression extends Expression with trees.UnaryNode[Expression] {
   self: Product =>
+
+  /**
+   * Called by unary expressions to generate a code block that returns null if its parent returns
+   * null, and if not not null, use `f` to generate the expression.
+   *
+   * As an example, the following does a boolean inversion (i.e. NOT).
+   * {{{
+   *   defineCodeGen(ctx, ev, c => s"!($c)")
+   * }}}
+   *
+   * @param f function that accepts a variable name and returns Java code to compute the output.
+   */
+  protected def defineCodeGen(
+      ctx: CodeGenContext,
+      ev: GeneratedExpressionCode,
+      f: Term => Code): Code = {
+    val eval = child.gen(ctx)
+    // reuse the previous isNull
+    ev.isNull = eval.isNull
+    eval.code + s"""
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${ev.primitive} = ${f(eval.primitive)};
+      }
+    """
+  }
 }
 
 // TODO Semantically we probably not need GroupExpression
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index a3770f998d94d..3ac7c92dcd009 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.codegen.{Code, GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
@@ -49,6 +50,11 @@ case class UnaryMinus(child: Expression) extends UnaryArithmetic {
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = dataType match {
+    case dt: DecimalType => defineCodeGen(ctx, ev, c => s"c.unary_$$minus()")
+    case dt: NumericType => defineCodeGen(ctx, ev, c => s"-($c)")
+  }
+
   protected override def evalInternal(evalE: Any) = numeric.negate(evalE)
 }
 
@@ -67,6 +73,21 @@ case class Sqrt(child: Expression) extends UnaryArithmetic {
     if (value < 0) null
     else math.sqrt(value)
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval = child.gen(ctx)
+    eval.code + s"""
+      boolean ${ev.isNull} = ${eval.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        if (${eval.primitive} < 0.0) {
+          ${ev.isNull} = true;
+        } else {
+          ${ev.primitive} = java.lang.Math.sqrt(${eval.primitive});
+        }
+      }
+    """
+  }
 }
 
 /**
@@ -86,6 +107,9 @@ case class Abs(child: Expression) extends UnaryArithmetic {
 abstract class BinaryArithmetic extends BinaryExpression {
   self: Product =>
 
+  /** Name of the function for this expression on a [[Decimal]] type. */
+  def decimalMethod: String = ""
+
   override def dataType: DataType = left.dataType
 
   override def checkInputDataTypes(): TypeCheckResult = {
@@ -114,6 +138,17 @@ abstract class BinaryArithmetic extends BinaryExpression {
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = dataType match {
+    case dt: DecimalType =>
+      defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.$decimalMethod($eval2)")
+    // byte and short are casted into int when add, minus, times or divide
+    case ByteType | ShortType =>
+      defineCodeGen(ctx, ev, (eval1, eval2) =>
+        s"(${ctx.javaType(dataType)})($eval1 $symbol $eval2)")
+    case _ =>
+      defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1 $symbol $eval2")
+  }
+
   protected def evalInternal(evalE1: Any, evalE2: Any): Any =
     sys.error(s"BinaryArithmetics must override either eval or evalInternal")
 }
@@ -124,6 +159,7 @@ private[sql] object BinaryArithmetic {
 
 case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "+"
+  override def decimalMethod: String = "$plus"
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
@@ -138,6 +174,7 @@ case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
 
 case class Subtract(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "-"
+  override def decimalMethod: String = "$minus"
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
@@ -152,6 +189,7 @@ case class Subtract(left: Expression, right: Expression) extends BinaryArithmeti
 
 case class Multiply(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "*"
+  override def decimalMethod: String = "$times"
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
@@ -166,6 +204,8 @@ case class Multiply(left: Expression, right: Expression) extends BinaryArithmeti
 
 case class Divide(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "/"
+  override def decimalMethod: String = "$divide"
+
   override def nullable: Boolean = true
 
   override lazy val resolved =
@@ -192,10 +232,40 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
       }
     }
   }
+
+  /**
+   * Special case handling due to division by 0 => null.
+   */
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval1 = left.gen(ctx)
+    val eval2 = right.gen(ctx)
+    val test = if (left.dataType.isInstanceOf[DecimalType]) {
+      s"${eval2.primitive}.isZero()"
+    } else {
+      s"${eval2.primitive} == 0"
+    }
+    val method = if (left.dataType.isInstanceOf[DecimalType]) {
+      s".$decimalMethod"
+    } else {
+      s"$symbol"
+    }
+    eval1.code + eval2.code +
+      s"""
+      boolean ${ev.isNull} = false;
+      ${ctx.javaType(left.dataType)} ${ev.primitive} = ${ctx.defaultValue(left.dataType)};
+      if (${eval1.isNull} || ${eval2.isNull} || $test) {
+        ${ev.isNull} = true;
+      } else {
+        ${ev.primitive} = ${eval1.primitive}$method(${eval2.primitive});
+      }
+      """
+  }
 }
 
 case class Remainder(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "%"
+  override def decimalMethod: String = "reminder"
+
   override def nullable: Boolean = true
 
   override lazy val resolved =
@@ -222,6 +292,34 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
       }
     }
   }
+
+  /**
+   * Special case handling for x % 0 ==> null.
+   */
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval1 = left.gen(ctx)
+    val eval2 = right.gen(ctx)
+    val test = if (left.dataType.isInstanceOf[DecimalType]) {
+      s"${eval2.primitive}.isZero()"
+    } else {
+      s"${eval2.primitive} == 0"
+    }
+    val method = if (left.dataType.isInstanceOf[DecimalType]) {
+      s".$decimalMethod"
+    } else {
+      s"$symbol"
+    }
+    eval1.code + eval2.code +
+      s"""
+      boolean ${ev.isNull} = false;
+      ${ctx.javaType(left.dataType)} ${ev.primitive} = ${ctx.defaultValue(left.dataType)};
+      if (${eval1.isNull} || ${eval2.isNull} || $test) {
+        ${ev.isNull} = true;
+      } else {
+        ${ev.primitive} = ${eval1.primitive}$method(${eval2.primitive});
+      }
+      """
+  }
 }
 
 /**
@@ -271,7 +369,7 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
 }
 
 /**
- * A function that calculates bitwise xor(^) of two numbers.
+ * A function that calculates bitwise xor of two numbers.
  */
 case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "^"
@@ -313,6 +411,10 @@ case class BitwiseNot(child: Expression) extends UnaryArithmetic {
       ((evalE: Long) => ~evalE).asInstanceOf[(Any) => Any]
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dataType)})~($c)")
+  }
+
   protected override def evalInternal(evalE: Any) = not(evalE)
 }
 
@@ -340,6 +442,33 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    if (ctx.isNativeType(left.dataType)) {
+      val eval1 = left.gen(ctx)
+      val eval2 = right.gen(ctx)
+      eval1.code + eval2.code + s"""
+        boolean ${ev.isNull} = false;
+        ${ctx.javaType(left.dataType)} ${ev.primitive} =
+          ${ctx.defaultValue(left.dataType)};
+
+        if (${eval1.isNull}) {
+          ${ev.isNull} = ${eval2.isNull};
+          ${ev.primitive} = ${eval2.primitive};
+        } else if (${eval2.isNull}) {
+          ${ev.isNull} = ${eval1.isNull};
+          ${ev.primitive} = ${eval1.primitive};
+        } else {
+          if (${eval1.primitive} > ${eval2.primitive}) {
+            ${ev.primitive} = ${eval1.primitive};
+          } else {
+            ${ev.primitive} = ${eval2.primitive};
+          }
+        }
+      """
+    } else {
+      super.genCode(ctx, ev)
+    }
+  }
   override def toString: String = s"MaxOf($left, $right)"
 }
 
@@ -367,5 +496,35 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    if (ctx.isNativeType(left.dataType)) {
+
+      val eval1 = left.gen(ctx)
+      val eval2 = right.gen(ctx)
+
+      eval1.code + eval2.code + s"""
+        boolean ${ev.isNull} = false;
+        ${ctx.javaType(left.dataType)} ${ev.primitive} =
+          ${ctx.defaultValue(left.dataType)};
+
+        if (${eval1.isNull}) {
+          ${ev.isNull} = ${eval2.isNull};
+          ${ev.primitive} = ${eval2.primitive};
+        } else if (${eval2.isNull}) {
+          ${ev.isNull} = ${eval1.isNull};
+          ${ev.primitive} = ${eval1.primitive};
+        } else {
+          if (${eval1.primitive} < ${eval2.primitive}) {
+            ${ev.primitive} = ${eval1.primitive};
+          } else {
+            ${ev.primitive} = ${eval2.primitive};
+          }
+        }
+      """
+    } else {
+      super.genCode(ctx, ev)
+    }
+  }
+
   override def toString: String = s"MinOf($left, $right)"
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index cd604121b7dd9..c8d0aaf79f5f2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -24,7 +24,6 @@ import com.google.common.cache.{CacheBuilder, CacheLoader}
 import org.codehaus.janino.ClassBodyEvaluator
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
 
@@ -32,6 +31,157 @@ import org.apache.spark.sql.types._
 class IntegerHashSet extends org.apache.spark.util.collection.OpenHashSet[Int]
 class LongHashSet extends org.apache.spark.util.collection.OpenHashSet[Long]
 
+/**
+ * Java source for evaluating an [[Expression]] given a [[Row]] of input.
+ *
+ * @param code The sequence of statements required to evaluate the expression.
+ * @param isNull A term that holds a boolean value representing whether the expression evaluated
+ *                 to null.
+ * @param primitive A term for a possible primitive value of the result of the evaluation. Not
+ *                      valid if `isNull` is set to `true`.
+ */
+case class GeneratedExpressionCode(var code: Code, var isNull: Term, var primitive: Term)
+
+/**
+ * A context for codegen, which is used to bookkeeping the expressions those are not supported
+ * by codegen, then they are evaluated directly. The unsupported expression is appended at the
+ * end of `references`, the position of it is kept in the code, used to access and evaluate it.
+ */
+class CodeGenContext {
+
+  /**
+   * Holding all the expressions those do not support codegen, will be evaluated directly.
+   */
+  val references: mutable.ArrayBuffer[Expression] = new mutable.ArrayBuffer[Expression]()
+
+  val stringType: String = classOf[UTF8String].getName
+  val decimalType: String = classOf[Decimal].getName
+
+  private val curId = new java.util.concurrent.atomic.AtomicInteger()
+
+  /**
+   * Returns a term name that is unique within this instance of a `CodeGenerator`.
+   *
+   * (Since we aren't in a macro context we do not seem to have access to the built in `freshName`
+   * function.)
+   */
+  def freshName(prefix: String): Term = {
+    s"$prefix${curId.getAndIncrement}"
+  }
+
+  /**
+   * Return the code to access a column for given DataType
+   */
+  def getColumn(dataType: DataType, ordinal: Int): Code = {
+    if (isNativeType(dataType)) {
+      s"i.${accessorForType(dataType)}($ordinal)"
+    } else {
+      s"(${boxedType(dataType)})i.apply($ordinal)"
+    }
+  }
+
+  /**
+   * Return the code to update a column in Row for given DataType
+   */
+  def setColumn(dataType: DataType, ordinal: Int, value: Term): Code = {
+    if (isNativeType(dataType)) {
+      s"${mutatorForType(dataType)}($ordinal, $value)"
+    } else {
+      s"update($ordinal, $value)"
+    }
+  }
+
+  /**
+   * Return the name of accessor in Row for a DataType
+   */
+  def accessorForType(dt: DataType): Term = dt match {
+    case IntegerType => "getInt"
+    case other => s"get${boxedType(dt)}"
+  }
+
+  /**
+   * Return the name of mutator in Row for a DataType
+   */
+  def mutatorForType(dt: DataType): Term = dt match {
+    case IntegerType => "setInt"
+    case other => s"set${boxedType(dt)}"
+  }
+
+  /**
+   * Return the Java type for a DataType
+   */
+  def javaType(dt: DataType): Term = dt match {
+    case IntegerType => "int"
+    case LongType => "long"
+    case ShortType => "short"
+    case ByteType => "byte"
+    case DoubleType => "double"
+    case FloatType => "float"
+    case BooleanType => "boolean"
+    case dt: DecimalType => decimalType
+    case BinaryType => "byte[]"
+    case StringType => stringType
+    case DateType => "int"
+    case TimestampType => "java.sql.Timestamp"
+    case dt: OpenHashSetUDT if dt.elementType == IntegerType => classOf[IntegerHashSet].getName
+    case dt: OpenHashSetUDT if dt.elementType == LongType => classOf[LongHashSet].getName
+    case _ => "Object"
+  }
+
+  /**
+   * Return the boxed type in Java
+   */
+  def boxedType(dt: DataType): Term = dt match {
+    case IntegerType => "Integer"
+    case LongType => "Long"
+    case ShortType => "Short"
+    case ByteType => "Byte"
+    case DoubleType => "Double"
+    case FloatType => "Float"
+    case BooleanType => "Boolean"
+    case DateType => "Integer"
+    case _ => javaType(dt)
+  }
+
+  /**
+   * Return the representation of default value for given DataType
+   */
+  def defaultValue(dt: DataType): Term = dt match {
+    case BooleanType => "false"
+    case FloatType => "-1.0f"
+    case ShortType => "(short)-1"
+    case LongType => "-1L"
+    case ByteType => "(byte)-1"
+    case DoubleType => "-1.0"
+    case IntegerType => "-1"
+    case DateType => "-1"
+    case _ => "null"
+  }
+
+  /**
+   * Returns a function to generate equal expression in Java
+   */
+  def equalFunc(dataType: DataType): ((Term, Term) => Code) = dataType match {
+    case BinaryType => { case (eval1, eval2) =>
+      s"java.util.Arrays.equals($eval1, $eval2)" }
+    case IntegerType | BooleanType | LongType | DoubleType | FloatType | ShortType | ByteType =>
+      { case (eval1, eval2) => s"$eval1 == $eval2" }
+    case other =>
+      { case (eval1, eval2) => s"$eval1.equals($eval2)" }
+  }
+
+  /**
+   * List of data types that have special accessors and setters in [[Row]].
+   */
+  val nativeTypes =
+    Seq(IntegerType, BooleanType, LongType, DoubleType, FloatType, ShortType, ByteType)
+
+  /**
+   * Returns true if the data type has a special accessor and setter in [[Row]].
+   */
+  def isNativeType(dt: DataType): Boolean = nativeTypes.contains(dt)
+}
+
 /**
  * A base class for generators of byte code to perform expression evaluation.  Includes a set of
  * helpers for referring to Catalyst types and building trees that perform evaluation of individual
@@ -39,14 +189,9 @@ class LongHashSet extends org.apache.spark.util.collection.OpenHashSet[Long]
  */
 abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Logging {
 
-  protected val rowType = classOf[Row].getName
-  protected val stringType = classOf[UTF8String].getName
-  protected val decimalType = classOf[Decimal].getName
-  protected val exprType = classOf[Expression].getName
-  protected val mutableRowType = classOf[MutableRow].getName
-  protected val genericMutableRowType = classOf[GenericMutableRow].getName
-
-  private val curId = new java.util.concurrent.atomic.AtomicInteger()
+  protected val exprType: String = classOf[Expression].getName
+  protected val mutableRowType: String = classOf[MutableRow].getName
+  protected val genericMutableRowType: String = classOf[GenericMutableRow].getName
 
   /**
    * Can be flipped on manually in the console to add (expensive) expression evaluation trace code.
@@ -75,10 +220,16 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
    */
   protected def compile(code: String): Class[_] = {
     val startTime = System.nanoTime()
-    val clazz = new ClassBodyEvaluator(code).getClazz()
+    val clazz = try {
+      new ClassBodyEvaluator(code).getClazz()
+    } catch {
+      case e: Exception =>
+        logError(s"failed to compile:\n $code", e)
+        throw e
+    }
     val endTime = System.nanoTime()
     def timeMs: Double = (endTime - startTime).toDouble / 1000000
-    logDebug(s"Compiled Java code (${code.size} bytes) in $timeMs ms")
+    logDebug(s"Code (${code.size} bytes) compiled in $timeMs ms")
     clazz
   }
 
@@ -112,586 +263,11 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
   /** Generates the requested evaluator given already bound expression(s). */
   def generate(expressions: InType): OutType = cache.get(canonicalize(expressions))
 
-  /**
-   * Returns a term name that is unique within this instance of a `CodeGenerator`.
-   *
-   * (Since we aren't in a macro context we do not seem to have access to the built in `freshName`
-   * function.)
-   */
-  protected def freshName(prefix: String): String = {
-    s"$prefix${curId.getAndIncrement}"
-  }
-
-  /**
-   * Scala ASTs for evaluating an [[Expression]] given a [[Row]] of input.
-   *
-   * @param code The sequence of statements required to evaluate the expression.
-   * @param nullTerm A term that holds a boolean value representing whether the expression evaluated
-   *                 to null.
-   * @param primitiveTerm A term for a possible primitive value of the result of the evaluation. Not
-   *                      valid if `nullTerm` is set to `true`.
-   * @param objectTerm A possibly boxed version of the result of evaluating this expression.
-   */
-  protected case class EvaluatedExpression(
-      code: String,
-      nullTerm: String,
-      primitiveTerm: String,
-      objectTerm: String)
-
-  /**
-   * A context for codegen, which is used to bookkeeping the expressions those are not supported
-   * by codegen, then they are evaluated directly. The unsupported expression is appended at the
-   * end of `references`, the position of it is kept in the code, used to access and evaluate it.
-   */
-  protected class CodeGenContext {
-    /**
-     * Holding all the expressions those do not support codegen, will be evaluated directly.
-     */
-    val references: mutable.ArrayBuffer[Expression] = new mutable.ArrayBuffer[Expression]()
-  }
-
   /**
    * Create a new codegen context for expression evaluator, used to store those
    * expressions that don't support codegen
    */
   def newCodeGenContext(): CodeGenContext = {
-    new CodeGenContext()
-  }
-
-  /**
-   * Given an expression tree returns an [[EvaluatedExpression]], which contains Scala trees that
-   * can be used to determine the result of evaluating the expression on an input row.
-   */
-  def expressionEvaluator(e: Expression, ctx: CodeGenContext): EvaluatedExpression = {
-    val primitiveTerm = freshName("primitiveTerm")
-    val nullTerm = freshName("nullTerm")
-    val objectTerm = freshName("objectTerm")
-
-    implicit class Evaluate1(e: Expression) {
-      def castOrNull(f: String => String, dataType: DataType): String = {
-        val eval = expressionEvaluator(e, ctx)
-        eval.code +
-        s"""
-          boolean $nullTerm = ${eval.nullTerm};
-          ${primitiveForType(dataType)} $primitiveTerm = ${defaultPrimitive(dataType)};
-          if (!$nullTerm) {
-            $primitiveTerm = ${f(eval.primitiveTerm)};
-          }
-        """
-      }
-    }
-
-    implicit class Evaluate2(expressions: (Expression, Expression)) {
-
-      /**
-       * Short hand for generating binary evaluation code, which depends on two sub-evaluations of
-       * the same type.  If either of the sub-expressions is null, the result of this computation
-       * is assumed to be null.
-       *
-       * @param f a function from two primitive term names to a tree that evaluates them.
-       */
-      def evaluate(f: (String, String) => String): String =
-        evaluateAs(expressions._1.dataType)(f)
-
-      def evaluateAs(resultType: DataType)(f: (String, String) => String): String = {
-        // TODO: Right now some timestamp tests fail if we enforce this...
-        if (expressions._1.dataType != expressions._2.dataType) {
-          log.warn(s"${expressions._1.dataType} != ${expressions._2.dataType}")
-        }
-
-        val eval1 = expressionEvaluator(expressions._1, ctx)
-        val eval2 = expressionEvaluator(expressions._2, ctx)
-        val resultCode = f(eval1.primitiveTerm, eval2.primitiveTerm)
-
-        eval1.code + eval2.code +
-        s"""
-          boolean $nullTerm = ${eval1.nullTerm} || ${eval2.nullTerm};
-          ${primitiveForType(resultType)} $primitiveTerm = ${defaultPrimitive(resultType)};
-          if(!$nullTerm) {
-            $primitiveTerm = (${primitiveForType(resultType)})($resultCode);
-          }
-        """
-      }
-    }
-
-    val inputTuple = "i"
-
-    // TODO: Skip generation of null handling code when expression are not nullable.
-    val primitiveEvaluation: PartialFunction[Expression, String] = {
-      case b @ BoundReference(ordinal, dataType, nullable) =>
-        s"""
-          final boolean $nullTerm = $inputTuple.isNullAt($ordinal);
-          final ${primitiveForType(dataType)} $primitiveTerm = $nullTerm ?
-              ${defaultPrimitive(dataType)} : (${getColumn(inputTuple, dataType, ordinal)});
-         """
-
-      case expressions.Literal(null, dataType) =>
-        s"""
-          final boolean $nullTerm = true;
-          ${primitiveForType(dataType)} $primitiveTerm = ${defaultPrimitive(dataType)};
-        """
-
-      case expressions.Literal(value: UTF8String, StringType) =>
-        val arr = s"new byte[]{${value.getBytes.map(_.toString).mkString(", ")}}"
-        s"""
-          final boolean $nullTerm = false;
-          ${stringType} $primitiveTerm =
-            new ${stringType}().set(${arr});
-         """
-
-      case expressions.Literal(value, FloatType) =>
-        s"""
-          final boolean $nullTerm = false;
-          float $primitiveTerm = ${value}f;
-         """
-
-      case expressions.Literal(value, dt @ DecimalType()) =>
-        s"""
-          final boolean $nullTerm = false;
-          ${primitiveForType(dt)} $primitiveTerm = new ${primitiveForType(dt)}().set($value);
-         """
-
-      case expressions.Literal(value, dataType) =>
-        s"""
-          final boolean $nullTerm = false;
-          ${primitiveForType(dataType)} $primitiveTerm = $value;
-         """
-
-      case Cast(child @ BinaryType(), StringType) =>
-        child.castOrNull(c =>
-          s"new ${stringType}().set($c)",
-          StringType)
-
-      case Cast(child @ DateType(), StringType) =>
-        child.castOrNull(c =>
-          s"""new ${stringType}().set(
-                org.apache.spark.sql.catalyst.util.DateUtils.toString($c))""",
-          StringType)
-
-      case Cast(child @ BooleanType(), dt: NumericType)  if !dt.isInstanceOf[DecimalType] =>
-        child.castOrNull(c => s"(${primitiveForType(dt)})($c?1:0)", dt)
-
-      case Cast(child @ DecimalType(), IntegerType) =>
-        child.castOrNull(c => s"($c).toInt()", IntegerType)
-
-      case Cast(child @ DecimalType(), dt: NumericType) if !dt.isInstanceOf[DecimalType] =>
-        child.castOrNull(c => s"($c).to${termForType(dt)}()", dt)
-
-      case Cast(child @ NumericType(), dt: NumericType) if !dt.isInstanceOf[DecimalType] =>
-        child.castOrNull(c => s"(${primitiveForType(dt)})($c)", dt)
-
-      // Special handling required for timestamps in hive test cases since the toString function
-      // does not match the expected output.
-      case Cast(e, StringType) if e.dataType != TimestampType =>
-        e.castOrNull(c =>
-          s"new ${stringType}().set(String.valueOf($c))",
-          StringType)
-
-      case EqualTo(e1 @ BinaryType(), e2 @ BinaryType()) =>
-        (e1, e2).evaluateAs (BooleanType) {
-          case (eval1, eval2) =>
-            s"java.util.Arrays.equals((byte[])$eval1, (byte[])$eval2)"
-        }
-
-      case EqualTo(e1, e2) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 == $eval2" }
-
-      case GreaterThan(e1 @ NumericType(), e2 @ NumericType()) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 > $eval2" }
-      case GreaterThanOrEqual(e1 @ NumericType(), e2 @ NumericType()) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 >= $eval2" }
-      case LessThan(e1 @ NumericType(), e2 @ NumericType()) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 < $eval2" }
-      case LessThanOrEqual(e1 @ NumericType(), e2 @ NumericType()) =>
-        (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => s"$eval1 <= $eval2" }
-
-      case And(e1, e2) =>
-        val eval1 = expressionEvaluator(e1, ctx)
-        val eval2 = expressionEvaluator(e2, ctx)
-        s"""
-          ${eval1.code}
-          boolean $nullTerm = false;
-          boolean $primitiveTerm  = false;
-
-          if (!${eval1.nullTerm} && !${eval1.primitiveTerm}) {
-          } else {
-            ${eval2.code}
-            if (!${eval2.nullTerm} && !${eval2.primitiveTerm}) {
-            } else if (!${eval1.nullTerm} && !${eval2.nullTerm}) {
-              $primitiveTerm = true;
-            } else {
-              $nullTerm = true;
-            }
-          }
-         """
-
-      case Or(e1, e2) =>
-        val eval1 = expressionEvaluator(e1, ctx)
-        val eval2 = expressionEvaluator(e2, ctx)
-
-        s"""
-          ${eval1.code}
-          boolean $nullTerm = false;
-          boolean $primitiveTerm = false;
-
-          if (!${eval1.nullTerm} && ${eval1.primitiveTerm}) {
-            $primitiveTerm = true;
-          } else {
-            ${eval2.code}
-            if (!${eval2.nullTerm} && ${eval2.primitiveTerm}) {
-              $primitiveTerm = true;
-            } else if (!${eval1.nullTerm} && !${eval2.nullTerm}) {
-              $primitiveTerm = false;
-            } else {
-              $nullTerm = true;
-            }
-          }
-         """
-
-      case Not(child) =>
-        // Uh, bad function name...
-        child.castOrNull(c => s"!$c", BooleanType)
-
-      case Add(e1 @ DecimalType(), e2 @ DecimalType()) =>
-        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1.$$plus($eval2)" }
-      case Subtract(e1 @ DecimalType(), e2 @ DecimalType()) =>
-        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1.$$minus($eval2)" }
-      case Multiply(e1 @ DecimalType(), e2 @ DecimalType()) =>
-        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1.$$times($eval2)" }
-      case Divide(e1 @ DecimalType(), e2 @ DecimalType()) =>
-        val eval1 = expressionEvaluator(e1, ctx)
-        val eval2 = expressionEvaluator(e2, ctx)
-        eval1.code + eval2.code +
-          s"""
-          boolean $nullTerm = false;
-          ${primitiveForType(e1.dataType)} $primitiveTerm = null;
-          if (${eval1.nullTerm} || ${eval2.nullTerm} || ${eval2.primitiveTerm}.isZero()) {
-            $nullTerm = true;
-          } else {
-            $primitiveTerm = ${eval1.primitiveTerm}.$$div${eval2.primitiveTerm});
-          }
-          """
-      case Remainder(e1 @ DecimalType(), e2 @ DecimalType()) =>
-        val eval1 = expressionEvaluator(e1, ctx)
-        val eval2 = expressionEvaluator(e2, ctx)
-        eval1.code + eval2.code +
-          s"""
-          boolean $nullTerm = false;
-          ${primitiveForType(e1.dataType)} $primitiveTerm = 0;
-          if (${eval1.nullTerm} || ${eval2.nullTerm} || ${eval2.primitiveTerm}.isZero()) {
-            $nullTerm = true;
-          } else {
-            $primitiveTerm = ${eval1.primitiveTerm}.remainder(${eval2.primitiveTerm});
-          }
-         """
-
-      case Add(e1, e2) =>
-        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1 + $eval2" }
-      case Subtract(e1, e2) =>
-        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1 - $eval2" }
-      case Multiply(e1, e2) =>
-        (e1, e2) evaluate { case (eval1, eval2) => s"$eval1 * $eval2" }
-      case Divide(e1, e2) =>
-        val eval1 = expressionEvaluator(e1, ctx)
-        val eval2 = expressionEvaluator(e2, ctx)
-        eval1.code + eval2.code +
-        s"""
-          boolean $nullTerm = false;
-          ${primitiveForType(e1.dataType)} $primitiveTerm = 0;
-          if (${eval1.nullTerm} || ${eval2.nullTerm} || ${eval2.primitiveTerm} == 0) {
-            $nullTerm = true;
-          } else {
-            $primitiveTerm = ${eval1.primitiveTerm} / ${eval2.primitiveTerm};
-          }
-        """
-      case Remainder(e1, e2) =>
-        val eval1 = expressionEvaluator(e1, ctx)
-        val eval2 = expressionEvaluator(e2, ctx)
-        eval1.code + eval2.code +
-        s"""
-          boolean $nullTerm = false;
-          ${primitiveForType(e1.dataType)} $primitiveTerm = 0;
-          if (${eval1.nullTerm} || ${eval2.nullTerm} || ${eval2.primitiveTerm} == 0) {
-            $nullTerm = true;
-          } else {
-            $primitiveTerm = ${eval1.primitiveTerm} % ${eval2.primitiveTerm};
-          }
-         """
-
-      case IsNotNull(e) =>
-        val eval = expressionEvaluator(e, ctx)
-        s"""
-          ${eval.code}
-          boolean $nullTerm = false;
-          boolean $primitiveTerm = !${eval.nullTerm};
-        """
-
-      case IsNull(e) =>
-        val eval = expressionEvaluator(e, ctx)
-        s"""
-          ${eval.code}
-          boolean $nullTerm = false;
-          boolean $primitiveTerm = ${eval.nullTerm};
-        """
-
-      case e @ Coalesce(children) =>
-        s"""
-          boolean $nullTerm = true;
-          ${primitiveForType(e.dataType)} $primitiveTerm = ${defaultPrimitive(e.dataType)};
-        """ +
-        children.map { c =>
-          val eval = expressionEvaluator(c, ctx)
-          s"""
-            if($nullTerm) {
-              ${eval.code}
-              if(!${eval.nullTerm}) {
-                $nullTerm = false;
-                $primitiveTerm = ${eval.primitiveTerm};
-              }
-            }
-          """
-        }.mkString("\n")
-
-      case e @ expressions.If(condition, trueValue, falseValue) =>
-        val condEval = expressionEvaluator(condition, ctx)
-        val trueEval = expressionEvaluator(trueValue, ctx)
-        val falseEval = expressionEvaluator(falseValue, ctx)
-
-        s"""
-          boolean $nullTerm = false;
-          ${primitiveForType(e.dataType)} $primitiveTerm = ${defaultPrimitive(e.dataType)};
-          ${condEval.code}
-          if(!${condEval.nullTerm} && ${condEval.primitiveTerm}) {
-            ${trueEval.code}
-            $nullTerm = ${trueEval.nullTerm};
-            $primitiveTerm = ${trueEval.primitiveTerm};
-          } else {
-            ${falseEval.code}
-            $nullTerm = ${falseEval.nullTerm};
-            $primitiveTerm = ${falseEval.primitiveTerm};
-          }
-        """
-
-      case NewSet(elementType) =>
-        s"""
-          boolean $nullTerm = false;
-          ${hashSetForType(elementType)} $primitiveTerm = new ${hashSetForType(elementType)}();
-        """
-
-      case AddItemToSet(item, set) =>
-        val itemEval = expressionEvaluator(item, ctx)
-        val setEval = expressionEvaluator(set, ctx)
-
-        val elementType = set.dataType.asInstanceOf[OpenHashSetUDT].elementType
-        val htype = hashSetForType(elementType)
-
-        itemEval.code + setEval.code +
-        s"""
-           if (!${itemEval.nullTerm} && !${setEval.nullTerm}) {
-             (($htype)${setEval.primitiveTerm}).add(${itemEval.primitiveTerm});
-           }
-           boolean $nullTerm = false;
-           ${htype} $primitiveTerm = ($htype)${setEval.primitiveTerm};
-         """
-
-      case CombineSets(left, right) =>
-        val leftEval = expressionEvaluator(left, ctx)
-        val rightEval = expressionEvaluator(right, ctx)
-
-        val elementType = left.dataType.asInstanceOf[OpenHashSetUDT].elementType
-        val htype = hashSetForType(elementType)
-
-        leftEval.code + rightEval.code +
-        s"""
-          boolean $nullTerm = false;
-          ${htype} $primitiveTerm =
-            (${htype})${leftEval.primitiveTerm};
-          $primitiveTerm.union((${htype})${rightEval.primitiveTerm});
-        """
-
-      case MaxOf(e1, e2) if !e1.dataType.isInstanceOf[DecimalType] =>
-        val eval1 = expressionEvaluator(e1, ctx)
-        val eval2 = expressionEvaluator(e2, ctx)
-
-        eval1.code + eval2.code +
-        s"""
-          boolean $nullTerm = false;
-          ${primitiveForType(e1.dataType)} $primitiveTerm = ${defaultPrimitive(e1.dataType)};
-
-          if (${eval1.nullTerm}) {
-            $nullTerm = ${eval2.nullTerm};
-            $primitiveTerm = ${eval2.primitiveTerm};
-          } else if (${eval2.nullTerm}) {
-            $nullTerm = ${eval1.nullTerm};
-            $primitiveTerm = ${eval1.primitiveTerm};
-          } else {
-            if (${eval1.primitiveTerm} > ${eval2.primitiveTerm}) {
-              $primitiveTerm = ${eval1.primitiveTerm};
-            } else {
-              $primitiveTerm = ${eval2.primitiveTerm};
-            }
-          }
-        """
-
-      case MinOf(e1, e2) if !e1.dataType.isInstanceOf[DecimalType] =>
-        val eval1 = expressionEvaluator(e1, ctx)
-        val eval2 = expressionEvaluator(e2, ctx)
-
-        eval1.code + eval2.code +
-        s"""
-          boolean $nullTerm = false;
-          ${primitiveForType(e1.dataType)} $primitiveTerm = ${defaultPrimitive(e1.dataType)};
-
-          if (${eval1.nullTerm}) {
-            $nullTerm = ${eval2.nullTerm};
-            $primitiveTerm = ${eval2.primitiveTerm};
-          } else if (${eval2.nullTerm}) {
-            $nullTerm = ${eval1.nullTerm};
-            $primitiveTerm = ${eval1.primitiveTerm};
-          } else {
-            if (${eval1.primitiveTerm} < ${eval2.primitiveTerm}) {
-              $primitiveTerm = ${eval1.primitiveTerm};
-            } else {
-              $primitiveTerm = ${eval2.primitiveTerm};
-            }
-          }
-        """
-
-      case UnscaledValue(child) =>
-        val childEval = expressionEvaluator(child, ctx)
-
-        childEval.code +
-        s"""
-         boolean $nullTerm = ${childEval.nullTerm};
-         long $primitiveTerm = $nullTerm ? -1 : ${childEval.primitiveTerm}.toUnscaledLong();
-         """
-
-      case MakeDecimal(child, precision, scale) =>
-        val eval = expressionEvaluator(child, ctx)
-
-        eval.code +
-        s"""
-         boolean $nullTerm = ${eval.nullTerm};
-         org.apache.spark.sql.types.Decimal $primitiveTerm = ${defaultPrimitive(DecimalType())};
-
-         if (!$nullTerm) {
-           $primitiveTerm = new org.apache.spark.sql.types.Decimal();
-           $primitiveTerm = $primitiveTerm.setOrNull(${eval.primitiveTerm}, $precision, $scale);
-           $nullTerm = $primitiveTerm == null;
-         }
-         """
-    }
-
-    // If there was no match in the partial function above, we fall back on calling the interpreted
-    // expression evaluator.
-    val code: String =
-      primitiveEvaluation.lift.apply(e).getOrElse {
-        logError(s"No rules to generate $e")
-        ctx.references += e
-        s"""
-          /* expression: ${e} */
-          Object $objectTerm = expressions[${ctx.references.size - 1}].eval(i);
-          boolean $nullTerm = $objectTerm == null;
-          ${primitiveForType(e.dataType)} $primitiveTerm = ${defaultPrimitive(e.dataType)};
-          if (!$nullTerm) $primitiveTerm = (${termForType(e.dataType)})$objectTerm;
-         """
-      }
-
-    EvaluatedExpression(code, nullTerm, primitiveTerm, objectTerm)
-  }
-
-  protected def getColumn(inputRow: String, dataType: DataType, ordinal: Int) = {
-    dataType match {
-      case StringType => s"(${stringType})$inputRow.apply($ordinal)"
-      case dt: DataType if isNativeType(dt) => s"$inputRow.${accessorForType(dt)}($ordinal)"
-      case _ => s"(${termForType(dataType)})$inputRow.apply($ordinal)"
-    }
-  }
-
-  protected def setColumn(
-      destinationRow: String,
-      dataType: DataType,
-      ordinal: Int,
-      value: String): String = {
-    dataType match {
-      case StringType => s"$destinationRow.update($ordinal, $value)"
-      case dt: DataType if isNativeType(dt) =>
-        s"$destinationRow.${mutatorForType(dt)}($ordinal, $value)"
-      case _ => s"$destinationRow.update($ordinal, $value)"
-    }
-  }
-
-  protected def accessorForType(dt: DataType) = dt match {
-    case IntegerType => "getInt"
-    case other => s"get${termForType(dt)}"
-  }
-
-  protected def mutatorForType(dt: DataType) = dt match {
-    case IntegerType => "setInt"
-    case other => s"set${termForType(dt)}"
-  }
-
-  protected def hashSetForType(dt: DataType): String = dt match {
-    case IntegerType => classOf[IntegerHashSet].getName
-    case LongType => classOf[LongHashSet].getName
-    case unsupportedType =>
-      sys.error(s"Code generation not support for hashset of type $unsupportedType")
+    new CodeGenContext
   }
-
-  protected def primitiveForType(dt: DataType): String = dt match {
-    case IntegerType => "int"
-    case LongType => "long"
-    case ShortType => "short"
-    case ByteType => "byte"
-    case DoubleType => "double"
-    case FloatType => "float"
-    case BooleanType => "boolean"
-    case dt: DecimalType => decimalType
-    case BinaryType => "byte[]"
-    case StringType => stringType
-    case DateType => "int"
-    case TimestampType => "java.sql.Timestamp"
-    case _ => "Object"
-  }
-
-  protected def defaultPrimitive(dt: DataType): String = dt match {
-    case BooleanType => "false"
-    case FloatType => "-1.0f"
-    case ShortType => "-1"
-    case LongType => "-1"
-    case ByteType => "-1"
-    case DoubleType => "-1.0"
-    case IntegerType => "-1"
-    case DateType => "-1"
-    case dt: DecimalType => "null"
-    case StringType => "null"
-    case _ => "null"
-  }
-
-  protected def termForType(dt: DataType): String = dt match {
-    case IntegerType => "Integer"
-    case LongType => "Long"
-    case ShortType => "Short"
-    case ByteType => "Byte"
-    case DoubleType => "Double"
-    case FloatType => "Float"
-    case BooleanType => "Boolean"
-    case dt: DecimalType => decimalType
-    case BinaryType => "byte[]"
-    case StringType => stringType
-    case DateType => "Integer"
-    case TimestampType => "java.sql.Timestamp"
-    case _ => "Object"
-  }
-
-  /**
-   * List of data types that have special accessors and setters in [[Row]].
-   */
-  protected val nativeTypes =
-    Seq(IntegerType, BooleanType, LongType, DoubleType, FloatType, ShortType, ByteType)
-
-  /**
-   * Returns true if the data type has a special accessor and setter in [[Row]].
-   */
-  protected def isNativeType(dt: DataType) = nativeTypes.contains(dt)
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index 638b53fe0fe2f..e5ee2accd8a84 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -37,13 +37,13 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
   protected def create(expressions: Seq[Expression]): (() => MutableProjection) = {
     val ctx = newCodeGenContext()
     val projectionCode = expressions.zipWithIndex.map { case (e, i) =>
-      val evaluationCode = expressionEvaluator(e, ctx)
+      val evaluationCode = e.gen(ctx)
       evaluationCode.code +
         s"""
-          if(${evaluationCode.nullTerm})
+          if(${evaluationCode.isNull})
             mutableRow.setNullAt($i);
           else
-            ${setColumn("mutableRow", e.dataType, i, evaluationCode.primitiveTerm)};
+            mutableRow.${ctx.setColumn(e.dataType, i, evaluationCode.primitive)};
         """
     }.mkString("\n")
     val code = s"""
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index 0ff840dab393c..36e155d164a40 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -52,15 +52,15 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
     val ctx = newCodeGenContext()
 
     val comparisons = ordering.zipWithIndex.map { case (order, i) =>
-      val evalA = expressionEvaluator(order.child, ctx)
-      val evalB = expressionEvaluator(order.child, ctx)
+      val evalA = order.child.gen(ctx)
+      val evalB = order.child.gen(ctx)
       val asc = order.direction == Ascending
       val compare = order.child.dataType match {
         case BinaryType =>
           s"""
             {
-              byte[] x = ${if (asc) evalA.primitiveTerm else evalB.primitiveTerm};
-              byte[] y = ${if (!asc) evalB.primitiveTerm else evalA.primitiveTerm};
+              byte[] x = ${if (asc) evalA.primitive else evalB.primitive};
+              byte[] y = ${if (!asc) evalB.primitive else evalA.primitive};
               int j = 0;
               while (j < x.length && j < y.length) {
                 if (x[j] != y[j]) return x[j] - y[j];
@@ -73,8 +73,8 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
             }"""
         case _: NumericType =>
           s"""
-            if (${evalA.primitiveTerm} != ${evalB.primitiveTerm}) {
-              if (${evalA.primitiveTerm} > ${evalB.primitiveTerm}) {
+            if (${evalA.primitive} != ${evalB.primitive}) {
+              if (${evalA.primitive} > ${evalB.primitive}) {
                 return ${if (asc) "1" else "-1"};
               } else {
                 return ${if (asc) "-1" else "1"};
@@ -82,7 +82,7 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
             }"""
         case _ =>
           s"""
-            int comp = ${evalA.primitiveTerm}.compare(${evalB.primitiveTerm});
+            int comp = ${evalA.primitive}.compare(${evalB.primitive});
             if (comp != 0) {
               return ${if (asc) "comp" else "-comp"};
             }"""
@@ -93,11 +93,11 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
           ${evalA.code}
           i = $b;
           ${evalB.code}
-          if (${evalA.nullTerm} && ${evalB.nullTerm}) {
+          if (${evalA.isNull} && ${evalB.isNull}) {
             // Nothing
-          } else if (${evalA.nullTerm}) {
+          } else if (${evalA.isNull}) {
             return ${if (order.direction == Ascending) "-1" else "1"};
-          } else if (${evalB.nullTerm}) {
+          } else if (${evalB.isNull}) {
             return ${if (order.direction == Ascending) "1" else "-1"};
           } else {
             $compare
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
index fb18769f00da3..4a547b5ce9543 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
@@ -38,7 +38,7 @@ object GeneratePredicate extends CodeGenerator[Expression, (Row) => Boolean] {
 
   protected def create(predicate: Expression): ((Row) => Boolean) = {
     val ctx = newCodeGenContext()
-    val eval = expressionEvaluator(predicate, ctx)
+    val eval = predicate.gen(ctx)
     val code = s"""
       import org.apache.spark.sql.Row;
 
@@ -55,7 +55,7 @@ object GeneratePredicate extends CodeGenerator[Expression, (Row) => Boolean] {
         @Override
         public boolean eval(Row i) {
           ${eval.code}
-          return !${eval.nullTerm} && ${eval.primitiveTerm};
+          return !${eval.isNull} && ${eval.primitive};
         }
       }"""
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index d5be1fc12e0f0..7caf4aaab88bb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -45,19 +45,19 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
     val ctx = newCodeGenContext()
     val columns = expressions.zipWithIndex.map {
       case (e, i) =>
-        s"private ${primitiveForType(e.dataType)} c$i = ${defaultPrimitive(e.dataType)};\n"
+        s"private ${ctx.javaType(e.dataType)} c$i = ${ctx.defaultValue(e.dataType)};\n"
     }.mkString("\n      ")
 
     val initColumns = expressions.zipWithIndex.map {
       case (e, i) =>
-        val eval = expressionEvaluator(e, ctx)
+        val eval = e.gen(ctx)
         s"""
         {
           // column$i
           ${eval.code}
-          nullBits[$i] = ${eval.nullTerm};
-          if(!${eval.nullTerm}) {
-            c$i = ${eval.primitiveTerm};
+          nullBits[$i] = ${eval.isNull};
+          if (!${eval.isNull}) {
+            c$i = ${eval.primitive};
           }
         }
         """
@@ -68,10 +68,10 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
     }.mkString("\n        ")
 
     val updateCases = expressions.zipWithIndex.map { case (e, i) =>
-      s"case $i: { c$i = (${termForType(e.dataType)})value; return;}"
+      s"case $i: { c$i = (${ctx.boxedType(e.dataType)})value; return;}"
     }.mkString("\n        ")
 
-    val specificAccessorFunctions = nativeTypes.map { dataType =>
+    val specificAccessorFunctions = ctx.nativeTypes.map { dataType =>
       val cases = expressions.zipWithIndex.map {
         case (e, i) if e.dataType == dataType =>
           s"case $i: return c$i;"
@@ -80,21 +80,21 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
       if (cases.count(_ != '\n') > 0) {
         s"""
       @Override
-      public ${primitiveForType(dataType)} ${accessorForType(dataType)}(int i) {
+      public ${ctx.javaType(dataType)} ${ctx.accessorForType(dataType)}(int i) {
         if (isNullAt(i)) {
-          return ${defaultPrimitive(dataType)};
+          return ${ctx.defaultValue(dataType)};
         }
         switch (i) {
         $cases
         }
-        return ${defaultPrimitive(dataType)};
+        return ${ctx.defaultValue(dataType)};
       }"""
       } else {
         ""
       }
     }.mkString("\n")
 
-    val specificMutatorFunctions = nativeTypes.map { dataType =>
+    val specificMutatorFunctions = ctx.nativeTypes.map { dataType =>
       val cases = expressions.zipWithIndex.map {
         case (e, i) if e.dataType == dataType =>
           s"case $i: { c$i = value; return; }"
@@ -103,7 +103,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
       if (cases.count(_ != '\n') > 0) {
         s"""
       @Override
-      public void ${mutatorForType(dataType)}(int i, ${primitiveForType(dataType)} value) {
+      public void ${ctx.mutatorForType(dataType)}(int i, ${ctx.javaType(dataType)} value) {
         nullBits[i] = false;
         switch (i) {
         $cases
@@ -122,7 +122,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
         case LongType => s"$col ^ ($col >>> 32)"
         case FloatType => s"Float.floatToIntBits($col)"
         case DoubleType =>
-          s"Double.doubleToLongBits($col) ^ (Double.doubleToLongBits($col) >>> 32)"
+            s"(int)(Double.doubleToLongBits($col) ^ (Double.doubleToLongBits($col) >>> 32))"
         case _ => s"$col.hashCode()"
       }
       s"isNullAt($i) ? 0 : ($nonNull)"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
index 7f1b12cdd5800..6f9589d20445e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
@@ -27,6 +27,9 @@ import org.apache.spark.util.Utils
  */
 package object codegen {
 
+  type Term = String
+  type Code = String
+
   /** Canonicalizes an expression so those that differ only by names can reuse the same code. */
   object ExpressionCanonicalizer extends rules.RuleExecutor[Expression] {
     val batches =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
index 65ba18924afe1..ddfadf314f838 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
 import org.apache.spark.sql.types._
 
 /** Return the unscaled Long value of a Decimal, assuming it fits in a Long */
@@ -35,6 +36,10 @@ case class UnscaledValue(child: Expression) extends UnaryExpression {
       childResult.asInstanceOf[Decimal].toUnscaledLong
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, c => s"$c.toUnscaledLong()")
+  }
 }
 
 /** Create a Decimal from an unscaled Long value */
@@ -53,4 +58,18 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un
       new Decimal().setOrNull(childResult.asInstanceOf[Long], precision, scale)
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval = child.gen(ctx)
+    eval.code + s"""
+      boolean ${ev.isNull} = ${eval.isNull};
+      ${ctx.decimalType} ${ev.primitive} = null;
+
+      if (!${ev.isNull}) {
+        ${ev.primitive} = (new ${ctx.decimalType}()).setOrNull(
+          ${eval.primitive}, $precision, $scale);
+        ${ev.isNull} = ${ev.primitive} == null;
+      }
+      """
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index d3ca3d9a4b18b..3a9271678bc9c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import java.sql.{Date, Timestamp}
 
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.expressions.codegen.{Code, CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 
@@ -78,7 +79,60 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
 
   override def toString: String = if (value != null) value.toString else "null"
 
+  override def equals(other: Any): Boolean = other match {
+    case o: Literal =>
+      dataType.equals(o.dataType) &&
+        (value == null && null == o.value || value != null && value.equals(o.value))
+    case _ => false
+  }
+
   override def eval(input: Row): Any = value
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    // change the isNull and primitive to consts, to inline them
+    if (value == null) {
+      ev.isNull = "true"
+      ev.primitive = ctx.defaultValue(dataType)
+      ""
+    } else {
+      dataType match {
+        case BooleanType =>
+          ev.isNull = "false"
+          ev.primitive = value.toString
+          ""
+        case FloatType =>  // This must go before NumericType
+          val v = value.asInstanceOf[Float]
+          if (v.isNaN || v.isInfinite) {
+            super.genCode(ctx, ev)
+          } else {
+            ev.isNull = "false"
+            ev.primitive = s"${value}f"
+            ""
+          }
+        case DoubleType =>  // This must go before NumericType
+          val v = value.asInstanceOf[Double]
+          if (v.isNaN || v.isInfinite) {
+            super.genCode(ctx, ev)
+          } else {
+            ev.isNull = "false"
+            ev.primitive = s"${value}"
+            ""
+          }
+
+        case ByteType | ShortType =>  // This must go before NumericType
+          ev.isNull = "false"
+          ev.primitive = s"(${ctx.javaType(dataType)})$value"
+          ""
+        case dt: NumericType if !dt.isInstanceOf[DecimalType] =>
+          ev.isNull = "false"
+          ev.primitive = value.toString
+          ""
+        // eval() version may be faster for non-primitive types
+        case other =>
+          super.genCode(ctx, ev)
+      }
+    }
+  }
 }
 
 // TODO: Specialize
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
index db853a2b97fad..88211acd7713c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions.mathfuncs
 
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, BinaryExpression, Expression, Row}
 import org.apache.spark.sql.types._
 
@@ -49,6 +50,10 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
       }
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.${name.toLowerCase}($c1, $c2)")
+  }
 }
 
 case class Atan2(left: Expression, right: Expression)
@@ -70,9 +75,26 @@ case class Atan2(left: Expression, right: Expression)
       }
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.atan2($c1 + 0.0, $c2 + 0.0)") + s"""
+      if (Double.valueOf(${ev.primitive}).isNaN()) {
+        ${ev.isNull} = true;
+      }
+      """
+  }
 }
 
 case class Hypot(left: Expression, right: Expression)
   extends BinaryMathExpression(math.hypot, "HYPOT")
 
-case class Pow(left: Expression, right: Expression) extends BinaryMathExpression(math.pow, "POWER")
+case class Pow(left: Expression, right: Expression)
+  extends BinaryMathExpression(math.pow, "POWER") {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.pow($c1, $c2)") + s"""
+      if (Double.valueOf(${ev.primitive}).isNaN()) {
+        ${ev.isNull} = true;
+      }
+      """
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
index 41b422346a02d..5563cd94bf86d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions.mathfuncs
 
+import org.apache.spark.sql.catalyst.expressions.codegen.{Code, CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, Row, UnaryExpression}
 import org.apache.spark.sql.types._
 
@@ -44,6 +45,23 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
       if (result.isNaN) null else result
     }
   }
+
+  // name of function in java.lang.Math
+  def funcName: String = name.toLowerCase
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval = child.gen(ctx)
+    eval.code + s"""
+      boolean ${ev.isNull} = ${eval.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${ev.primitive} = java.lang.Math.${funcName}(${eval.primitive});
+        if (Double.valueOf(${ev.primitive}).isNaN()) {
+          ${ev.isNull} = true;
+        }
+      }
+    """
+  }
 }
 
 case class Acos(child: Expression) extends UnaryMathExpression(math.acos, "ACOS")
@@ -72,7 +90,9 @@ case class Log10(child: Expression) extends UnaryMathExpression(math.log10, "LOG
 
 case class Log1p(child: Expression) extends UnaryMathExpression(math.log1p, "LOG1P")
 
-case class Rint(child: Expression) extends UnaryMathExpression(math.rint, "ROUND")
+case class Rint(child: Expression) extends UnaryMathExpression(math.rint, "ROUND") {
+  override def funcName: String = "rint"
+}
 
 case class Signum(child: Expression) extends UnaryMathExpression(math.signum, "SIGNUM")
 
@@ -84,6 +104,10 @@ case class Tan(child: Expression) extends UnaryMathExpression(math.tan, "TAN")
 
 case class Tanh(child: Expression) extends UnaryMathExpression(math.tanh, "TANH")
 
-case class ToDegrees(child: Expression) extends UnaryMathExpression(math.toDegrees, "DEGREES")
+case class ToDegrees(child: Expression) extends UnaryMathExpression(math.toDegrees, "DEGREES") {
+  override def funcName: String = "toDegrees"
+}
 
-case class ToRadians(child: Expression) extends UnaryMathExpression(math.toRadians, "RADIANS")
+case class ToRadians(child: Expression) extends UnaryMathExpression(math.toRadians, "RADIANS") {
+  override def funcName: String = "toRadians"
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 00565ec651a59..2e4b9ba678433 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.trees.LeafNode
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.types._
 
 object NamedExpression {
@@ -116,6 +116,8 @@ case class Alias(child: Expression, name: String)(
 
   override def eval(input: Row): Any = child.eval(input)
 
+  override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
+
   override def dataType: DataType = child.dataType
   override def nullable: Boolean = child.nullable
   override def metadata: Metadata = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index 5070570b4740d..9ecfb3ccc262f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.types.DataType
@@ -51,6 +52,25 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
     }
     result
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    s"""
+      boolean ${ev.isNull} = true;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+    """ +
+    children.map { e =>
+      val eval = e.gen(ctx)
+      s"""
+        if (${ev.isNull}) {
+          ${eval.code}
+          if (!${eval.isNull}) {
+            ${ev.isNull} = false;
+            ${ev.primitive} = ${eval.primitive};
+          }
+        }
+      """
+    }.mkString("\n")
+  }
 }
 
 case class IsNull(child: Expression) extends Predicate with trees.UnaryNode[Expression] {
@@ -61,6 +81,13 @@ case class IsNull(child: Expression) extends Predicate with trees.UnaryNode[Expr
     child.eval(input) == null
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval = child.gen(ctx)
+    ev.isNull = "false"
+    ev.primitive = eval.isNull
+    eval.code
+  }
+
   override def toString: String = s"IS NULL $child"
 }
 
@@ -72,6 +99,13 @@ case class IsNotNull(child: Expression) extends Predicate with trees.UnaryNode[E
   override def eval(input: Row): Any = {
     child.eval(input) != null
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval = child.gen(ctx)
+    ev.isNull = "false"
+    ev.primitive = s"(!(${eval.isNull}))"
+    eval.code
+  }
 }
 
 /**
@@ -95,4 +129,25 @@ case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate
     }
     numNonNulls >= n
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val nonnull = ctx.freshName("nonnull")
+    val code = children.map { e =>
+      val eval = e.gen(ctx)
+      s"""
+        if ($nonnull < $n) {
+          ${eval.code}
+          if (!${eval.isNull}) {
+            $nonnull += 1;
+          }
+        }
+      """
+    }.mkString("\n")
+    s"""
+      int $nonnull = 0;
+      $code
+      boolean ${ev.isNull} = false;
+      boolean ${ev.primitive} = $nonnull >= $n;
+     """
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 58273b166fe91..1d0f19a400d63 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -18,9 +18,10 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.util.TypeUtils
-import org.apache.spark.sql.types.{BinaryType, BooleanType, DataType}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.types._
 
 object InterpretedPredicate {
   def create(expression: Expression, inputSchema: Seq[Attribute]): (Row => Boolean) =
@@ -82,6 +83,10 @@ case class Not(child: Expression) extends UnaryExpression with Predicate with Ex
       case b: Boolean => !b
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, c => s"!($c)")
+  }
 }
 
 /**
@@ -141,6 +146,29 @@ case class And(left: Expression, right: Expression)
       }
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval1 = left.gen(ctx)
+    val eval2 = right.gen(ctx)
+
+    // The result should be `false`, if any of them is `false` whenever the other is null or not.
+    s"""
+      ${eval1.code}
+      boolean ${ev.isNull} = false;
+      boolean ${ev.primitive}  = false;
+
+      if (!${eval1.isNull} && !${eval1.primitive}) {
+      } else {
+        ${eval2.code}
+        if (!${eval2.isNull} && !${eval2.primitive}) {
+        } else if (!${eval1.isNull} && !${eval2.isNull}) {
+          ${ev.primitive} = true;
+        } else {
+          ${ev.isNull} = true;
+        }
+      }
+     """
+  }
 }
 
 case class Or(left: Expression, right: Expression)
@@ -167,6 +195,29 @@ case class Or(left: Expression, right: Expression)
       }
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval1 = left.gen(ctx)
+    val eval2 = right.gen(ctx)
+
+    // The result should be `true`, if any of them is `true` whenever the other is null or not.
+    s"""
+      ${eval1.code}
+      boolean ${ev.isNull} = false;
+      boolean ${ev.primitive} = true;
+
+      if (!${eval1.isNull} && ${eval1.primitive}) {
+      } else {
+        ${eval2.code}
+        if (!${eval2.isNull} && ${eval2.primitive}) {
+        } else if (!${eval1.isNull} && !${eval2.isNull}) {
+          ${ev.primitive} = false;
+        } else {
+          ${ev.isNull} = true;
+        }
+      }
+     """
+  }
 }
 
 abstract class BinaryComparison extends BinaryExpression with Predicate {
@@ -198,6 +249,20 @@ abstract class BinaryComparison extends BinaryExpression with Predicate {
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    left.dataType match {
+      case dt: NumericType if ctx.isNativeType(dt) => defineCodeGen (ctx, ev, {
+        (c1, c3) => s"$c1 $symbol $c3"
+      })
+      case TimestampType =>
+        // java.sql.Timestamp does not have compare()
+        super.genCode(ctx, ev)
+      case other => defineCodeGen (ctx, ev, {
+        (c1, c2) => s"$c1.compare($c2) $symbol 0"
+      })
+    }
+  }
+
   protected def evalInternal(evalE1: Any, evalE2: Any): Any =
     sys.error(s"BinaryComparisons must override either eval or evalInternal")
 }
@@ -215,6 +280,9 @@ case class EqualTo(left: Expression, right: Expression) extends BinaryComparison
     if (left.dataType != BinaryType) l == r
     else java.util.Arrays.equals(l.asInstanceOf[Array[Byte]], r.asInstanceOf[Array[Byte]])
   }
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, ctx.equalFunc(left.dataType))
+  }
 }
 
 case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComparison {
@@ -235,6 +303,17 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
       l == r
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val eval1 = left.gen(ctx)
+    val eval2 = right.gen(ctx)
+    val equalCode = ctx.equalFunc(left.dataType)(eval1.primitive, eval2.primitive)
+    ev.isNull = "false"
+    eval1.code + eval2.code + s"""
+        boolean ${ev.primitive} = (${eval1.isNull} && ${eval2.isNull}) ||
+           (!${eval1.isNull} && $equalCode);
+      """
+  }
 }
 
 case class LessThan(left: Expression, right: Expression) extends BinaryComparison {
@@ -309,6 +388,27 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val condEval = predicate.gen(ctx)
+    val trueEval = trueValue.gen(ctx)
+    val falseEval = falseValue.gen(ctx)
+
+    s"""
+      ${condEval.code}
+      boolean ${ev.isNull} = false;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${condEval.isNull} && ${condEval.primitive}) {
+        ${trueEval.code}
+        ${ev.isNull} = ${trueEval.isNull};
+        ${ev.primitive} = ${trueEval.primitive};
+      } else {
+        ${falseEval.code}
+        ${ev.isNull} = ${falseEval.isNull};
+        ${ev.primitive} = ${falseEval.primitive};
+      }
+    """
+  }
+
   override def toString: String = s"if ($predicate) $trueValue else $falseValue"
 }
 
@@ -393,6 +493,48 @@ case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike {
     return res
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val len = branchesArr.length
+    val got = ctx.freshName("got")
+
+    val cases = (0 until len/2).map { i =>
+      val cond = branchesArr(i * 2).gen(ctx)
+      val res = branchesArr(i * 2 + 1).gen(ctx)
+      s"""
+        if (!$got) {
+          ${cond.code}
+          if (!${cond.isNull} && ${cond.primitive}) {
+            $got = true;
+            ${res.code}
+            ${ev.isNull} = ${res.isNull};
+            ${ev.primitive} = ${res.primitive};
+          }
+        }
+      """
+    }.mkString("\n")
+
+    val other = if (len % 2 == 1) {
+      val res = branchesArr(len - 1).gen(ctx)
+      s"""
+        if (!$got) {
+          ${res.code}
+          ${ev.isNull} = ${res.isNull};
+          ${ev.primitive} = ${res.primitive};
+        }
+      """
+    } else {
+      ""
+    }
+
+    s"""
+      boolean $got = false;
+      boolean ${ev.isNull} = true;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      $cases
+      $other
+    """
+  }
+
   override def toString: String = {
     "CASE" + branches.sliding(2, 2).map {
       case Seq(cond, value) => s" WHEN $cond THEN $value"
@@ -444,6 +586,52 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
     return res
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val keyEval = key.gen(ctx)
+    val len = branchesArr.length
+    val got = ctx.freshName("got")
+
+    val cases = (0 until len/2).map { i =>
+      val cond = branchesArr(i * 2).gen(ctx)
+      val res = branchesArr(i * 2 + 1).gen(ctx)
+      s"""
+        if (!$got) {
+          ${cond.code}
+          if (${keyEval.isNull} && ${cond.isNull} ||
+            !${keyEval.isNull} && !${cond.isNull}
+             && ${ctx.equalFunc(key.dataType)(keyEval.primitive, cond.primitive)}) {
+            $got = true;
+            ${res.code}
+            ${ev.isNull} = ${res.isNull};
+            ${ev.primitive} = ${res.primitive};
+          }
+        }
+      """
+    }.mkString("\n")
+
+    val other = if (len % 2 == 1) {
+      val res = branchesArr(len - 1).gen(ctx)
+      s"""
+        if (!$got) {
+          ${res.code}
+          ${ev.isNull} = ${res.isNull};
+          ${ev.primitive} = ${res.primitive};
+        }
+      """
+    } else {
+      ""
+    }
+
+    s"""
+      boolean $got = false;
+      boolean ${ev.isNull} = true;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      ${keyEval.code}
+      $cases
+      $other
+    """
+  }
+
   private def equalNullSafe(l: Any, r: Any) = {
     if (l == null && r == null) {
       true
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
index b65bf165f21db..b39349b988389 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
@@ -60,6 +61,17 @@ case class NewSet(elementType: DataType) extends LeafExpression {
     new OpenHashSet[Any]()
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    elementType match {
+      case IntegerType | LongType =>
+        ev.isNull = "false"
+        s"""
+          ${ctx.javaType(dataType)} ${ev.primitive} = new ${ctx.javaType(dataType)}();
+        """
+      case _ => super.genCode(ctx, ev)
+    }
+  }
+
   override def toString: String = s"new Set($dataType)"
 }
 
@@ -91,6 +103,25 @@ case class AddItemToSet(item: Expression, set: Expression) extends Expression {
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val elementType = set.dataType.asInstanceOf[OpenHashSetUDT].elementType
+    elementType match {
+      case IntegerType | LongType =>
+        val itemEval = item.gen(ctx)
+        val setEval = set.gen(ctx)
+        val htype = ctx.javaType(dataType)
+
+        ev.isNull = "false"
+        ev.primitive = setEval.primitive
+        itemEval.code + setEval.code +  s"""
+          if (!${itemEval.isNull} && !${setEval.isNull}) {
+           (($htype)${setEval.primitive}).add(${itemEval.primitive});
+          }
+         """
+      case _ => super.genCode(ctx, ev)
+    }
+  }
+
   override def toString: String = s"$set += $item"
 }
 
@@ -116,14 +147,31 @@ case class CombineSets(left: Expression, right: Expression) extends BinaryExpres
           val rightValue = iterator.next()
           leftEval.add(rightValue)
         }
-        leftEval
-      } else {
-        null
       }
+      leftEval
     } else {
       null
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val elementType = left.dataType.asInstanceOf[OpenHashSetUDT].elementType
+    elementType match {
+      case IntegerType | LongType =>
+        val leftEval = left.gen(ctx)
+        val rightEval = right.gen(ctx)
+        val htype = ctx.javaType(dataType)
+
+        ev.isNull = leftEval.isNull
+        ev.primitive = leftEval.primitive
+        leftEval.code + rightEval.code + s"""
+          if (!${leftEval.isNull} && !${rightEval.isNull}) {
+            ${leftEval.primitive}.union((${htype})${rightEval.primitive});
+          }
+        """
+      case _ => super.genCode(ctx, ev)
+    }
+  }
 }
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index c4ef9c30907f1..78adb509b470b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import java.util.regex.Pattern
 
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 
 trait StringRegexExpression extends ExpectsInputTypes {
@@ -137,6 +138,10 @@ case class Upper(child: Expression) extends UnaryExpression with CaseConversionE
   override def convert(v: UTF8String): UTF8String = v.toUpperCase()
 
   override def toString: String = s"Upper($child)"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, c => s"($c).toUpperCase()")
+  }
 }
 
 /**
@@ -147,6 +152,10 @@ case class Lower(child: Expression) extends UnaryExpression with CaseConversionE
   override def convert(v: UTF8String): UTF8String = v.toLowerCase()
 
   override def toString: String = s"Lower($child)"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, c => s"($c).toLowerCase()")
+  }
 }
 
 /** A base trait for functions that compare two strings, returning a boolean. */
@@ -181,6 +190,9 @@ trait StringComparison extends ExpectsInputTypes {
 case class Contains(left: Expression, right: Expression)
     extends BinaryExpression with Predicate with StringComparison {
   override def compare(l: UTF8String, r: UTF8String): Boolean = l.contains(r)
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"($c1).contains($c2)")
+  }
 }
 
 /**
@@ -189,6 +201,9 @@ case class Contains(left: Expression, right: Expression)
 case class StartsWith(left: Expression, right: Expression)
     extends BinaryExpression with Predicate with StringComparison {
   override def compare(l: UTF8String, r: UTF8String): Boolean = l.startsWith(r)
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"($c1).startsWith($c2)")
+  }
 }
 
 /**
@@ -197,6 +212,9 @@ case class StartsWith(left: Expression, right: Expression)
 case class EndsWith(left: Expression, right: Expression)
     extends BinaryExpression with Predicate with StringComparison {
   override def compare(l: UTF8String, r: UTF8String): Boolean = l.endsWith(r)
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"($c1).endsWith($c2)")
+  }
 }
 
 /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index 5df528770ca6e..eea2edc323eea 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -28,6 +28,7 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
 import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateProjection, GenerateMutableProjection}
 import org.apache.spark.sql.catalyst.expressions.mathfuncs._
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
@@ -35,11 +36,20 @@ import org.apache.spark.sql.types._
 
 class ExpressionEvaluationBaseSuite extends SparkFunSuite {
 
+  def checkEvaluation(expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
+    checkEvaluationWithoutCodegen(expression, expected, inputRow)
+    checkEvaluationWithGeneratedMutableProjection(expression, expected, inputRow)
+    checkEvaluationWithGeneratedProjection(expression, expected, inputRow)
+  }
+
   def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = {
     expression.eval(inputRow)
   }
 
-  def checkEvaluation(expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
+  def checkEvaluationWithoutCodegen(
+      expression: Expression,
+      expected: Any,
+      inputRow: Row = EmptyRow): Unit = {
     val actual = try evaluate(expression, inputRow) catch {
       case e: Exception => fail(s"Exception evaluating $expression", e)
     }
@@ -49,6 +59,68 @@ class ExpressionEvaluationBaseSuite extends SparkFunSuite {
     }
   }
 
+  def checkEvaluationWithGeneratedMutableProjection(
+      expression: Expression,
+      expected: Any,
+      inputRow: Row = EmptyRow): Unit = {
+
+    val plan = try {
+      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
+    } catch {
+      case e: Throwable =>
+        val ctx = GenerateProjection.newCodeGenContext()
+        val evaluated = expression.gen(ctx)
+        fail(
+          s"""
+            |Code generation of $expression failed:
+            |${evaluated.code}
+            |$e
+          """.stripMargin)
+    }
+
+    val actual = plan(inputRow).apply(0)
+    if (actual != expected) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+    }
+  }
+
+  def checkEvaluationWithGeneratedProjection(
+      expression: Expression,
+      expected: Any,
+      inputRow: Row = EmptyRow): Unit = {
+    val ctx = GenerateProjection.newCodeGenContext()
+    lazy val evaluated = expression.gen(ctx)
+
+    val plan = try {
+      GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
+    } catch {
+      case e: Throwable =>
+        fail(
+          s"""
+            |Code generation of $expression failed:
+            |${evaluated.code}
+            |$e
+          """.stripMargin)
+    }
+
+    val actual = plan(inputRow)
+    val expectedRow = new GenericRow(Array[Any](CatalystTypeConverters.convertToCatalyst(expected)))
+    if (actual.hashCode() != expectedRow.hashCode()) {
+      fail(
+        s"""
+          |Mismatched hashCodes for values: $actual, $expectedRow
+          |Hash Codes: ${actual.hashCode()} != ${expectedRow.hashCode()}
+          |Expressions: ${expression}
+          |Code: ${evaluated}
+        """.stripMargin)
+    }
+    if (actual != expectedRow) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+    }
+  }
+
   def checkDoubleEvaluation(
       expression: Expression,
       expected: Spread[Double],
@@ -69,8 +141,16 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
   test("literals") {
     checkEvaluation(Literal(1), 1)
     checkEvaluation(Literal(true), true)
+    checkEvaluation(Literal(false), false)
     checkEvaluation(Literal(0L), 0L)
+    List(0.0, -0.0, Double.NegativeInfinity, Double.PositiveInfinity).foreach {
+      d => {
+        checkEvaluation(Literal(d), d)
+        checkEvaluation(Literal(d.toFloat), d.toFloat)
+      }
+    }
     checkEvaluation(Literal("test"), "test")
+    checkEvaluation(Literal.create(null, StringType), null)
     checkEvaluation(Literal(1) + Literal(1), 2)
   }
 
@@ -1367,6 +1447,11 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
 // TODO: Make the tests work with codegen.
 class ExpressionEvaluationWithoutCodeGenSuite extends ExpressionEvaluationBaseSuite {
 
+  override def checkEvaluation(
+      expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
+    checkEvaluationWithoutCodegen(expression, expected, inputRow)
+  }
+
   test("CreateStruct") {
     val row = Row(1, 2, 3)
     val c1 = 'a.int.at(0).as("a")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
index 8cfd853afa35f..371a73181dad7 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
@@ -21,34 +21,9 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen._
 
 /**
- * Overrides our expression evaluation tests to use code generation for evaluation.
+ * Additional tests for code generation.
  */
 class GeneratedEvaluationSuite extends ExpressionEvaluationSuite {
-  override def checkEvaluation(
-      expression: Expression,
-      expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
-    val plan = try {
-      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
-    } catch {
-      case e: Throwable =>
-        val ctx = GenerateProjection.newCodeGenContext()
-        val evaluated = GenerateProjection.expressionEvaluator(expression, ctx)
-        fail(
-          s"""
-            |Code generation of $expression failed:
-            |${evaluated.code}
-            |$e
-          """.stripMargin)
-    }
-
-    val actual = plan(inputRow).apply(0)
-    if (actual != expected) {
-      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
-    }
-  }
-
 
   test("multithreaded eval") {
     import scala.concurrent._
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala
deleted file mode 100644
index 9ab1f7d7ad0db..0000000000000
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedMutableEvaluationSuite.scala
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.expressions
-
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
-import org.apache.spark.sql.catalyst.expressions.codegen._
-
-/**
- * Overrides our expression evaluation tests to use generated code on mutable rows.
- */
-class GeneratedMutableEvaluationSuite extends ExpressionEvaluationSuite {
-  override def checkEvaluation(
-      expression: Expression,
-      expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
-    val ctx = GenerateProjection.newCodeGenContext()
-    lazy val evaluated = GenerateProjection.expressionEvaluator(expression, ctx)
-
-    val plan = try {
-      GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
-    } catch {
-      case e: Throwable =>
-        fail(
-          s"""
-            |Code generation of $expression failed:
-            |${evaluated.code}
-            |$e
-          """.stripMargin)
-    }
-
-    val actual = plan(inputRow)
-    val expectedRow = new GenericRow(Array[Any](CatalystTypeConverters.convertToCatalyst(expected)))
-    if (actual.hashCode() != expectedRow.hashCode()) {
-      fail(
-        s"""
-          |Mismatched hashCodes for values: $actual, $expectedRow
-          |Hash Codes: ${actual.hashCode()} != ${expectedRow.hashCode()}
-          |${evaluated.code}
-        """.stripMargin)
-    }
-    if (actual != expectedRow) {
-      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
-    }
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 8979a0a210a42..d9a010a9815a1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -53,7 +53,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
 
     check("10", Literal.create(10, IntegerType))
     check("1000000000000000", Literal.create(1000000000000000L, LongType))
-    check("1.5", Literal.create(1.5, FloatType))
+    check("1.5", Literal.create(1.5f, FloatType))
     check("hello", Literal.create("hello", StringType))
     check(defaultPartitionName, Literal.create(null, NullType))
   }
@@ -83,13 +83,13 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         ArrayBuffer(
           Literal.create(10, IntegerType),
           Literal.create("hello", StringType),
-          Literal.create(1.5, FloatType)))
+          Literal.create(1.5f, FloatType)))
     })
 
     check("file://path/a=10/b_hello/c=1.5", Some {
       PartitionValues(
         ArrayBuffer("c"),
-        ArrayBuffer(Literal.create(1.5, FloatType)))
+        ArrayBuffer(Literal.create(1.5f, FloatType)))
     })
 
     check("file:///", None)

From f74be744d41586690e73ec57e5551c1fbabc1d6f Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 7 Jun 2015 18:45:24 -0700
Subject: [PATCH 405/525] [SPARK-8149][SQL] Break ExpressionEvaluationSuite
 down to multiple files

Also moved a few files in expressions package around to match test suites.

Author: Reynold Xin <rxin@databricks.com>

Closes #6693 from rxin/expr-refactoring and squashes the following commits:

857599f [Reynold Xin] Fixed style violation.
c0eb74b [Reynold Xin] Fixed compilation.
b3a40f8 [Reynold Xin] Refactored expression test suites.
---
 .../spark/sql/catalyst/expressions/Cast.scala |    2 +-
 .../sql/catalyst/expressions/Expression.scala |    6 +-
 .../sql/catalyst/expressions/arithmetic.scala |   96 --
 .../sql/catalyst/expressions/bitwise.scala    |  120 ++
 .../catalyst/expressions/conditionals.scala   |  313 ++++
 .../{mathfuncs/unary.scala => math.scala}     |   99 +-
 .../expressions/mathfuncs/binary.scala        |  100 --
 .../sql/catalyst/expressions/predicates.scala |  290 ----
 .../ArithmeticExpressionSuite.scala           |  144 ++
 .../expressions/BitwiseFunctionsSuite.scala   |   80 +
 .../sql/catalyst/expressions/CastSuite.scala  |  532 ++++++
 ...nSuite.scala => CodeGenerationSuite.scala} |    3 +-
 .../expressions/ComplexTypeSuite.scala        |  122 ++
 .../ConditionalExpressionSuite.scala          |   96 ++
 .../expressions/ExpressionEvalHelper.scala    |  134 ++
 .../ExpressionEvaluationSuite.scala           | 1461 -----------------
 .../expressions/LiteralExpressionSuite.scala  |   55 +
 .../expressions/MathFunctionsSuite.scala      |  179 ++
 .../expressions/NullFunctionsSuite.scala      |   65 +
 .../catalyst/expressions/PredicateSuite.scala |  179 ++
 .../expressions/StringFunctionsSuite.scala    |  218 +++
 .../ExpressionOptimizationSuite.scala         |    3 +-
 .../org/apache/spark/sql/functions.scala      |    1 -
 23 files changed, 2340 insertions(+), 1958 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
 rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{mathfuncs/unary.scala => math.scala} (54%)
 delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
 rename sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/{GeneratedEvaluationSuite.scala => CodeGenerationSuite.scala} (94%)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
 delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 5f76a512679a4..2a1f96409daf4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -161,7 +161,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
         try Timestamp.valueOf(n) catch { case _: java.lang.IllegalArgumentException => null }
       })
     case BooleanType =>
-      buildCast[Boolean](_, b => new Timestamp((if (b) 1 else 0)))
+      buildCast[Boolean](_, b => new Timestamp(if (b) 1 else 0))
     case LongType =>
       buildCast[Long](_, l => new Timestamp(l))
     case IntegerType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 0ed576b3d5870..432d65eee54fb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -81,11 +81,11 @@ abstract class Expression extends TreeNode[Expression] {
     val objectTerm = ctx.freshName("obj")
     s"""
       /* expression: ${this} */
-      Object ${objectTerm} = expressions[${ctx.references.size - 1}].eval(i);
-      boolean ${ev.isNull} = ${objectTerm} == null;
+      Object $objectTerm = expressions[${ctx.references.size - 1}].eval(i);
+      boolean ${ev.isNull} = $objectTerm == null;
       ${ctx.javaType(this.dataType)} ${ev.primitive} = ${ctx.defaultValue(this.dataType)};
       if (!${ev.isNull}) {
-        ${ev.primitive} = (${ctx.boxedType(this.dataType)})${objectTerm};
+        ${ev.primitive} = (${ctx.boxedType(this.dataType)}) $objectTerm;
       }
     """
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 3ac7c92dcd009..d4efda2e04c29 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -322,102 +322,6 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
   }
 }
 
-/**
- * A function that calculates bitwise and(&) of two numbers.
- */
-case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithmetic {
-  override def symbol: String = "&"
-
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
-
-  private lazy val and: (Any, Any) => Any = dataType match {
-    case ByteType =>
-      ((evalE1: Byte, evalE2: Byte) => (evalE1 & evalE2).toByte).asInstanceOf[(Any, Any) => Any]
-    case ShortType =>
-      ((evalE1: Short, evalE2: Short) => (evalE1 & evalE2).toShort).asInstanceOf[(Any, Any) => Any]
-    case IntegerType =>
-      ((evalE1: Int, evalE2: Int) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any]
-    case LongType =>
-      ((evalE1: Long, evalE2: Long) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any]
-  }
-
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = and(evalE1, evalE2)
-}
-
-/**
- * A function that calculates bitwise or(|) of two numbers.
- */
-case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmetic {
-  override def symbol: String = "|"
-
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
-
-  private lazy val or: (Any, Any) => Any = dataType match {
-    case ByteType =>
-      ((evalE1: Byte, evalE2: Byte) => (evalE1 | evalE2).toByte).asInstanceOf[(Any, Any) => Any]
-    case ShortType =>
-      ((evalE1: Short, evalE2: Short) => (evalE1 | evalE2).toShort).asInstanceOf[(Any, Any) => Any]
-    case IntegerType =>
-      ((evalE1: Int, evalE2: Int) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any]
-    case LongType =>
-      ((evalE1: Long, evalE2: Long) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any]
-  }
-
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = or(evalE1, evalE2)
-}
-
-/**
- * A function that calculates bitwise xor of two numbers.
- */
-case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithmetic {
-  override def symbol: String = "^"
-
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
-
-  private lazy val xor: (Any, Any) => Any = dataType match {
-    case ByteType =>
-      ((evalE1: Byte, evalE2: Byte) => (evalE1 ^ evalE2).toByte).asInstanceOf[(Any, Any) => Any]
-    case ShortType =>
-      ((evalE1: Short, evalE2: Short) => (evalE1 ^ evalE2).toShort).asInstanceOf[(Any, Any) => Any]
-    case IntegerType =>
-      ((evalE1: Int, evalE2: Int) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any]
-    case LongType =>
-      ((evalE1: Long, evalE2: Long) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any]
-  }
-
-  protected override def evalInternal(evalE1: Any, evalE2: Any): Any = xor(evalE1, evalE2)
-}
-
-/**
- * A function that calculates bitwise not(~) of a number.
- */
-case class BitwiseNot(child: Expression) extends UnaryArithmetic {
-  override def toString: String = s"~$child"
-
-  override def checkInputDataTypes(): TypeCheckResult =
-    TypeUtils.checkForBitwiseExpr(child.dataType, "operator ~")
-
-  private lazy val not: (Any) => Any = dataType match {
-    case ByteType =>
-      ((evalE: Byte) => (~evalE).toByte).asInstanceOf[(Any) => Any]
-    case ShortType =>
-      ((evalE: Short) => (~evalE).toShort).asInstanceOf[(Any) => Any]
-    case IntegerType =>
-      ((evalE: Int) => ~evalE).asInstanceOf[(Any) => Any]
-    case LongType =>
-      ((evalE: Long) => ~evalE).asInstanceOf[(Any) => Any]
-  }
-
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
-    defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dataType)})~($c)")
-  }
-
-  protected override def evalInternal(evalE: Any) = not(evalE)
-}
-
 case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
   override def nullable: Boolean = left.nullable && right.nullable
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
new file mode 100644
index 0000000000000..ef34586261e70
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.catalyst.util.TypeUtils
+import org.apache.spark.sql.types._
+
+
+/**
+ * A function that calculates bitwise and(&) of two numbers.
+ */
+case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithmetic {
+  override def symbol: String = "&"
+
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+
+  private lazy val and: (Any, Any) => Any = dataType match {
+    case ByteType =>
+      ((evalE1: Byte, evalE2: Byte) => (evalE1 & evalE2).toByte).asInstanceOf[(Any, Any) => Any]
+    case ShortType =>
+      ((evalE1: Short, evalE2: Short) => (evalE1 & evalE2).toShort).asInstanceOf[(Any, Any) => Any]
+    case IntegerType =>
+      ((evalE1: Int, evalE2: Int) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any]
+    case LongType =>
+      ((evalE1: Long, evalE2: Long) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any]
+  }
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = and(evalE1, evalE2)
+}
+
+/**
+ * A function that calculates bitwise or(|) of two numbers.
+ */
+case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmetic {
+  override def symbol: String = "|"
+
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+
+  private lazy val or: (Any, Any) => Any = dataType match {
+    case ByteType =>
+      ((evalE1: Byte, evalE2: Byte) => (evalE1 | evalE2).toByte).asInstanceOf[(Any, Any) => Any]
+    case ShortType =>
+      ((evalE1: Short, evalE2: Short) => (evalE1 | evalE2).toShort).asInstanceOf[(Any, Any) => Any]
+    case IntegerType =>
+      ((evalE1: Int, evalE2: Int) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any]
+    case LongType =>
+      ((evalE1: Long, evalE2: Long) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any]
+  }
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any) = or(evalE1, evalE2)
+}
+
+/**
+ * A function that calculates bitwise xor of two numbers.
+ */
+case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithmetic {
+  override def symbol: String = "^"
+
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+
+  private lazy val xor: (Any, Any) => Any = dataType match {
+    case ByteType =>
+      ((evalE1: Byte, evalE2: Byte) => (evalE1 ^ evalE2).toByte).asInstanceOf[(Any, Any) => Any]
+    case ShortType =>
+      ((evalE1: Short, evalE2: Short) => (evalE1 ^ evalE2).toShort).asInstanceOf[(Any, Any) => Any]
+    case IntegerType =>
+      ((evalE1: Int, evalE2: Int) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any]
+    case LongType =>
+      ((evalE1: Long, evalE2: Long) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any]
+  }
+
+  protected override def evalInternal(evalE1: Any, evalE2: Any): Any = xor(evalE1, evalE2)
+}
+
+/**
+ * A function that calculates bitwise not(~) of a number.
+ */
+case class BitwiseNot(child: Expression) extends UnaryArithmetic {
+  override def toString: String = s"~$child"
+
+  override def checkInputDataTypes(): TypeCheckResult =
+    TypeUtils.checkForBitwiseExpr(child.dataType, "operator ~")
+
+  private lazy val not: (Any) => Any = dataType match {
+    case ByteType =>
+      ((evalE: Byte) => (~evalE).toByte).asInstanceOf[(Any) => Any]
+    case ShortType =>
+      ((evalE: Short) => (~evalE).toShort).asInstanceOf[(Any) => Any]
+    case IntegerType =>
+      ((evalE: Int) => ~evalE).asInstanceOf[(Any) => Any]
+    case LongType =>
+      ((evalE: Long) => ~evalE).asInstanceOf[(Any) => Any]
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dataType)})~($c)")
+  }
+
+  protected override def evalInternal(evalE: Any) = not(evalE)
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
new file mode 100644
index 0000000000000..3aa86edd7ab20
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{BooleanType, DataType}
+
+
+case class If(predicate: Expression, trueValue: Expression, falseValue: Expression)
+  extends Expression {
+
+  override def children: Seq[Expression] = predicate :: trueValue :: falseValue :: Nil
+  override def nullable: Boolean = trueValue.nullable || falseValue.nullable
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (predicate.dataType != BooleanType) {
+      TypeCheckResult.TypeCheckFailure(
+        s"type of predicate expression in If should be boolean, not ${predicate.dataType}")
+    } else if (trueValue.dataType != falseValue.dataType) {
+      TypeCheckResult.TypeCheckFailure(
+        s"differing types in If (${trueValue.dataType} and ${falseValue.dataType}).")
+    } else {
+      TypeCheckResult.TypeCheckSuccess
+    }
+  }
+
+  override def dataType: DataType = trueValue.dataType
+
+  override def eval(input: Row): Any = {
+    if (true == predicate.eval(input)) {
+      trueValue.eval(input)
+    } else {
+      falseValue.eval(input)
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val condEval = predicate.gen(ctx)
+    val trueEval = trueValue.gen(ctx)
+    val falseEval = falseValue.gen(ctx)
+
+    s"""
+      ${condEval.code}
+      boolean ${ev.isNull} = false;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${condEval.isNull} && ${condEval.primitive}) {
+        ${trueEval.code}
+        ${ev.isNull} = ${trueEval.isNull};
+        ${ev.primitive} = ${trueEval.primitive};
+      } else {
+        ${falseEval.code}
+        ${ev.isNull} = ${falseEval.isNull};
+        ${ev.primitive} = ${falseEval.primitive};
+      }
+    """
+  }
+
+  override def toString: String = s"if ($predicate) $trueValue else $falseValue"
+}
+
+trait CaseWhenLike extends Expression {
+  self: Product =>
+
+  // Note that `branches` are considered in consecutive pairs (cond, val), and the optional last
+  // element is the value for the default catch-all case (if provided).
+  // Hence, `branches` consists of at least two elements, and can have an odd or even length.
+  def branches: Seq[Expression]
+
+  @transient lazy val whenList =
+    branches.sliding(2, 2).collect { case Seq(whenExpr, _) => whenExpr }.toSeq
+  @transient lazy val thenList =
+    branches.sliding(2, 2).collect { case Seq(_, thenExpr) => thenExpr }.toSeq
+  val elseValue = if (branches.length % 2 == 0) None else Option(branches.last)
+
+  // both then and else expressions should be considered.
+  def valueTypes: Seq[DataType] = (thenList ++ elseValue).map(_.dataType)
+  def valueTypesEqual: Boolean = valueTypes.distinct.size == 1
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (valueTypesEqual) {
+      checkTypesInternal()
+    } else {
+      TypeCheckResult.TypeCheckFailure(
+        "THEN and ELSE expressions should all be same type or coercible to a common type")
+    }
+  }
+
+  protected def checkTypesInternal(): TypeCheckResult
+
+  override def dataType: DataType = thenList.head.dataType
+
+  override def nullable: Boolean = {
+    // If no value is nullable and no elseValue is provided, the whole statement defaults to null.
+    thenList.exists(_.nullable) || (elseValue.map(_.nullable).getOrElse(true))
+  }
+}
+
+// scalastyle:off
+/**
+ * Case statements of the form "CASE WHEN a THEN b [WHEN c THEN d]* [ELSE e] END".
+ * Refer to this link for the corresponding semantics:
+ * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-ConditionalFunctions
+ */
+// scalastyle:on
+case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike {
+
+  // Use private[this] Array to speed up evaluation.
+  @transient private[this] lazy val branchesArr = branches.toArray
+
+  override def children: Seq[Expression] = branches
+
+  override protected def checkTypesInternal(): TypeCheckResult = {
+    if (whenList.forall(_.dataType == BooleanType)) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      val index = whenList.indexWhere(_.dataType != BooleanType)
+      TypeCheckResult.TypeCheckFailure(
+        s"WHEN expressions in CaseWhen should all be boolean type, " +
+          s"but the ${index + 1}th when expression's type is ${whenList(index)}")
+    }
+  }
+
+  /** Written in imperative fashion for performance considerations. */
+  override def eval(input: Row): Any = {
+    val len = branchesArr.length
+    var i = 0
+    // If all branches fail and an elseVal is not provided, the whole statement
+    // defaults to null, according to Hive's semantics.
+    while (i < len - 1) {
+      if (branchesArr(i).eval(input) == true) {
+        return branchesArr(i + 1).eval(input)
+      }
+      i += 2
+    }
+    var res: Any = null
+    if (i == len - 1) {
+      res = branchesArr(i).eval(input)
+    }
+    return res
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val len = branchesArr.length
+    val got = ctx.freshName("got")
+
+    val cases = (0 until len/2).map { i =>
+      val cond = branchesArr(i * 2).gen(ctx)
+      val res = branchesArr(i * 2 + 1).gen(ctx)
+      s"""
+        if (!$got) {
+          ${cond.code}
+          if (!${cond.isNull} && ${cond.primitive}) {
+            $got = true;
+            ${res.code}
+            ${ev.isNull} = ${res.isNull};
+            ${ev.primitive} = ${res.primitive};
+          }
+        }
+      """
+    }.mkString("\n")
+
+    val other = if (len % 2 == 1) {
+      val res = branchesArr(len - 1).gen(ctx)
+      s"""
+        if (!$got) {
+          ${res.code}
+          ${ev.isNull} = ${res.isNull};
+          ${ev.primitive} = ${res.primitive};
+        }
+      """
+    } else {
+      ""
+    }
+
+    s"""
+      boolean $got = false;
+      boolean ${ev.isNull} = true;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      $cases
+      $other
+    """
+  }
+
+  override def toString: String = {
+    "CASE" + branches.sliding(2, 2).map {
+      case Seq(cond, value) => s" WHEN $cond THEN $value"
+      case Seq(elseValue) => s" ELSE $elseValue"
+    }.mkString
+  }
+}
+
+// scalastyle:off
+/**
+ * Case statements of the form "CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END".
+ * Refer to this link for the corresponding semantics:
+ * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-ConditionalFunctions
+ */
+// scalastyle:on
+case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseWhenLike {
+
+  // Use private[this] Array to speed up evaluation.
+  @transient private[this] lazy val branchesArr = branches.toArray
+
+  override def children: Seq[Expression] = key +: branches
+
+  override protected def checkTypesInternal(): TypeCheckResult = {
+    if ((key +: whenList).map(_.dataType).distinct.size > 1) {
+      TypeCheckResult.TypeCheckFailure(
+        "key and WHEN expressions should all be same type or coercible to a common type")
+    } else {
+      TypeCheckResult.TypeCheckSuccess
+    }
+  }
+
+  /** Written in imperative fashion for performance considerations. */
+  override def eval(input: Row): Any = {
+    val evaluatedKey = key.eval(input)
+    val len = branchesArr.length
+    var i = 0
+    // If all branches fail and an elseVal is not provided, the whole statement
+    // defaults to null, according to Hive's semantics.
+    while (i < len - 1) {
+      if (equalNullSafe(evaluatedKey, branchesArr(i).eval(input))) {
+        return branchesArr(i + 1).eval(input)
+      }
+      i += 2
+    }
+    var res: Any = null
+    if (i == len - 1) {
+      res = branchesArr(i).eval(input)
+    }
+    return res
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    val keyEval = key.gen(ctx)
+    val len = branchesArr.length
+    val got = ctx.freshName("got")
+
+    val cases = (0 until len/2).map { i =>
+      val cond = branchesArr(i * 2).gen(ctx)
+      val res = branchesArr(i * 2 + 1).gen(ctx)
+      s"""
+        if (!$got) {
+          ${cond.code}
+          if (${keyEval.isNull} && ${cond.isNull} ||
+            !${keyEval.isNull} && !${cond.isNull}
+             && ${ctx.equalFunc(key.dataType)(keyEval.primitive, cond.primitive)}) {
+            $got = true;
+            ${res.code}
+            ${ev.isNull} = ${res.isNull};
+            ${ev.primitive} = ${res.primitive};
+          }
+        }
+      """
+    }.mkString("\n")
+
+    val other = if (len % 2 == 1) {
+      val res = branchesArr(len - 1).gen(ctx)
+      s"""
+        if (!$got) {
+          ${res.code}
+          ${ev.isNull} = ${res.isNull};
+          ${ev.primitive} = ${res.primitive};
+        }
+      """
+    } else {
+      ""
+    }
+
+    s"""
+      boolean $got = false;
+      boolean ${ev.isNull} = true;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      ${keyEval.code}
+      $cases
+      $other
+    """
+  }
+
+  private def equalNullSafe(l: Any, r: Any) = {
+    if (l == null && r == null) {
+      true
+    } else if (l == null || r == null) {
+      false
+    } else {
+      l == r
+    }
+  }
+
+  override def toString: String = {
+    s"CASE $key" + branches.sliding(2, 2).map {
+      case Seq(cond, value) => s" WHEN $cond THEN $value"
+      case Seq(elseValue) => s" ELSE $elseValue"
+    }.mkString
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
similarity index 54%
rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 5563cd94bf86d..a18067e4a58f1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/unary.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -15,11 +15,10 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.catalyst.expressions.mathfuncs
+package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.expressions.codegen.{Code, CodeGenContext, GeneratedExpressionCode}
-import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, Row, UnaryExpression}
-import org.apache.spark.sql.types._
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{DataType, DoubleType}
 
 /**
  * A unary expression specifically for math functions. Math Functions expect a specific type of
@@ -64,6 +63,47 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
   }
 }
 
+/**
+ * A binary expression specifically for math functions that take two `Double`s as input and returns
+ * a `Double`.
+ * @param f The math function.
+ * @param name The short name of the function
+ */
+abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
+  extends BinaryExpression with Serializable with ExpectsInputTypes { self: Product =>
+
+  override def expectedChildTypes: Seq[DataType] = Seq(DoubleType, DoubleType)
+
+  override def toString: String = s"$name($left, $right)"
+
+  override def dataType: DataType = DoubleType
+
+  override def eval(input: Row): Any = {
+    val evalE1 = left.eval(input)
+    if (evalE1 == null) {
+      null
+    } else {
+      val evalE2 = right.eval(input)
+      if (evalE2 == null) {
+        null
+      } else {
+        val result = f(evalE1.asInstanceOf[Double], evalE2.asInstanceOf[Double])
+        if (result.isNaN) null else result
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.${name.toLowerCase}($c1, $c2)")
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Unary math functions
+////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 case class Acos(child: Expression) extends UnaryMathExpression(math.acos, "ACOS")
 
 case class Asin(child: Expression) extends UnaryMathExpression(math.asin, "ASIN")
@@ -111,3 +151,54 @@ case class ToDegrees(child: Expression) extends UnaryMathExpression(math.toDegre
 case class ToRadians(child: Expression) extends UnaryMathExpression(math.toRadians, "RADIANS") {
   override def funcName: String = "toRadians"
 }
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Binary math functions
+////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+case class Atan2(left: Expression, right: Expression)
+  extends BinaryMathExpression(math.atan2, "ATAN2") {
+
+  override def eval(input: Row): Any = {
+    val evalE1 = left.eval(input)
+    if (evalE1 == null) {
+      null
+    } else {
+      val evalE2 = right.eval(input)
+      if (evalE2 == null) {
+        null
+      } else {
+        // With codegen, the values returned by -0.0 and 0.0 are different. Handled with +0.0
+        val result = math.atan2(evalE1.asInstanceOf[Double] + 0.0,
+          evalE2.asInstanceOf[Double] + 0.0)
+        if (result.isNaN) null else result
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.atan2($c1 + 0.0, $c2 + 0.0)") + s"""
+      if (Double.valueOf(${ev.primitive}).isNaN()) {
+        ${ev.isNull} = true;
+      }
+      """
+  }
+}
+
+case class Hypot(left: Expression, right: Expression)
+  extends BinaryMathExpression(math.hypot, "HYPOT")
+
+case class Pow(left: Expression, right: Expression)
+  extends BinaryMathExpression(math.pow, "POWER") {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.pow($c1, $c2)") + s"""
+      if (Double.valueOf(${ev.primitive}).isNaN()) {
+        ${ev.isNull} = true;
+      }
+      """
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
deleted file mode 100644
index 88211acd7713c..0000000000000
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathfuncs/binary.scala
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.expressions.mathfuncs
-
-import org.apache.spark.sql.catalyst.expressions.codegen._
-import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, BinaryExpression, Expression, Row}
-import org.apache.spark.sql.types._
-
-/**
- * A binary expression specifically for math functions that take two `Double`s as input and returns
- * a `Double`.
- * @param f The math function.
- * @param name The short name of the function
- */
-abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
-  extends BinaryExpression with Serializable with ExpectsInputTypes { self: Product =>
-
-  override def expectedChildTypes: Seq[DataType] = Seq(DoubleType, DoubleType)
-
-  override def toString: String = s"$name($left, $right)"
-
-  override def dataType: DataType = DoubleType
-
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        val result = f(evalE1.asInstanceOf[Double], evalE2.asInstanceOf[Double])
-        if (result.isNaN) null else result
-      }
-    }
-  }
-
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
-    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.${name.toLowerCase}($c1, $c2)")
-  }
-}
-
-case class Atan2(left: Expression, right: Expression)
-  extends BinaryMathExpression(math.atan2, "ATAN2") {
-
-  override def eval(input: Row): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        // With codegen, the values returned by -0.0 and 0.0 are different. Handled with +0.0
-        val result = math.atan2(evalE1.asInstanceOf[Double] + 0.0,
-          evalE2.asInstanceOf[Double] + 0.0)
-        if (result.isNaN) null else result
-      }
-    }
-  }
-
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
-    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.atan2($c1 + 0.0, $c2 + 0.0)") + s"""
-      if (Double.valueOf(${ev.primitive}).isNaN()) {
-        ${ev.isNull} = true;
-      }
-      """
-  }
-}
-
-case class Hypot(left: Expression, right: Expression)
-  extends BinaryMathExpression(math.hypot, "HYPOT")
-
-case class Pow(left: Expression, right: Expression)
-  extends BinaryMathExpression(math.pow, "POWER") {
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
-    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.pow($c1, $c2)") + s"""
-      if (Double.valueOf(${ev.primitive}).isNaN()) {
-        ${ev.isNull} = true;
-      }
-      """
-  }
-}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 1d0f19a400d63..5edcf3bd77d20 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -359,293 +359,3 @@ case class GreaterThanOrEqual(left: Expression, right: Expression) extends Binar
 
   protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.gteq(evalE1, evalE2)
 }
-
-case class If(predicate: Expression, trueValue: Expression, falseValue: Expression)
-  extends Expression {
-
-  override def children: Seq[Expression] = predicate :: trueValue :: falseValue :: Nil
-  override def nullable: Boolean = trueValue.nullable || falseValue.nullable
-
-  override def checkInputDataTypes(): TypeCheckResult = {
-    if (predicate.dataType != BooleanType) {
-      TypeCheckResult.TypeCheckFailure(
-        s"type of predicate expression in If should be boolean, not ${predicate.dataType}")
-    } else if (trueValue.dataType != falseValue.dataType) {
-      TypeCheckResult.TypeCheckFailure(
-        s"differing types in If (${trueValue.dataType} and ${falseValue.dataType}).")
-    } else {
-      TypeCheckResult.TypeCheckSuccess
-    }
-  }
-
-  override def dataType: DataType = trueValue.dataType
-
-  override def eval(input: Row): Any = {
-    if (true == predicate.eval(input)) {
-      trueValue.eval(input)
-    } else {
-      falseValue.eval(input)
-    }
-  }
-
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
-    val condEval = predicate.gen(ctx)
-    val trueEval = trueValue.gen(ctx)
-    val falseEval = falseValue.gen(ctx)
-
-    s"""
-      ${condEval.code}
-      boolean ${ev.isNull} = false;
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${condEval.isNull} && ${condEval.primitive}) {
-        ${trueEval.code}
-        ${ev.isNull} = ${trueEval.isNull};
-        ${ev.primitive} = ${trueEval.primitive};
-      } else {
-        ${falseEval.code}
-        ${ev.isNull} = ${falseEval.isNull};
-        ${ev.primitive} = ${falseEval.primitive};
-      }
-    """
-  }
-
-  override def toString: String = s"if ($predicate) $trueValue else $falseValue"
-}
-
-trait CaseWhenLike extends Expression {
-  self: Product =>
-
-  // Note that `branches` are considered in consecutive pairs (cond, val), and the optional last
-  // element is the value for the default catch-all case (if provided).
-  // Hence, `branches` consists of at least two elements, and can have an odd or even length.
-  def branches: Seq[Expression]
-
-  @transient lazy val whenList =
-    branches.sliding(2, 2).collect { case Seq(whenExpr, _) => whenExpr }.toSeq
-  @transient lazy val thenList =
-    branches.sliding(2, 2).collect { case Seq(_, thenExpr) => thenExpr }.toSeq
-  val elseValue = if (branches.length % 2 == 0) None else Option(branches.last)
-
-  // both then and else expressions should be considered.
-  def valueTypes: Seq[DataType] = (thenList ++ elseValue).map(_.dataType)
-  def valueTypesEqual: Boolean = valueTypes.distinct.size == 1
-
-  override def checkInputDataTypes(): TypeCheckResult = {
-    if (valueTypesEqual) {
-      checkTypesInternal()
-    } else {
-      TypeCheckResult.TypeCheckFailure(
-        "THEN and ELSE expressions should all be same type or coercible to a common type")
-    }
-  }
-
-  protected def checkTypesInternal(): TypeCheckResult
-
-  override def dataType: DataType = thenList.head.dataType
-
-  override def nullable: Boolean = {
-    // If no value is nullable and no elseValue is provided, the whole statement defaults to null.
-    thenList.exists(_.nullable) || (elseValue.map(_.nullable).getOrElse(true))
-  }
-}
-
-// scalastyle:off
-/**
- * Case statements of the form "CASE WHEN a THEN b [WHEN c THEN d]* [ELSE e] END".
- * Refer to this link for the corresponding semantics:
- * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-ConditionalFunctions
- */
-// scalastyle:on
-case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike {
-
-  // Use private[this] Array to speed up evaluation.
-  @transient private[this] lazy val branchesArr = branches.toArray
-
-  override def children: Seq[Expression] = branches
-
-  override protected def checkTypesInternal(): TypeCheckResult = {
-    if (whenList.forall(_.dataType == BooleanType)) {
-      TypeCheckResult.TypeCheckSuccess
-    } else {
-      val index = whenList.indexWhere(_.dataType != BooleanType)
-      TypeCheckResult.TypeCheckFailure(
-        s"WHEN expressions in CaseWhen should all be boolean type, " +
-        s"but the ${index + 1}th when expression's type is ${whenList(index)}")
-    }
-  }
-
-  /** Written in imperative fashion for performance considerations. */
-  override def eval(input: Row): Any = {
-    val len = branchesArr.length
-    var i = 0
-    // If all branches fail and an elseVal is not provided, the whole statement
-    // defaults to null, according to Hive's semantics.
-    while (i < len - 1) {
-      if (branchesArr(i).eval(input) == true) {
-        return branchesArr(i + 1).eval(input)
-      }
-      i += 2
-    }
-    var res: Any = null
-    if (i == len - 1) {
-      res = branchesArr(i).eval(input)
-    }
-    return res
-  }
-
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
-    val len = branchesArr.length
-    val got = ctx.freshName("got")
-
-    val cases = (0 until len/2).map { i =>
-      val cond = branchesArr(i * 2).gen(ctx)
-      val res = branchesArr(i * 2 + 1).gen(ctx)
-      s"""
-        if (!$got) {
-          ${cond.code}
-          if (!${cond.isNull} && ${cond.primitive}) {
-            $got = true;
-            ${res.code}
-            ${ev.isNull} = ${res.isNull};
-            ${ev.primitive} = ${res.primitive};
-          }
-        }
-      """
-    }.mkString("\n")
-
-    val other = if (len % 2 == 1) {
-      val res = branchesArr(len - 1).gen(ctx)
-      s"""
-        if (!$got) {
-          ${res.code}
-          ${ev.isNull} = ${res.isNull};
-          ${ev.primitive} = ${res.primitive};
-        }
-      """
-    } else {
-      ""
-    }
-
-    s"""
-      boolean $got = false;
-      boolean ${ev.isNull} = true;
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      $cases
-      $other
-    """
-  }
-
-  override def toString: String = {
-    "CASE" + branches.sliding(2, 2).map {
-      case Seq(cond, value) => s" WHEN $cond THEN $value"
-      case Seq(elseValue) => s" ELSE $elseValue"
-    }.mkString
-  }
-}
-
-// scalastyle:off
-/**
- * Case statements of the form "CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END".
- * Refer to this link for the corresponding semantics:
- * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-ConditionalFunctions
- */
-// scalastyle:on
-case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseWhenLike {
-
-  // Use private[this] Array to speed up evaluation.
-  @transient private[this] lazy val branchesArr = branches.toArray
-
-  override def children: Seq[Expression] = key +: branches
-
-  override protected def checkTypesInternal(): TypeCheckResult = {
-    if ((key +: whenList).map(_.dataType).distinct.size > 1) {
-      TypeCheckResult.TypeCheckFailure(
-        "key and WHEN expressions should all be same type or coercible to a common type")
-    } else {
-      TypeCheckResult.TypeCheckSuccess
-    }
-  }
-
-  /** Written in imperative fashion for performance considerations. */
-  override def eval(input: Row): Any = {
-    val evaluatedKey = key.eval(input)
-    val len = branchesArr.length
-    var i = 0
-    // If all branches fail and an elseVal is not provided, the whole statement
-    // defaults to null, according to Hive's semantics.
-    while (i < len - 1) {
-      if (equalNullSafe(evaluatedKey, branchesArr(i).eval(input))) {
-        return branchesArr(i + 1).eval(input)
-      }
-      i += 2
-    }
-    var res: Any = null
-    if (i == len - 1) {
-      res = branchesArr(i).eval(input)
-    }
-    return res
-  }
-
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
-    val keyEval = key.gen(ctx)
-    val len = branchesArr.length
-    val got = ctx.freshName("got")
-
-    val cases = (0 until len/2).map { i =>
-      val cond = branchesArr(i * 2).gen(ctx)
-      val res = branchesArr(i * 2 + 1).gen(ctx)
-      s"""
-        if (!$got) {
-          ${cond.code}
-          if (${keyEval.isNull} && ${cond.isNull} ||
-            !${keyEval.isNull} && !${cond.isNull}
-             && ${ctx.equalFunc(key.dataType)(keyEval.primitive, cond.primitive)}) {
-            $got = true;
-            ${res.code}
-            ${ev.isNull} = ${res.isNull};
-            ${ev.primitive} = ${res.primitive};
-          }
-        }
-      """
-    }.mkString("\n")
-
-    val other = if (len % 2 == 1) {
-      val res = branchesArr(len - 1).gen(ctx)
-      s"""
-        if (!$got) {
-          ${res.code}
-          ${ev.isNull} = ${res.isNull};
-          ${ev.primitive} = ${res.primitive};
-        }
-      """
-    } else {
-      ""
-    }
-
-    s"""
-      boolean $got = false;
-      boolean ${ev.isNull} = true;
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      ${keyEval.code}
-      $cases
-      $other
-    """
-  }
-
-  private def equalNullSafe(l: Any, r: Any) = {
-    if (l == null && r == null) {
-      true
-    } else if (l == null || r == null) {
-      false
-    } else {
-      l == r
-    }
-  }
-
-  override def toString: String = {
-    s"CASE $key" + branches.sliding(2, 2).map {
-      case Seq(cond, value) => s" WHEN $cond THEN $value"
-      case Seq(elseValue) => s" ELSE $elseValue"
-    }.mkString
-  }
-}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
new file mode 100644
index 0000000000000..e1afa81a7a82f
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.scalatest.Matchers._
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.types.{DoubleType, IntegerType}
+
+
+class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("arithmetic") {
+    val row = create_row(1, 2, 3, null)
+    val c1 = 'a.int.at(0)
+    val c2 = 'a.int.at(1)
+    val c3 = 'a.int.at(2)
+    val c4 = 'a.int.at(3)
+
+    checkEvaluation(UnaryMinus(c1), -1, row)
+    checkEvaluation(UnaryMinus(Literal.create(100, IntegerType)), -100)
+
+    checkEvaluation(Add(c1, c4), null, row)
+    checkEvaluation(Add(c1, c2), 3, row)
+    checkEvaluation(Add(c1, Literal.create(null, IntegerType)), null, row)
+    checkEvaluation(Add(Literal.create(null, IntegerType), c2), null, row)
+    checkEvaluation(
+      Add(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
+
+    checkEvaluation(-c1, -1, row)
+    checkEvaluation(c1 + c2, 3, row)
+    checkEvaluation(c1 - c2, -1, row)
+    checkEvaluation(c1 * c2, 2, row)
+    checkEvaluation(c1 / c2, 0, row)
+    checkEvaluation(c1 % c2, 1, row)
+  }
+
+  test("fractional arithmetic") {
+    val row = create_row(1.1, 2.0, 3.1, null)
+    val c1 = 'a.double.at(0)
+    val c2 = 'a.double.at(1)
+    val c3 = 'a.double.at(2)
+    val c4 = 'a.double.at(3)
+
+    checkEvaluation(UnaryMinus(c1), -1.1, row)
+    checkEvaluation(UnaryMinus(Literal.create(100.0, DoubleType)), -100.0)
+    checkEvaluation(Add(c1, c4), null, row)
+    checkEvaluation(Add(c1, c2), 3.1, row)
+    checkEvaluation(Add(c1, Literal.create(null, DoubleType)), null, row)
+    checkEvaluation(Add(Literal.create(null, DoubleType), c2), null, row)
+    checkEvaluation(
+      Add(Literal.create(null, DoubleType), Literal.create(null, DoubleType)), null, row)
+
+    checkEvaluation(-c1, -1.1, row)
+    checkEvaluation(c1 + c2, 3.1, row)
+    checkDoubleEvaluation(c1 - c2, (-0.9 +- 0.001), row)
+    checkDoubleEvaluation(c1 * c2, (2.2 +- 0.001), row)
+    checkDoubleEvaluation(c1 / c2, (0.55 +- 0.001), row)
+    checkDoubleEvaluation(c3 % c2, (1.1 +- 0.001), row)
+  }
+
+  test("Divide") {
+    checkEvaluation(Divide(Literal(2), Literal(1)), 2)
+    checkEvaluation(Divide(Literal(1.0), Literal(2.0)), 0.5)
+    checkEvaluation(Divide(Literal(1), Literal(2)), 0)
+    checkEvaluation(Divide(Literal(1), Literal(0)), null)
+    checkEvaluation(Divide(Literal(1.0), Literal(0.0)), null)
+    checkEvaluation(Divide(Literal(0.0), Literal(0.0)), null)
+    checkEvaluation(Divide(Literal(0), Literal.create(null, IntegerType)), null)
+    checkEvaluation(Divide(Literal(1), Literal.create(null, IntegerType)), null)
+    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal(0)), null)
+    checkEvaluation(Divide(Literal.create(null, DoubleType), Literal(0.0)), null)
+    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal(1)), null)
+    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal.create(null, IntegerType)),
+      null)
+  }
+
+  test("Remainder") {
+    checkEvaluation(Remainder(Literal(2), Literal(1)), 0)
+    checkEvaluation(Remainder(Literal(1.0), Literal(2.0)), 1.0)
+    checkEvaluation(Remainder(Literal(1), Literal(2)), 1)
+    checkEvaluation(Remainder(Literal(1), Literal(0)), null)
+    checkEvaluation(Remainder(Literal(1.0), Literal(0.0)), null)
+    checkEvaluation(Remainder(Literal(0.0), Literal(0.0)), null)
+    checkEvaluation(Remainder(Literal(0), Literal.create(null, IntegerType)), null)
+    checkEvaluation(Remainder(Literal(1), Literal.create(null, IntegerType)), null)
+    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal(0)), null)
+    checkEvaluation(Remainder(Literal.create(null, DoubleType), Literal(0.0)), null)
+    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal(1)), null)
+    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal.create(null, IntegerType)),
+      null)
+  }
+
+  test("MaxOf") {
+    checkEvaluation(MaxOf(1, 2), 2)
+    checkEvaluation(MaxOf(2, 1), 2)
+    checkEvaluation(MaxOf(1L, 2L), 2L)
+    checkEvaluation(MaxOf(2L, 1L), 2L)
+
+    checkEvaluation(MaxOf(Literal.create(null, IntegerType), 2), 2)
+    checkEvaluation(MaxOf(2, Literal.create(null, IntegerType)), 2)
+  }
+
+  test("MinOf") {
+    checkEvaluation(MinOf(1, 2), 1)
+    checkEvaluation(MinOf(2, 1), 1)
+    checkEvaluation(MinOf(1L, 2L), 1L)
+    checkEvaluation(MinOf(2L, 1L), 1L)
+
+    checkEvaluation(MinOf(Literal.create(null, IntegerType), 1), 1)
+    checkEvaluation(MinOf(1, Literal.create(null, IntegerType)), 1)
+  }
+
+  test("SQRT") {
+    val inputSequence = (1 to (1<<24) by 511).map(_ * (1L<<24))
+    val expectedResults = inputSequence.map(l => math.sqrt(l.toDouble))
+    val rowSequence = inputSequence.map(l => create_row(l.toDouble))
+    val d = 'a.double.at(0)
+
+    for ((row, expected) <- rowSequence zip expectedResults) {
+      checkEvaluation(Sqrt(d), expected, row)
+    }
+
+    checkEvaluation(Sqrt(Literal.create(null, DoubleType)), null, create_row(null))
+    checkEvaluation(Sqrt(-1), null, EmptyRow)
+    checkEvaluation(Sqrt(-1.5), null, EmptyRow)
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
new file mode 100644
index 0000000000000..c9bbc7a8b8c14
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.types._
+
+
+class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("Bitwise operations") {
+    val row = create_row(1, 2, 3, null)
+    val c1 = 'a.int.at(0)
+    val c2 = 'a.int.at(1)
+    val c3 = 'a.int.at(2)
+    val c4 = 'a.int.at(3)
+
+    checkEvaluation(BitwiseAnd(c1, c4), null, row)
+    checkEvaluation(BitwiseAnd(c1, c2), 0, row)
+    checkEvaluation(BitwiseAnd(c1, Literal.create(null, IntegerType)), null, row)
+    checkEvaluation(
+      BitwiseAnd(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
+
+    checkEvaluation(BitwiseOr(c1, c4), null, row)
+    checkEvaluation(BitwiseOr(c1, c2), 3, row)
+    checkEvaluation(BitwiseOr(c1, Literal.create(null, IntegerType)), null, row)
+    checkEvaluation(
+      BitwiseOr(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
+
+    checkEvaluation(BitwiseXor(c1, c4), null, row)
+    checkEvaluation(BitwiseXor(c1, c2), 3, row)
+    checkEvaluation(BitwiseXor(c1, Literal.create(null, IntegerType)), null, row)
+    checkEvaluation(
+      BitwiseXor(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
+
+    checkEvaluation(BitwiseNot(c4), null, row)
+    checkEvaluation(BitwiseNot(c1), -2, row)
+    checkEvaluation(BitwiseNot(Literal.create(null, IntegerType)), null, row)
+
+    checkEvaluation(c1 & c2, 0, row)
+    checkEvaluation(c1 | c2, 3, row)
+    checkEvaluation(c1 ^ c2, 3, row)
+    checkEvaluation(~c1, -2, row)
+  }
+
+  test("unary BitwiseNOT") {
+    checkEvaluation(BitwiseNot(1), -2)
+    assert(BitwiseNot(1).dataType === IntegerType)
+    assert(BitwiseNot(1).eval(EmptyRow).isInstanceOf[Int])
+
+    checkEvaluation(BitwiseNot(1.toLong), -2.toLong)
+    assert(BitwiseNot(1.toLong).dataType === LongType)
+    assert(BitwiseNot(1.toLong).eval(EmptyRow).isInstanceOf[Long])
+
+    checkEvaluation(BitwiseNot(1.toShort), -2.toShort)
+    assert(BitwiseNot(1.toShort).dataType === ShortType)
+    assert(BitwiseNot(1.toShort).eval(EmptyRow).isInstanceOf[Short])
+
+    checkEvaluation(BitwiseNot(1.toByte), -2.toByte)
+    assert(BitwiseNot(1.toByte).dataType === ByteType)
+    assert(BitwiseNot(1.toByte).eval(EmptyRow).isInstanceOf[Byte])
+  }
+
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
new file mode 100644
index 0000000000000..5bc7c30eee1b6
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -0,0 +1,532 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import java.sql.{Timestamp, Date}
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types._
+
+/**
+ * Test suite for data type casting expression [[Cast]].
+ */
+class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  private def cast(v: Any, targetType: DataType): Cast = {
+    v match {
+      case lit: Expression => Cast(lit, targetType)
+      case _ => Cast(Literal(v), targetType)
+    }
+  }
+
+  // expected cannot be null
+  private def checkCast(v: Any, expected: Any): Unit = {
+    checkEvaluation(cast(v, Literal(expected).dataType), expected)
+  }
+
+  test("cast from int") {
+    checkCast(0, false)
+    checkCast(1, true)
+    checkCast(5, true)
+    checkCast(1, 1.toByte)
+    checkCast(1, 1.toShort)
+    checkCast(1, 1)
+    checkCast(1, 1.toLong)
+    checkCast(1, 1.0f)
+    checkCast(1, 1.0)
+    checkCast(123, "123")
+
+    checkEvaluation(cast(123, DecimalType.Unlimited), Decimal(123))
+    checkEvaluation(cast(123, DecimalType(3, 0)), Decimal(123))
+    checkEvaluation(cast(123, DecimalType(3, 1)), null)
+    checkEvaluation(cast(123, DecimalType(2, 0)), null)
+  }
+
+  test("cast from long") {
+    checkCast(0L, false)
+    checkCast(1L, true)
+    checkCast(5L, true)
+    checkCast(1L, 1.toByte)
+    checkCast(1L, 1.toShort)
+    checkCast(1L, 1)
+    checkCast(1L, 1.toLong)
+    checkCast(1L, 1.0f)
+    checkCast(1L, 1.0)
+    checkCast(123L, "123")
+
+    checkEvaluation(cast(123L, DecimalType.Unlimited), Decimal(123))
+    checkEvaluation(cast(123L, DecimalType(3, 0)), Decimal(123))
+    checkEvaluation(cast(123L, DecimalType(3, 1)), Decimal(123.0))
+
+    // TODO: Fix the following bug and re-enable it.
+    // checkEvaluation(cast(123L, DecimalType(2, 0)), null)
+  }
+
+  test("cast from boolean") {
+    checkEvaluation(cast(true, IntegerType), 1)
+    checkEvaluation(cast(false, IntegerType), 0)
+    checkEvaluation(cast(true, StringType), "true")
+    checkEvaluation(cast(false, StringType), "false")
+    checkEvaluation(cast(cast(1, BooleanType), IntegerType), 1)
+    checkEvaluation(cast(cast(0, BooleanType), IntegerType), 0)
+  }
+
+  test("cast from int 2") {
+    checkEvaluation(cast(1, LongType), 1.toLong)
+    checkEvaluation(cast(cast(1000, TimestampType), LongType), 1.toLong)
+    checkEvaluation(cast(cast(-1200, TimestampType), LongType), -2.toLong)
+
+    checkEvaluation(cast(123, DecimalType.Unlimited), Decimal(123))
+    checkEvaluation(cast(123, DecimalType(3, 0)), Decimal(123))
+    checkEvaluation(cast(123, DecimalType(3, 1)), null)
+    checkEvaluation(cast(123, DecimalType(2, 0)), null)
+  }
+
+  test("cast from float") {
+
+  }
+
+  test("cast from double") {
+    checkEvaluation(cast(cast(1.toDouble, TimestampType), DoubleType), 1.toDouble)
+    checkEvaluation(cast(cast(1.toDouble, TimestampType), DoubleType), 1.toDouble)
+  }
+
+  test("cast from string") {
+    assert(cast("abcdef", StringType).nullable === false)
+    assert(cast("abcdef", BinaryType).nullable === false)
+    assert(cast("abcdef", BooleanType).nullable === false)
+    assert(cast("abcdef", TimestampType).nullable === true)
+    assert(cast("abcdef", LongType).nullable === true)
+    assert(cast("abcdef", IntegerType).nullable === true)
+    assert(cast("abcdef", ShortType).nullable === true)
+    assert(cast("abcdef", ByteType).nullable === true)
+    assert(cast("abcdef", DecimalType.Unlimited).nullable === true)
+    assert(cast("abcdef", DecimalType(4, 2)).nullable === true)
+    assert(cast("abcdef", DoubleType).nullable === true)
+    assert(cast("abcdef", FloatType).nullable === true)
+  }
+
+  test("data type casting") {
+    val sd = "1970-01-01"
+    val d = Date.valueOf(sd)
+    val zts = sd + " 00:00:00"
+    val sts = sd + " 00:00:02"
+    val nts = sts + ".1"
+    val ts = Timestamp.valueOf(nts)
+
+    checkEvaluation(cast("abdef", StringType), "abdef")
+    checkEvaluation(cast("abdef", DecimalType.Unlimited), null)
+    checkEvaluation(cast("abdef", TimestampType), null)
+    checkEvaluation(cast("12.65", DecimalType.Unlimited), Decimal(12.65))
+
+    checkEvaluation(cast(cast(sd, DateType), StringType), sd)
+    checkEvaluation(cast(cast(d, StringType), DateType), 0)
+    checkEvaluation(cast(cast(nts, TimestampType), StringType), nts)
+    checkEvaluation(cast(cast(ts, StringType), TimestampType), ts)
+
+    // all convert to string type to check
+    checkEvaluation(cast(cast(cast(nts, TimestampType), DateType), StringType), sd)
+    checkEvaluation(cast(cast(cast(ts, DateType), TimestampType), StringType), zts)
+
+    checkEvaluation(cast(cast("abdef", BinaryType), StringType), "abdef")
+
+    checkEvaluation(cast(cast(cast(cast(
+      cast(cast("5", ByteType), ShortType), IntegerType), FloatType), DoubleType), LongType),
+      5.toLong)
+    checkEvaluation(
+      cast(cast(cast(cast(cast(cast("5", ByteType), TimestampType),
+        DecimalType.Unlimited), LongType), StringType), ShortType),
+      0.toShort)
+    checkEvaluation(
+      cast(cast(cast(cast(cast(cast("5", TimestampType), ByteType),
+        DecimalType.Unlimited), LongType), StringType), ShortType),
+      null)
+    checkEvaluation(cast(cast(cast(cast(cast(cast("5", DecimalType.Unlimited),
+      ByteType), TimestampType), LongType), StringType), ShortType),
+      0.toShort)
+
+    checkEvaluation(cast("23", DoubleType), 23d)
+    checkEvaluation(cast("23", IntegerType), 23)
+    checkEvaluation(cast("23", FloatType), 23f)
+    checkEvaluation(cast("23", DecimalType.Unlimited), Decimal(23))
+    checkEvaluation(cast("23", ByteType), 23.toByte)
+    checkEvaluation(cast("23", ShortType), 23.toShort)
+    checkEvaluation(cast("2012-12-11", DoubleType), null)
+    checkEvaluation(cast(123, IntegerType), 123)
+
+
+    checkEvaluation(cast(Literal.create(null, IntegerType), ShortType), null)
+  }
+
+  test("cast and add") {
+    checkEvaluation(Add(Literal(23d), cast(true, DoubleType)), 24d)
+    checkEvaluation(Add(Literal(23), cast(true, IntegerType)), 24)
+    checkEvaluation(Add(Literal(23f), cast(true, FloatType)), 24f)
+    checkEvaluation(Add(Literal(Decimal(23)), cast(true, DecimalType.Unlimited)), Decimal(24))
+    checkEvaluation(Add(Literal(23.toByte), cast(true, ByteType)), 24.toByte)
+    checkEvaluation(Add(Literal(23.toShort), cast(true, ShortType)), 24.toShort)
+  }
+
+  test("casting to fixed-precision decimals") {
+    // Overflow and rounding for casting to fixed-precision decimals:
+    // - Values should round with HALF_UP mode by default when you lower scale
+    // - Values that would overflow the target precision should turn into null
+    // - Because of this, casts to fixed-precision decimals should be nullable
+
+    assert(cast(123, DecimalType.Unlimited).nullable === false)
+    assert(cast(10.03f, DecimalType.Unlimited).nullable === true)
+    assert(cast(10.03, DecimalType.Unlimited).nullable === true)
+    assert(cast(Decimal(10.03), DecimalType.Unlimited).nullable === false)
+
+    assert(cast(123, DecimalType(2, 1)).nullable === true)
+    assert(cast(10.03f, DecimalType(2, 1)).nullable === true)
+    assert(cast(10.03, DecimalType(2, 1)).nullable === true)
+    assert(cast(Decimal(10.03), DecimalType(2, 1)).nullable === true)
+
+
+    checkEvaluation(cast(10.03, DecimalType.Unlimited), Decimal(10.03))
+    checkEvaluation(cast(10.03, DecimalType(4, 2)), Decimal(10.03))
+    checkEvaluation(cast(10.03, DecimalType(3, 1)), Decimal(10.0))
+    checkEvaluation(cast(10.03, DecimalType(2, 0)), Decimal(10))
+    checkEvaluation(cast(10.03, DecimalType(1, 0)), null)
+    checkEvaluation(cast(10.03, DecimalType(2, 1)), null)
+    checkEvaluation(cast(10.03, DecimalType(3, 2)), null)
+    checkEvaluation(cast(Decimal(10.03), DecimalType(3, 1)), Decimal(10.0))
+    checkEvaluation(cast(Decimal(10.03), DecimalType(3, 2)), null)
+
+    checkEvaluation(cast(10.05, DecimalType.Unlimited), Decimal(10.05))
+    checkEvaluation(cast(10.05, DecimalType(4, 2)), Decimal(10.05))
+    checkEvaluation(cast(10.05, DecimalType(3, 1)), Decimal(10.1))
+    checkEvaluation(cast(10.05, DecimalType(2, 0)), Decimal(10))
+    checkEvaluation(cast(10.05, DecimalType(1, 0)), null)
+    checkEvaluation(cast(10.05, DecimalType(2, 1)), null)
+    checkEvaluation(cast(10.05, DecimalType(3, 2)), null)
+    checkEvaluation(cast(Decimal(10.05), DecimalType(3, 1)), Decimal(10.1))
+    checkEvaluation(cast(Decimal(10.05), DecimalType(3, 2)), null)
+
+    checkEvaluation(cast(9.95, DecimalType(3, 2)), Decimal(9.95))
+    checkEvaluation(cast(9.95, DecimalType(3, 1)), Decimal(10.0))
+    checkEvaluation(cast(9.95, DecimalType(2, 0)), Decimal(10))
+    checkEvaluation(cast(9.95, DecimalType(2, 1)), null)
+    checkEvaluation(cast(9.95, DecimalType(1, 0)), null)
+    checkEvaluation(cast(Decimal(9.95), DecimalType(3, 1)), Decimal(10.0))
+    checkEvaluation(cast(Decimal(9.95), DecimalType(1, 0)), null)
+
+    checkEvaluation(cast(-9.95, DecimalType(3, 2)), Decimal(-9.95))
+    checkEvaluation(cast(-9.95, DecimalType(3, 1)), Decimal(-10.0))
+    checkEvaluation(cast(-9.95, DecimalType(2, 0)), Decimal(-10))
+    checkEvaluation(cast(-9.95, DecimalType(2, 1)), null)
+    checkEvaluation(cast(-9.95, DecimalType(1, 0)), null)
+    checkEvaluation(cast(Decimal(-9.95), DecimalType(3, 1)), Decimal(-10.0))
+    checkEvaluation(cast(Decimal(-9.95), DecimalType(1, 0)), null)
+
+    checkEvaluation(cast(Double.NaN, DecimalType.Unlimited), null)
+    checkEvaluation(cast(1.0 / 0.0, DecimalType.Unlimited), null)
+    checkEvaluation(cast(Float.NaN, DecimalType.Unlimited), null)
+    checkEvaluation(cast(1.0f / 0.0f, DecimalType.Unlimited), null)
+
+    checkEvaluation(cast(Double.NaN, DecimalType(2, 1)), null)
+    checkEvaluation(cast(1.0 / 0.0, DecimalType(2, 1)), null)
+    checkEvaluation(cast(Float.NaN, DecimalType(2, 1)), null)
+    checkEvaluation(cast(1.0f / 0.0f, DecimalType(2, 1)), null)
+  }
+
+  test("cast from date") {
+    val d = Date.valueOf("1970-01-01")
+    checkEvaluation(cast(d, ShortType), null)
+    checkEvaluation(cast(d, IntegerType), null)
+    checkEvaluation(cast(d, LongType), null)
+    checkEvaluation(cast(d, FloatType), null)
+    checkEvaluation(cast(d, DoubleType), null)
+    checkEvaluation(cast(d, DecimalType.Unlimited), null)
+    checkEvaluation(cast(d, DecimalType(10, 2)), null)
+    checkEvaluation(cast(d, StringType), "1970-01-01")
+    checkEvaluation(cast(cast(d, TimestampType), StringType), "1970-01-01 00:00:00")
+  }
+
+  test("cast from timestamp") {
+    val millis = 15 * 1000 + 2
+    val seconds = millis * 1000 + 2
+    val ts = new Timestamp(millis)
+    val tss = new Timestamp(seconds)
+    checkEvaluation(cast(ts, ShortType), 15.toShort)
+    checkEvaluation(cast(ts, IntegerType), 15)
+    checkEvaluation(cast(ts, LongType), 15.toLong)
+    checkEvaluation(cast(ts, FloatType), 15.002f)
+    checkEvaluation(cast(ts, DoubleType), 15.002)
+    checkEvaluation(cast(cast(tss, ShortType), TimestampType), ts)
+    checkEvaluation(cast(cast(tss, IntegerType), TimestampType), ts)
+    checkEvaluation(cast(cast(tss, LongType), TimestampType), ts)
+    checkEvaluation(
+      cast(cast(millis.toFloat / 1000, TimestampType), FloatType),
+      millis.toFloat / 1000)
+    checkEvaluation(
+      cast(cast(millis.toDouble / 1000, TimestampType), DoubleType),
+      millis.toDouble / 1000)
+    checkEvaluation(
+      cast(cast(Decimal(1), TimestampType), DecimalType.Unlimited),
+      Decimal(1))
+
+    // A test for higher precision than millis
+    checkEvaluation(cast(cast(0.00000001, TimestampType), DoubleType), 0.00000001)
+
+    checkEvaluation(cast(Double.NaN, TimestampType), null)
+    checkEvaluation(cast(1.0 / 0.0, TimestampType), null)
+    checkEvaluation(cast(Float.NaN, TimestampType), null)
+    checkEvaluation(cast(1.0f / 0.0f, TimestampType), null)
+  }
+
+  test("cast from array") {
+    val array = Literal.create(Seq("123", "abc", "", null),
+      ArrayType(StringType, containsNull = true))
+    val array_notNull = Literal.create(Seq("123", "abc", ""),
+      ArrayType(StringType, containsNull = false))
+
+    {
+      val ret = cast(array, ArrayType(IntegerType, containsNull = true))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Seq(123, null, null, null))
+    }
+    {
+      val ret = cast(array, ArrayType(IntegerType, containsNull = false))
+      assert(ret.resolved === false)
+    }
+    {
+      val ret = cast(array, ArrayType(BooleanType, containsNull = true))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Seq(true, true, false, null))
+    }
+    {
+      val ret = cast(array, ArrayType(BooleanType, containsNull = false))
+      assert(ret.resolved === false)
+    }
+
+    {
+      val ret = cast(array_notNull, ArrayType(IntegerType, containsNull = true))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Seq(123, null, null))
+    }
+    {
+      val ret = cast(array_notNull, ArrayType(IntegerType, containsNull = false))
+      assert(ret.resolved === false)
+    }
+    {
+      val ret = cast(array_notNull, ArrayType(BooleanType, containsNull = true))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Seq(true, true, false))
+    }
+    {
+      val ret = cast(array_notNull, ArrayType(BooleanType, containsNull = false))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Seq(true, true, false))
+    }
+
+    {
+      val ret = cast(array, IntegerType)
+      assert(ret.resolved === false)
+    }
+  }
+
+  test("cast from map") {
+    val map = Literal.create(
+      Map("a" -> "123", "b" -> "abc", "c" -> "", "d" -> null),
+      MapType(StringType, StringType, valueContainsNull = true))
+    val map_notNull = Literal.create(
+      Map("a" -> "123", "b" -> "abc", "c" -> ""),
+      MapType(StringType, StringType, valueContainsNull = false))
+
+    {
+      val ret = cast(map, MapType(StringType, IntegerType, valueContainsNull = true))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Map("a" -> 123, "b" -> null, "c" -> null, "d" -> null))
+    }
+    {
+      val ret = cast(map, MapType(StringType, IntegerType, valueContainsNull = false))
+      assert(ret.resolved === false)
+    }
+    {
+      val ret = cast(map, MapType(StringType, BooleanType, valueContainsNull = true))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Map("a" -> true, "b" -> true, "c" -> false, "d" -> null))
+    }
+    {
+      val ret = cast(map, MapType(StringType, BooleanType, valueContainsNull = false))
+      assert(ret.resolved === false)
+    }
+    {
+      val ret = cast(map, MapType(IntegerType, StringType, valueContainsNull = true))
+      assert(ret.resolved === false)
+    }
+
+    {
+      val ret = cast(map_notNull, MapType(StringType, IntegerType, valueContainsNull = true))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Map("a" -> 123, "b" -> null, "c" -> null))
+    }
+    {
+      val ret = cast(map_notNull, MapType(StringType, IntegerType, valueContainsNull = false))
+      assert(ret.resolved === false)
+    }
+    {
+      val ret = cast(map_notNull, MapType(StringType, BooleanType, valueContainsNull = true))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Map("a" -> true, "b" -> true, "c" -> false))
+    }
+    {
+      val ret = cast(map_notNull, MapType(StringType, BooleanType, valueContainsNull = false))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Map("a" -> true, "b" -> true, "c" -> false))
+    }
+    {
+      val ret = cast(map_notNull, MapType(IntegerType, StringType, valueContainsNull = true))
+      assert(ret.resolved === false)
+    }
+
+    {
+      val ret = cast(map, IntegerType)
+      assert(ret.resolved === false)
+    }
+  }
+
+  test("cast from struct") {
+    val struct = Literal.create(
+      Row("123", "abc", "", null),
+      StructType(Seq(
+        StructField("a", StringType, nullable = true),
+        StructField("b", StringType, nullable = true),
+        StructField("c", StringType, nullable = true),
+        StructField("d", StringType, nullable = true))))
+    val struct_notNull = Literal.create(
+      Row("123", "abc", ""),
+      StructType(Seq(
+        StructField("a", StringType, nullable = false),
+        StructField("b", StringType, nullable = false),
+        StructField("c", StringType, nullable = false))))
+
+    {
+      val ret = cast(struct, StructType(Seq(
+        StructField("a", IntegerType, nullable = true),
+        StructField("b", IntegerType, nullable = true),
+        StructField("c", IntegerType, nullable = true),
+        StructField("d", IntegerType, nullable = true))))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Row(123, null, null, null))
+    }
+    {
+      val ret = cast(struct, StructType(Seq(
+        StructField("a", IntegerType, nullable = true),
+        StructField("b", IntegerType, nullable = true),
+        StructField("c", IntegerType, nullable = false),
+        StructField("d", IntegerType, nullable = true))))
+      assert(ret.resolved === false)
+    }
+    {
+      val ret = cast(struct, StructType(Seq(
+        StructField("a", BooleanType, nullable = true),
+        StructField("b", BooleanType, nullable = true),
+        StructField("c", BooleanType, nullable = true),
+        StructField("d", BooleanType, nullable = true))))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Row(true, true, false, null))
+    }
+    {
+      val ret = cast(struct, StructType(Seq(
+        StructField("a", BooleanType, nullable = true),
+        StructField("b", BooleanType, nullable = true),
+        StructField("c", BooleanType, nullable = false),
+        StructField("d", BooleanType, nullable = true))))
+      assert(ret.resolved === false)
+    }
+
+    {
+      val ret = cast(struct_notNull, StructType(Seq(
+        StructField("a", IntegerType, nullable = true),
+        StructField("b", IntegerType, nullable = true),
+        StructField("c", IntegerType, nullable = true))))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Row(123, null, null))
+    }
+    {
+      val ret = cast(struct_notNull, StructType(Seq(
+        StructField("a", IntegerType, nullable = true),
+        StructField("b", IntegerType, nullable = true),
+        StructField("c", IntegerType, nullable = false))))
+      assert(ret.resolved === false)
+    }
+    {
+      val ret = cast(struct_notNull, StructType(Seq(
+        StructField("a", BooleanType, nullable = true),
+        StructField("b", BooleanType, nullable = true),
+        StructField("c", BooleanType, nullable = true))))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Row(true, true, false))
+    }
+    {
+      val ret = cast(struct_notNull, StructType(Seq(
+        StructField("a", BooleanType, nullable = true),
+        StructField("b", BooleanType, nullable = true),
+        StructField("c", BooleanType, nullable = false))))
+      assert(ret.resolved === true)
+      checkEvaluation(ret, Row(true, true, false))
+    }
+
+    {
+      val ret = cast(struct, StructType(Seq(
+        StructField("a", StringType, nullable = true),
+        StructField("b", StringType, nullable = true),
+        StructField("c", StringType, nullable = true))))
+      assert(ret.resolved === false)
+    }
+    {
+      val ret = cast(struct, IntegerType)
+      assert(ret.resolved === false)
+    }
+  }
+
+  test("complex casting") {
+    val complex = Literal.create(
+      Row(
+        Seq("123", "abc", ""),
+        Map("a" -> "123", "b" -> "abc", "c" -> ""),
+        Row(0)),
+      StructType(Seq(
+        StructField("a",
+          ArrayType(StringType, containsNull = false), nullable = true),
+        StructField("m",
+          MapType(StringType, StringType, valueContainsNull = false), nullable = true),
+        StructField("s",
+          StructType(Seq(
+            StructField("i", IntegerType, nullable = true)))))))
+
+    val ret = cast(complex, StructType(Seq(
+      StructField("a",
+        ArrayType(IntegerType, containsNull = true), nullable = true),
+      StructField("m",
+        MapType(StringType, BooleanType, valueContainsNull = false), nullable = true),
+      StructField("s",
+        StructType(Seq(
+          StructField("l", LongType, nullable = true)))))))
+
+    assert(ret.resolved === true)
+    checkEvaluation(ret, Row(
+      Seq(123, null, null),
+      Map("a" -> true, "b" -> true, "c" -> false),
+      Row(0L)))
+  }
+
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
similarity index 94%
rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
index 371a73181dad7..481b335d15dfd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/GeneratedEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen._
 
 /**
  * Additional tests for code generation.
  */
-class GeneratedEvaluationSuite extends ExpressionEvaluationSuite {
+class CodeGenerationSuite extends SparkFunSuite {
 
   test("multithreaded eval") {
     import scala.concurrent._
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
new file mode 100644
index 0000000000000..f151dd2a47f78
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.types._
+
+
+class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("CreateStruct") {
+    val row = Row(1, 2, 3)
+    val c1 = 'a.int.at(0).as("a")
+    val c3 = 'c.int.at(2).as("c")
+    checkEvaluation(CreateStruct(Seq(c1, c3)), Row(1, 3), row)
+  }
+
+  test("complex type") {
+    val row = create_row(
+      "^Ba*n",                                // 0
+      null.asInstanceOf[UTF8String],          // 1
+      create_row("aa", "bb"),                 // 2
+      Map("aa"->"bb"),                        // 3
+      Seq("aa", "bb")                         // 4
+    )
+
+    val typeS = StructType(
+      StructField("a", StringType, true) :: StructField("b", StringType, true) :: Nil
+    )
+    val typeMap = MapType(StringType, StringType)
+    val typeArray = ArrayType(StringType)
+
+    checkEvaluation(GetMapValue(BoundReference(3, typeMap, true),
+      Literal("aa")), "bb", row)
+    checkEvaluation(GetMapValue(Literal.create(null, typeMap), Literal("aa")), null, row)
+    checkEvaluation(
+      GetMapValue(Literal.create(null, typeMap), Literal.create(null, StringType)), null, row)
+    checkEvaluation(GetMapValue(BoundReference(3, typeMap, true),
+      Literal.create(null, StringType)), null, row)
+
+    checkEvaluation(GetArrayItem(BoundReference(4, typeArray, true),
+      Literal(1)), "bb", row)
+    checkEvaluation(GetArrayItem(Literal.create(null, typeArray), Literal(1)), null, row)
+    checkEvaluation(
+      GetArrayItem(Literal.create(null, typeArray), Literal.create(null, IntegerType)), null, row)
+    checkEvaluation(GetArrayItem(BoundReference(4, typeArray, true),
+      Literal.create(null, IntegerType)), null, row)
+
+    def getStructField(expr: Expression, fieldName: String): ExtractValue = {
+      expr.dataType match {
+        case StructType(fields) =>
+          val field = fields.find(_.name == fieldName).get
+          GetStructField(expr, field, fields.indexOf(field))
+      }
+    }
+
+    def quickResolve(u: UnresolvedExtractValue): ExtractValue = {
+      ExtractValue(u.child, u.extraction, _ == _)
+    }
+
+    checkEvaluation(getStructField(BoundReference(2, typeS, nullable = true), "a"), "aa", row)
+    checkEvaluation(getStructField(Literal.create(null, typeS), "a"), null, row)
+
+    val typeS_notNullable = StructType(
+      StructField("a", StringType, nullable = false)
+        :: StructField("b", StringType, nullable = false) :: Nil
+    )
+
+    assert(getStructField(BoundReference(2, typeS, nullable = true), "a").nullable === true)
+    assert(getStructField(BoundReference(2, typeS_notNullable, nullable = false), "a").nullable
+      === false)
+
+    assert(getStructField(Literal.create(null, typeS), "a").nullable === true)
+    assert(getStructField(Literal.create(null, typeS_notNullable), "a").nullable === true)
+
+    checkEvaluation(quickResolve('c.map(typeMap).at(3).getItem("aa")), "bb", row)
+    checkEvaluation(quickResolve('c.array(typeArray.elementType).at(4).getItem(1)), "bb", row)
+    checkEvaluation(quickResolve('c.struct(typeS).at(2).getField("a")), "aa", row)
+  }
+
+  test("error message of ExtractValue") {
+    val structType = StructType(StructField("a", StringType, true) :: Nil)
+    val arrayStructType = ArrayType(structType)
+    val arrayType = ArrayType(StringType)
+    val otherType = StringType
+
+    def checkErrorMessage(
+      childDataType: DataType,
+      fieldDataType: DataType,
+      errorMesage: String): Unit = {
+      val e = intercept[org.apache.spark.sql.AnalysisException] {
+        ExtractValue(
+          Literal.create(null, childDataType),
+          Literal.create(null, fieldDataType),
+          _ == _)
+      }
+      assert(e.getMessage().contains(errorMesage))
+    }
+
+    checkErrorMessage(structType, IntegerType, "Field name should be String Literal")
+    checkErrorMessage(arrayStructType, BooleanType, "Field name should be String Literal")
+    checkErrorMessage(arrayType, StringType, "Array index should be integral type")
+    checkErrorMessage(otherType, StringType, "Can't extract value from")
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
new file mode 100644
index 0000000000000..152c4e4111244
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.types.{IntegerType, BooleanType}
+
+
+class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("case when") {
+    val row = create_row(null, false, true, "a", "b", "c")
+    val c1 = 'a.boolean.at(0)
+    val c2 = 'a.boolean.at(1)
+    val c3 = 'a.boolean.at(2)
+    val c4 = 'a.string.at(3)
+    val c5 = 'a.string.at(4)
+    val c6 = 'a.string.at(5)
+
+    checkEvaluation(CaseWhen(Seq(c1, c4, c6)), "c", row)
+    checkEvaluation(CaseWhen(Seq(c2, c4, c6)), "c", row)
+    checkEvaluation(CaseWhen(Seq(c3, c4, c6)), "a", row)
+    checkEvaluation(CaseWhen(Seq(Literal.create(null, BooleanType), c4, c6)), "c", row)
+    checkEvaluation(CaseWhen(Seq(Literal.create(false, BooleanType), c4, c6)), "c", row)
+    checkEvaluation(CaseWhen(Seq(Literal.create(true, BooleanType), c4, c6)), "a", row)
+
+    checkEvaluation(CaseWhen(Seq(c3, c4, c2, c5, c6)), "a", row)
+    checkEvaluation(CaseWhen(Seq(c2, c4, c3, c5, c6)), "b", row)
+    checkEvaluation(CaseWhen(Seq(c1, c4, c2, c5, c6)), "c", row)
+    checkEvaluation(CaseWhen(Seq(c1, c4, c2, c5)), null, row)
+
+    assert(CaseWhen(Seq(c2, c4, c6)).nullable === true)
+    assert(CaseWhen(Seq(c2, c4, c3, c5, c6)).nullable === true)
+    assert(CaseWhen(Seq(c2, c4, c3, c5)).nullable === true)
+
+    val c4_notNull = 'a.boolean.notNull.at(3)
+    val c5_notNull = 'a.boolean.notNull.at(4)
+    val c6_notNull = 'a.boolean.notNull.at(5)
+
+    assert(CaseWhen(Seq(c2, c4_notNull, c6_notNull)).nullable === false)
+    assert(CaseWhen(Seq(c2, c4, c6_notNull)).nullable === true)
+    assert(CaseWhen(Seq(c2, c4_notNull, c6)).nullable === true)
+
+    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5_notNull, c6_notNull)).nullable === false)
+    assert(CaseWhen(Seq(c2, c4, c3, c5_notNull, c6_notNull)).nullable === true)
+    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5, c6_notNull)).nullable === true)
+    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5_notNull, c6)).nullable === true)
+
+    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5_notNull)).nullable === true)
+    assert(CaseWhen(Seq(c2, c4, c3, c5_notNull)).nullable === true)
+    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5)).nullable === true)
+  }
+
+  test("case key when") {
+    val row = create_row(null, 1, 2, "a", "b", "c")
+    val c1 = 'a.int.at(0)
+    val c2 = 'a.int.at(1)
+    val c3 = 'a.int.at(2)
+    val c4 = 'a.string.at(3)
+    val c5 = 'a.string.at(4)
+    val c6 = 'a.string.at(5)
+
+    val literalNull = Literal.create(null, IntegerType)
+    val literalInt = Literal(1)
+    val literalString = Literal("a")
+
+    checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, c5)), "b", row)
+    checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, literalNull, c5, c6)), "b", row)
+    checkEvaluation(CaseKeyWhen(c2, Seq(literalInt, c4, c5)), "a", row)
+    checkEvaluation(CaseKeyWhen(c2, Seq(c1, c4, c5)), "b", row)
+    checkEvaluation(CaseKeyWhen(c4, Seq(literalString, c2, c3)), 1, row)
+    checkEvaluation(CaseKeyWhen(c4, Seq(c6, c3, c5, c2, Literal(3))), 3, row)
+
+    checkEvaluation(CaseKeyWhen(literalInt, Seq(c2, c4, c5)), "a", row)
+    checkEvaluation(CaseKeyWhen(literalString, Seq(c5, c2, c4, c3)), 2, row)
+    checkEvaluation(CaseKeyWhen(c6, Seq(c5, c2, c4, c3)), null, row)
+    checkEvaluation(CaseKeyWhen(literalNull, Seq(c2, c5, c1, c6)), "c", row)
+  }
+
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
new file mode 100644
index 0000000000000..87a92b87962f8
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.scalactic.TripleEqualsSupport.Spread
+import org.scalatest.Matchers._
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateProjection, GenerateMutableProjection}
+
+/**
+ * A few helper functions for expression evaluation testing. Mixin this trait to use them.
+ */
+trait ExpressionEvalHelper {
+  self: SparkFunSuite =>
+
+  protected def create_row(values: Any*): Row = {
+    new GenericRow(values.map(CatalystTypeConverters.convertToCatalyst).toArray)
+  }
+
+  protected def checkEvaluation(
+      expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
+    checkEvaluationWithoutCodegen(expression, expected, inputRow)
+    checkEvaluationWithGeneratedMutableProjection(expression, expected, inputRow)
+    checkEvaluationWithGeneratedProjection(expression, expected, inputRow)
+  }
+
+  protected def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = {
+    expression.eval(inputRow)
+  }
+
+  protected def checkEvaluationWithoutCodegen(
+      expression: Expression,
+      expected: Any,
+      inputRow: Row = EmptyRow): Unit = {
+    val actual = try evaluate(expression, inputRow) catch {
+      case e: Exception => fail(s"Exception evaluating $expression", e)
+    }
+    if (actual != expected) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect evaluation (codegen off): $expression, " +
+        s"actual: $actual, " +
+        s"expected: $expected$input")
+    }
+  }
+
+  protected def checkEvaluationWithGeneratedMutableProjection(
+      expression: Expression,
+      expected: Any,
+      inputRow: Row = EmptyRow): Unit = {
+
+    val plan = try {
+      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
+    } catch {
+      case e: Throwable =>
+        val ctx = GenerateProjection.newCodeGenContext()
+        val evaluated = expression.gen(ctx)
+        fail(
+          s"""
+            |Code generation of $expression failed:
+            |${evaluated.code}
+            |$e
+          """.stripMargin)
+    }
+
+    val actual = plan(inputRow).apply(0)
+    if (actual != expected) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+    }
+  }
+
+  protected def checkEvaluationWithGeneratedProjection(
+      expression: Expression,
+      expected: Any,
+      inputRow: Row = EmptyRow): Unit = {
+    val ctx = GenerateProjection.newCodeGenContext()
+    lazy val evaluated = expression.gen(ctx)
+
+    val plan = try {
+      GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
+    } catch {
+      case e: Throwable =>
+        fail(
+          s"""
+            |Code generation of $expression failed:
+            |${evaluated.code}
+            |$e
+          """.stripMargin)
+    }
+
+    val actual = plan(inputRow)
+    val expectedRow = new GenericRow(Array[Any](CatalystTypeConverters.convertToCatalyst(expected)))
+    if (actual.hashCode() != expectedRow.hashCode()) {
+      fail(
+        s"""
+          |Mismatched hashCodes for values: $actual, $expectedRow
+          |Hash Codes: ${actual.hashCode()} != ${expectedRow.hashCode()}
+          |Expressions: $expression
+          |Code: $evaluated
+        """.stripMargin)
+    }
+    if (actual != expectedRow) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+    }
+  }
+
+  protected def checkDoubleEvaluation(
+      expression: Expression,
+      expected: Spread[Double],
+      inputRow: Row = EmptyRow): Unit = {
+    val actual = try evaluate(expression, inputRow) catch {
+      case e: Exception => fail(s"Exception evaluating $expression", e)
+    }
+    actual.asInstanceOf[Double] shouldBe expected
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
deleted file mode 100644
index eea2edc323eea..0000000000000
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ /dev/null
@@ -1,1461 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.expressions
-
-import java.sql.{Date, Timestamp}
-
-import scala.collection.immutable.HashSet
-
-import org.scalactic.TripleEqualsSupport.Spread
-import org.scalatest.Matchers._
-
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
-import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
-import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateProjection, GenerateMutableProjection}
-import org.apache.spark.sql.catalyst.expressions.mathfuncs._
-import org.apache.spark.sql.catalyst.util.DateUtils
-import org.apache.spark.sql.types._
-
-
-class ExpressionEvaluationBaseSuite extends SparkFunSuite {
-
-  def checkEvaluation(expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
-    checkEvaluationWithoutCodegen(expression, expected, inputRow)
-    checkEvaluationWithGeneratedMutableProjection(expression, expected, inputRow)
-    checkEvaluationWithGeneratedProjection(expression, expected, inputRow)
-  }
-
-  def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = {
-    expression.eval(inputRow)
-  }
-
-  def checkEvaluationWithoutCodegen(
-      expression: Expression,
-      expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
-    val actual = try evaluate(expression, inputRow) catch {
-      case e: Exception => fail(s"Exception evaluating $expression", e)
-    }
-    if (actual != expected) {
-      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
-    }
-  }
-
-  def checkEvaluationWithGeneratedMutableProjection(
-      expression: Expression,
-      expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
-
-    val plan = try {
-      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
-    } catch {
-      case e: Throwable =>
-        val ctx = GenerateProjection.newCodeGenContext()
-        val evaluated = expression.gen(ctx)
-        fail(
-          s"""
-            |Code generation of $expression failed:
-            |${evaluated.code}
-            |$e
-          """.stripMargin)
-    }
-
-    val actual = plan(inputRow).apply(0)
-    if (actual != expected) {
-      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
-    }
-  }
-
-  def checkEvaluationWithGeneratedProjection(
-      expression: Expression,
-      expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
-    val ctx = GenerateProjection.newCodeGenContext()
-    lazy val evaluated = expression.gen(ctx)
-
-    val plan = try {
-      GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
-    } catch {
-      case e: Throwable =>
-        fail(
-          s"""
-            |Code generation of $expression failed:
-            |${evaluated.code}
-            |$e
-          """.stripMargin)
-    }
-
-    val actual = plan(inputRow)
-    val expectedRow = new GenericRow(Array[Any](CatalystTypeConverters.convertToCatalyst(expected)))
-    if (actual.hashCode() != expectedRow.hashCode()) {
-      fail(
-        s"""
-          |Mismatched hashCodes for values: $actual, $expectedRow
-          |Hash Codes: ${actual.hashCode()} != ${expectedRow.hashCode()}
-          |Expressions: ${expression}
-          |Code: ${evaluated}
-        """.stripMargin)
-    }
-    if (actual != expectedRow) {
-      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
-    }
-  }
-
-  def checkDoubleEvaluation(
-      expression: Expression,
-      expected: Spread[Double],
-      inputRow: Row = EmptyRow): Unit = {
-    val actual = try evaluate(expression, inputRow) catch {
-      case e: Exception => fail(s"Exception evaluating $expression", e)
-    }
-    actual.asInstanceOf[Double] shouldBe expected
-  }
-}
-
-class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {
-
-  def create_row(values: Any*): Row = {
-    new GenericRow(values.map(CatalystTypeConverters.convertToCatalyst).toArray)
-  }
-
-  test("literals") {
-    checkEvaluation(Literal(1), 1)
-    checkEvaluation(Literal(true), true)
-    checkEvaluation(Literal(false), false)
-    checkEvaluation(Literal(0L), 0L)
-    List(0.0, -0.0, Double.NegativeInfinity, Double.PositiveInfinity).foreach {
-      d => {
-        checkEvaluation(Literal(d), d)
-        checkEvaluation(Literal(d.toFloat), d.toFloat)
-      }
-    }
-    checkEvaluation(Literal("test"), "test")
-    checkEvaluation(Literal.create(null, StringType), null)
-    checkEvaluation(Literal(1) + Literal(1), 2)
-  }
-
-  test("unary BitwiseNOT") {
-    checkEvaluation(BitwiseNot(1), -2)
-    assert(BitwiseNot(1).dataType === IntegerType)
-    assert(BitwiseNot(1).eval(EmptyRow).isInstanceOf[Int])
-    checkEvaluation(BitwiseNot(1.toLong), -2.toLong)
-    assert(BitwiseNot(1.toLong).dataType === LongType)
-    assert(BitwiseNot(1.toLong).eval(EmptyRow).isInstanceOf[Long])
-    checkEvaluation(BitwiseNot(1.toShort), -2.toShort)
-    assert(BitwiseNot(1.toShort).dataType === ShortType)
-    assert(BitwiseNot(1.toShort).eval(EmptyRow).isInstanceOf[Short])
-    checkEvaluation(BitwiseNot(1.toByte), -2.toByte)
-    assert(BitwiseNot(1.toByte).dataType === ByteType)
-    assert(BitwiseNot(1.toByte).eval(EmptyRow).isInstanceOf[Byte])
-  }
-
-  // scalastyle:off
-  /**
-   * Checks for three-valued-logic.  Based on:
-   * http://en.wikipedia.org/wiki/Null_(SQL)#Comparisons_with_NULL_and_the_three-valued_logic_.283VL.29
-   * I.e. in flat cpo "False -> Unknown -> True",
-   *   OR is lowest upper bound,
-   *   AND is greatest lower bound.
-   * p       q       p OR q  p AND q  p = q
-   * True    True    True    True     True
-   * True    False   True    False    False
-   * True    Unknown True    Unknown  Unknown
-   * False   True    True    False    False
-   * False   False   False   False    True
-   * False   Unknown Unknown False    Unknown
-   * Unknown True    True    Unknown  Unknown
-   * Unknown False   Unknown False    Unknown
-   * Unknown Unknown Unknown Unknown  Unknown
-   *
-   * p       NOT p
-   * True    False
-   * False   True
-   * Unknown Unknown
-   */
-  // scalastyle:on
-  val notTrueTable =
-    (true, false) ::
-    (false, true) ::
-    (null, null) :: Nil
-
-  test("3VL Not") {
-    notTrueTable.foreach {
-      case (v, answer) =>
-        checkEvaluation(!Literal.create(v, BooleanType), answer)
-    }
-  }
-
-  booleanLogicTest("AND", _ && _,
-    (true, true, true) ::
-    (true, false, false) ::
-    (true, null, null) ::
-    (false, true, false) ::
-    (false, false, false) ::
-    (false, null, false) ::
-    (null, true, null) ::
-    (null, false, false) ::
-    (null, null, null) :: Nil)
-
-  booleanLogicTest("OR", _ || _,
-    (true, true, true) ::
-    (true, false, true) ::
-    (true, null, true) ::
-    (false, true, true) ::
-    (false, false, false) ::
-    (false, null, null) ::
-    (null, true, true) ::
-    (null, false, null) ::
-    (null, null, null) :: Nil)
-
-  booleanLogicTest("=", _ === _,
-    (true, true, true) ::
-    (true, false, false) ::
-    (true, null, null) ::
-    (false, true, false) ::
-    (false, false, true) ::
-    (false, null, null) ::
-    (null, true, null) ::
-    (null, false, null) ::
-    (null, null, null) :: Nil)
-
-  def booleanLogicTest(
-      name: String,
-      op: (Expression, Expression) => Expression,
-      truthTable: Seq[(Any, Any, Any)]) {
-    test(s"3VL $name") {
-      truthTable.foreach {
-        case (l, r, answer) =>
-          val expr = op(Literal.create(l, BooleanType), Literal.create(r, BooleanType))
-          checkEvaluation(expr, answer)
-      }
-    }
-  }
-
-  test("IN") {
-    checkEvaluation(In(Literal(1), Seq(Literal(1), Literal(2))), true)
-    checkEvaluation(In(Literal(2), Seq(Literal(1), Literal(2))), true)
-    checkEvaluation(In(Literal(3), Seq(Literal(1), Literal(2))), false)
-    checkEvaluation(
-      In(Literal(1), Seq(Literal(1), Literal(2))) && In(Literal(2), Seq(Literal(1), Literal(2))),
-      true)
-  }
-
-  test("Divide") {
-    checkEvaluation(Divide(Literal(2), Literal(1)), 2)
-    checkEvaluation(Divide(Literal(1.0), Literal(2.0)), 0.5)
-    checkEvaluation(Divide(Literal(1), Literal(2)), 0)
-    checkEvaluation(Divide(Literal(1), Literal(0)), null)
-    checkEvaluation(Divide(Literal(1.0), Literal(0.0)), null)
-    checkEvaluation(Divide(Literal(0.0), Literal(0.0)), null)
-    checkEvaluation(Divide(Literal(0), Literal.create(null, IntegerType)), null)
-    checkEvaluation(Divide(Literal(1), Literal.create(null, IntegerType)), null)
-    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal(0)), null)
-    checkEvaluation(Divide(Literal.create(null, DoubleType), Literal(0.0)), null)
-    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal(1)), null)
-    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal.create(null, IntegerType)),
-      null)
-  }
-
-  test("Remainder") {
-    checkEvaluation(Remainder(Literal(2), Literal(1)), 0)
-    checkEvaluation(Remainder(Literal(1.0), Literal(2.0)), 1.0)
-    checkEvaluation(Remainder(Literal(1), Literal(2)), 1)
-    checkEvaluation(Remainder(Literal(1), Literal(0)), null)
-    checkEvaluation(Remainder(Literal(1.0), Literal(0.0)), null)
-    checkEvaluation(Remainder(Literal(0.0), Literal(0.0)), null)
-    checkEvaluation(Remainder(Literal(0), Literal.create(null, IntegerType)), null)
-    checkEvaluation(Remainder(Literal(1), Literal.create(null, IntegerType)), null)
-    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal(0)), null)
-    checkEvaluation(Remainder(Literal.create(null, DoubleType), Literal(0.0)), null)
-    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal(1)), null)
-    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal.create(null, IntegerType)),
-      null)
-  }
-
-  test("INSET") {
-    val hS = HashSet[Any]() + 1 + 2
-    val nS = HashSet[Any]() + 1 + 2 + null
-    val one = Literal(1)
-    val two = Literal(2)
-    val three = Literal(3)
-    val nl = Literal(null)
-    val s = Seq(one, two)
-    val nullS = Seq(one, two, null)
-    checkEvaluation(InSet(one, hS), true)
-    checkEvaluation(InSet(two, hS), true)
-    checkEvaluation(InSet(two, nS), true)
-    checkEvaluation(InSet(nl, nS), true)
-    checkEvaluation(InSet(three, hS), false)
-    checkEvaluation(InSet(three, nS), false)
-    checkEvaluation(InSet(one, hS) && InSet(two, hS), true)
-  }
-
-  test("MaxOf") {
-    checkEvaluation(MaxOf(1, 2), 2)
-    checkEvaluation(MaxOf(2, 1), 2)
-    checkEvaluation(MaxOf(1L, 2L), 2L)
-    checkEvaluation(MaxOf(2L, 1L), 2L)
-
-    checkEvaluation(MaxOf(Literal.create(null, IntegerType), 2), 2)
-    checkEvaluation(MaxOf(2, Literal.create(null, IntegerType)), 2)
-  }
-
-  test("MinOf") {
-    checkEvaluation(MinOf(1, 2), 1)
-    checkEvaluation(MinOf(2, 1), 1)
-    checkEvaluation(MinOf(1L, 2L), 1L)
-    checkEvaluation(MinOf(2L, 1L), 1L)
-
-    checkEvaluation(MinOf(Literal.create(null, IntegerType), 1), 1)
-    checkEvaluation(MinOf(1, Literal.create(null, IntegerType)), 1)
-  }
-
-  test("LIKE literal Regular Expression") {
-    checkEvaluation(Literal.create(null, StringType).like("a"), null)
-    checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
-    checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null)
-    checkEvaluation("abdef" like "abdef", true)
-    checkEvaluation("a_%b" like "a\\__b", true)
-    checkEvaluation("addb" like "a_%b", true)
-    checkEvaluation("addb" like "a\\__b", false)
-    checkEvaluation("addb" like "a%\\%b", false)
-    checkEvaluation("a_%b" like "a%\\%b", true)
-    checkEvaluation("addb" like "a%", true)
-    checkEvaluation("addb" like "**", false)
-    checkEvaluation("abc" like "a%", true)
-    checkEvaluation("abc"  like "b%", false)
-    checkEvaluation("abc"  like "bc%", false)
-    checkEvaluation("a\nb" like "a_b", true)
-    checkEvaluation("ab" like "a%b", true)
-    checkEvaluation("a\nb" like "a%b", true)
-  }
-
-  test("LIKE Non-literal Regular Expression") {
-    val regEx = 'a.string.at(0)
-    checkEvaluation("abcd" like regEx, null, create_row(null))
-    checkEvaluation("abdef" like regEx, true, create_row("abdef"))
-    checkEvaluation("a_%b" like regEx, true, create_row("a\\__b"))
-    checkEvaluation("addb" like regEx, true, create_row("a_%b"))
-    checkEvaluation("addb" like regEx, false, create_row("a\\__b"))
-    checkEvaluation("addb" like regEx, false, create_row("a%\\%b"))
-    checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b"))
-    checkEvaluation("addb" like regEx, true, create_row("a%"))
-    checkEvaluation("addb" like regEx, false, create_row("**"))
-    checkEvaluation("abc" like regEx, true, create_row("a%"))
-    checkEvaluation("abc" like regEx, false, create_row("b%"))
-    checkEvaluation("abc" like regEx, false, create_row("bc%"))
-    checkEvaluation("a\nb" like regEx, true, create_row("a_b"))
-    checkEvaluation("ab" like regEx, true, create_row("a%b"))
-    checkEvaluation("a\nb" like regEx, true, create_row("a%b"))
-
-    checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%"))
-  }
-
-  test("RLIKE literal Regular Expression") {
-    checkEvaluation(Literal.create(null, StringType) rlike "abdef", null)
-    checkEvaluation("abdef" rlike Literal.create(null, StringType), null)
-    checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null)
-    checkEvaluation("abdef" rlike "abdef", true)
-    checkEvaluation("abbbbc" rlike "a.*c", true)
-
-    checkEvaluation("fofo" rlike "^fo", true)
-    checkEvaluation("fo\no" rlike "^fo\no$", true)
-    checkEvaluation("Bn" rlike "^Ba*n", true)
-    checkEvaluation("afofo" rlike "fo", true)
-    checkEvaluation("afofo" rlike "^fo", false)
-    checkEvaluation("Baan" rlike "^Ba?n", false)
-    checkEvaluation("axe" rlike "pi|apa", false)
-    checkEvaluation("pip" rlike "^(pi)*$", false)
-
-    checkEvaluation("abc"  rlike "^ab", true)
-    checkEvaluation("abc"  rlike "^bc", false)
-    checkEvaluation("abc"  rlike "^ab", true)
-    checkEvaluation("abc"  rlike "^bc", false)
-
-    intercept[java.util.regex.PatternSyntaxException] {
-      evaluate("abbbbc" rlike "**")
-    }
-  }
-
-  test("RLIKE Non-literal Regular Expression") {
-    val regEx = 'a.string.at(0)
-    checkEvaluation("abdef" rlike regEx, true, create_row("abdef"))
-    checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c"))
-    checkEvaluation("fofo" rlike regEx, true, create_row("^fo"))
-    checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$"))
-    checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n"))
-
-    intercept[java.util.regex.PatternSyntaxException] {
-      evaluate("abbbbc" rlike regEx, create_row("**"))
-    }
-  }
-
-  test("data type casting") {
-
-    val sd = "1970-01-01"
-    val d = Date.valueOf(sd)
-    val zts = sd + " 00:00:00"
-    val sts = sd + " 00:00:02"
-    val nts = sts + ".1"
-    val ts = Timestamp.valueOf(nts)
-
-    checkEvaluation("abdef" cast StringType, "abdef")
-    checkEvaluation("abdef" cast DecimalType.Unlimited, null)
-    checkEvaluation("abdef" cast TimestampType, null)
-    checkEvaluation("12.65" cast DecimalType.Unlimited, Decimal(12.65))
-
-    checkEvaluation(Literal(1) cast LongType, 1.toLong)
-    checkEvaluation(Cast(Literal(1000) cast TimestampType, LongType), 1.toLong)
-    checkEvaluation(Cast(Literal(-1200) cast TimestampType, LongType), -2.toLong)
-    checkEvaluation(Cast(Literal(1.toDouble) cast TimestampType, DoubleType), 1.toDouble)
-    checkEvaluation(Cast(Literal(1.toDouble) cast TimestampType, DoubleType), 1.toDouble)
-
-    checkEvaluation(Cast(Literal(sd) cast DateType, StringType), sd)
-    checkEvaluation(Cast(Literal(d) cast StringType, DateType), 0)
-    checkEvaluation(Cast(Literal(nts) cast TimestampType, StringType), nts)
-    checkEvaluation(Cast(Literal(ts) cast StringType, TimestampType), ts)
-    // all convert to string type to check
-    checkEvaluation(
-      Cast(Cast(Literal(nts) cast TimestampType, DateType), StringType), sd)
-    checkEvaluation(
-      Cast(Cast(Literal(ts) cast DateType, TimestampType), StringType), zts)
-
-    checkEvaluation(Cast("abdef" cast BinaryType, StringType), "abdef")
-
-    checkEvaluation(Cast(Cast(Cast(Cast(
-      Cast("5" cast ByteType, ShortType), IntegerType), FloatType), DoubleType), LongType),
-      5.toLong)
-    checkEvaluation(Cast(Cast(Cast(Cast(Cast("5" cast
-      ByteType, TimestampType), DecimalType.Unlimited), LongType), StringType), ShortType),
-      0.toShort)
-    checkEvaluation(Cast(Cast(Cast(Cast(Cast("5" cast
-      TimestampType, ByteType), DecimalType.Unlimited), LongType), StringType), ShortType), null)
-    checkEvaluation(Cast(Cast(Cast(Cast(Cast("5" cast
-      DecimalType.Unlimited, ByteType), TimestampType), LongType), StringType), ShortType),
-      0.toShort)
-    checkEvaluation(Literal(true) cast IntegerType, 1)
-    checkEvaluation(Literal(false) cast IntegerType, 0)
-    checkEvaluation(Literal(true) cast StringType, "true")
-    checkEvaluation(Literal(false) cast StringType, "false")
-    checkEvaluation(Cast(Literal(1) cast BooleanType, IntegerType), 1)
-    checkEvaluation(Cast(Literal(0) cast BooleanType, IntegerType), 0)
-    checkEvaluation("23" cast DoubleType, 23d)
-    checkEvaluation("23" cast IntegerType, 23)
-    checkEvaluation("23" cast FloatType, 23f)
-    checkEvaluation("23" cast DecimalType.Unlimited, Decimal(23))
-    checkEvaluation("23" cast ByteType, 23.toByte)
-    checkEvaluation("23" cast ShortType, 23.toShort)
-    checkEvaluation("2012-12-11" cast DoubleType, null)
-    checkEvaluation(Literal(123) cast IntegerType, 123)
-
-    checkEvaluation(Literal(23d) + Cast(true, DoubleType), 24d)
-    checkEvaluation(Literal(23) + Cast(true, IntegerType), 24)
-    checkEvaluation(Literal(23f) + Cast(true, FloatType), 24f)
-    checkEvaluation(Literal(Decimal(23)) + Cast(true, DecimalType.Unlimited), Decimal(24))
-    checkEvaluation(Literal(23.toByte) + Cast(true, ByteType), 24.toByte)
-    checkEvaluation(Literal(23.toShort) + Cast(true, ShortType), 24.toShort)
-
-    intercept[Exception] {evaluate(Literal(1) cast BinaryType, null)}
-
-    assert(("abcdef" cast StringType).nullable === false)
-    assert(("abcdef" cast BinaryType).nullable === false)
-    assert(("abcdef" cast BooleanType).nullable === false)
-    assert(("abcdef" cast TimestampType).nullable === true)
-    assert(("abcdef" cast LongType).nullable === true)
-    assert(("abcdef" cast IntegerType).nullable === true)
-    assert(("abcdef" cast ShortType).nullable === true)
-    assert(("abcdef" cast ByteType).nullable === true)
-    assert(("abcdef" cast DecimalType.Unlimited).nullable === true)
-    assert(("abcdef" cast DecimalType(4, 2)).nullable === true)
-    assert(("abcdef" cast DoubleType).nullable === true)
-    assert(("abcdef" cast FloatType).nullable === true)
-
-    checkEvaluation(Cast(Literal.create(null, IntegerType), ShortType), null)
-  }
-
-  test("date") {
-    val d1 = DateUtils.fromJavaDate(Date.valueOf("1970-01-01"))
-    val d2 = DateUtils.fromJavaDate(Date.valueOf("1970-01-02"))
-    checkEvaluation(Literal(d1) < Literal(d2), true)
-  }
-
-  test("casting to fixed-precision decimals") {
-    // Overflow and rounding for casting to fixed-precision decimals:
-    // - Values should round with HALF_UP mode by default when you lower scale
-    // - Values that would overflow the target precision should turn into null
-    // - Because of this, casts to fixed-precision decimals should be nullable
-
-    assert(Cast(Literal(123), DecimalType.Unlimited).nullable === false)
-    assert(Cast(Literal(10.03f), DecimalType.Unlimited).nullable === true)
-    assert(Cast(Literal(10.03), DecimalType.Unlimited).nullable === true)
-    assert(Cast(Literal(Decimal(10.03)), DecimalType.Unlimited).nullable === false)
-
-    assert(Cast(Literal(123), DecimalType(2, 1)).nullable === true)
-    assert(Cast(Literal(10.03f), DecimalType(2, 1)).nullable === true)
-    assert(Cast(Literal(10.03), DecimalType(2, 1)).nullable === true)
-    assert(Cast(Literal(Decimal(10.03)), DecimalType(2, 1)).nullable === true)
-
-    checkEvaluation(Cast(Literal(123), DecimalType.Unlimited), Decimal(123))
-    checkEvaluation(Cast(Literal(123), DecimalType(3, 0)), Decimal(123))
-    checkEvaluation(Cast(Literal(123), DecimalType(3, 1)), null)
-    checkEvaluation(Cast(Literal(123), DecimalType(2, 0)), null)
-
-    checkEvaluation(Cast(Literal(10.03), DecimalType.Unlimited), Decimal(10.03))
-    checkEvaluation(Cast(Literal(10.03), DecimalType(4, 2)), Decimal(10.03))
-    checkEvaluation(Cast(Literal(10.03), DecimalType(3, 1)), Decimal(10.0))
-    checkEvaluation(Cast(Literal(10.03), DecimalType(2, 0)), Decimal(10))
-    checkEvaluation(Cast(Literal(10.03), DecimalType(1, 0)), null)
-    checkEvaluation(Cast(Literal(10.03), DecimalType(2, 1)), null)
-    checkEvaluation(Cast(Literal(10.03), DecimalType(3, 2)), null)
-    checkEvaluation(Cast(Literal(Decimal(10.03)), DecimalType(3, 1)), Decimal(10.0))
-    checkEvaluation(Cast(Literal(Decimal(10.03)), DecimalType(3, 2)), null)
-
-    checkEvaluation(Cast(Literal(10.05), DecimalType.Unlimited), Decimal(10.05))
-    checkEvaluation(Cast(Literal(10.05), DecimalType(4, 2)), Decimal(10.05))
-    checkEvaluation(Cast(Literal(10.05), DecimalType(3, 1)), Decimal(10.1))
-    checkEvaluation(Cast(Literal(10.05), DecimalType(2, 0)), Decimal(10))
-    checkEvaluation(Cast(Literal(10.05), DecimalType(1, 0)), null)
-    checkEvaluation(Cast(Literal(10.05), DecimalType(2, 1)), null)
-    checkEvaluation(Cast(Literal(10.05), DecimalType(3, 2)), null)
-    checkEvaluation(Cast(Literal(Decimal(10.05)), DecimalType(3, 1)), Decimal(10.1))
-    checkEvaluation(Cast(Literal(Decimal(10.05)), DecimalType(3, 2)), null)
-
-    checkEvaluation(Cast(Literal(9.95), DecimalType(3, 2)), Decimal(9.95))
-    checkEvaluation(Cast(Literal(9.95), DecimalType(3, 1)), Decimal(10.0))
-    checkEvaluation(Cast(Literal(9.95), DecimalType(2, 0)), Decimal(10))
-    checkEvaluation(Cast(Literal(9.95), DecimalType(2, 1)), null)
-    checkEvaluation(Cast(Literal(9.95), DecimalType(1, 0)), null)
-    checkEvaluation(Cast(Literal(Decimal(9.95)), DecimalType(3, 1)), Decimal(10.0))
-    checkEvaluation(Cast(Literal(Decimal(9.95)), DecimalType(1, 0)), null)
-
-    checkEvaluation(Cast(Literal(-9.95), DecimalType(3, 2)), Decimal(-9.95))
-    checkEvaluation(Cast(Literal(-9.95), DecimalType(3, 1)), Decimal(-10.0))
-    checkEvaluation(Cast(Literal(-9.95), DecimalType(2, 0)), Decimal(-10))
-    checkEvaluation(Cast(Literal(-9.95), DecimalType(2, 1)), null)
-    checkEvaluation(Cast(Literal(-9.95), DecimalType(1, 0)), null)
-    checkEvaluation(Cast(Literal(Decimal(-9.95)), DecimalType(3, 1)), Decimal(-10.0))
-    checkEvaluation(Cast(Literal(Decimal(-9.95)), DecimalType(1, 0)), null)
-
-    checkEvaluation(Cast(Literal(Double.NaN), DecimalType.Unlimited), null)
-    checkEvaluation(Cast(Literal(1.0 / 0.0), DecimalType.Unlimited), null)
-    checkEvaluation(Cast(Literal(Float.NaN), DecimalType.Unlimited), null)
-    checkEvaluation(Cast(Literal(1.0f / 0.0f), DecimalType.Unlimited), null)
-
-    checkEvaluation(Cast(Literal(Double.NaN), DecimalType(2, 1)), null)
-    checkEvaluation(Cast(Literal(1.0 / 0.0), DecimalType(2, 1)), null)
-    checkEvaluation(Cast(Literal(Float.NaN), DecimalType(2, 1)), null)
-    checkEvaluation(Cast(Literal(1.0f / 0.0f), DecimalType(2, 1)), null)
-  }
-
-  test("timestamp") {
-    val ts1 = new Timestamp(12)
-    val ts2 = new Timestamp(123)
-    checkEvaluation(Literal("ab") < Literal("abc"), true)
-    checkEvaluation(Literal(ts1) < Literal(ts2), true)
-  }
-
-  test("date casting") {
-    val d = Date.valueOf("1970-01-01")
-    checkEvaluation(Cast(Literal(d), ShortType), null)
-    checkEvaluation(Cast(Literal(d), IntegerType), null)
-    checkEvaluation(Cast(Literal(d), LongType), null)
-    checkEvaluation(Cast(Literal(d), FloatType), null)
-    checkEvaluation(Cast(Literal(d), DoubleType), null)
-    checkEvaluation(Cast(Literal(d), DecimalType.Unlimited), null)
-    checkEvaluation(Cast(Literal(d), DecimalType(10, 2)), null)
-    checkEvaluation(Cast(Literal(d), StringType), "1970-01-01")
-    checkEvaluation(Cast(Cast(Literal(d), TimestampType), StringType), "1970-01-01 00:00:00")
-  }
-
-  test("timestamp casting") {
-    val millis = 15 * 1000 + 2
-    val seconds = millis * 1000 + 2
-    val ts = new Timestamp(millis)
-    val tss = new Timestamp(seconds)
-    checkEvaluation(Cast(ts, ShortType), 15.toShort)
-    checkEvaluation(Cast(ts, IntegerType), 15)
-    checkEvaluation(Cast(ts, LongType), 15.toLong)
-    checkEvaluation(Cast(ts, FloatType), 15.002f)
-    checkEvaluation(Cast(ts, DoubleType), 15.002)
-    checkEvaluation(Cast(Cast(tss, ShortType), TimestampType), ts)
-    checkEvaluation(Cast(Cast(tss, IntegerType), TimestampType), ts)
-    checkEvaluation(Cast(Cast(tss, LongType), TimestampType), ts)
-    checkEvaluation(Cast(Cast(millis.toFloat / 1000, TimestampType), FloatType),
-      millis.toFloat / 1000)
-    checkEvaluation(Cast(Cast(millis.toDouble / 1000, TimestampType), DoubleType),
-      millis.toDouble / 1000)
-    checkEvaluation(Cast(Literal(Decimal(1)) cast TimestampType, DecimalType.Unlimited), Decimal(1))
-
-    // A test for higher precision than millis
-    checkEvaluation(Cast(Cast(0.00000001, TimestampType), DoubleType), 0.00000001)
-
-    checkEvaluation(Cast(Literal(Double.NaN), TimestampType), null)
-    checkEvaluation(Cast(Literal(1.0 / 0.0), TimestampType), null)
-    checkEvaluation(Cast(Literal(Float.NaN), TimestampType), null)
-    checkEvaluation(Cast(Literal(1.0f / 0.0f), TimestampType), null)
-  }
-
-  test("array casting") {
-    val array = Literal.create(Seq("123", "abc", "", null),
-      ArrayType(StringType, containsNull = true))
-    val array_notNull = Literal.create(Seq("123", "abc", ""),
-      ArrayType(StringType, containsNull = false))
-
-    {
-      val cast = Cast(array, ArrayType(IntegerType, containsNull = true))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Seq(123, null, null, null))
-    }
-    {
-      val cast = Cast(array, ArrayType(IntegerType, containsNull = false))
-      assert(cast.resolved === false)
-    }
-    {
-      val cast = Cast(array, ArrayType(BooleanType, containsNull = true))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Seq(true, true, false, null))
-    }
-    {
-      val cast = Cast(array, ArrayType(BooleanType, containsNull = false))
-      assert(cast.resolved === false)
-    }
-
-    {
-      val cast = Cast(array_notNull, ArrayType(IntegerType, containsNull = true))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Seq(123, null, null))
-    }
-    {
-      val cast = Cast(array_notNull, ArrayType(IntegerType, containsNull = false))
-      assert(cast.resolved === false)
-    }
-    {
-      val cast = Cast(array_notNull, ArrayType(BooleanType, containsNull = true))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Seq(true, true, false))
-    }
-    {
-      val cast = Cast(array_notNull, ArrayType(BooleanType, containsNull = false))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Seq(true, true, false))
-    }
-
-    {
-      val cast = Cast(array, IntegerType)
-      assert(cast.resolved === false)
-    }
-  }
-
-  test("map casting") {
-    val map = Literal.create(
-      Map("a" -> "123", "b" -> "abc", "c" -> "", "d" -> null),
-      MapType(StringType, StringType, valueContainsNull = true))
-    val map_notNull = Literal.create(
-      Map("a" -> "123", "b" -> "abc", "c" -> ""),
-      MapType(StringType, StringType, valueContainsNull = false))
-
-    {
-      val cast = Cast(map, MapType(StringType, IntegerType, valueContainsNull = true))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Map("a" -> 123, "b" -> null, "c" -> null, "d" -> null))
-    }
-    {
-      val cast = Cast(map, MapType(StringType, IntegerType, valueContainsNull = false))
-      assert(cast.resolved === false)
-    }
-    {
-      val cast = Cast(map, MapType(StringType, BooleanType, valueContainsNull = true))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Map("a" -> true, "b" -> true, "c" -> false, "d" -> null))
-    }
-    {
-      val cast = Cast(map, MapType(StringType, BooleanType, valueContainsNull = false))
-      assert(cast.resolved === false)
-    }
-    {
-      val cast = Cast(map, MapType(IntegerType, StringType, valueContainsNull = true))
-      assert(cast.resolved === false)
-    }
-
-    {
-      val cast = Cast(map_notNull, MapType(StringType, IntegerType, valueContainsNull = true))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Map("a" -> 123, "b" -> null, "c" -> null))
-    }
-    {
-      val cast = Cast(map_notNull, MapType(StringType, IntegerType, valueContainsNull = false))
-      assert(cast.resolved === false)
-    }
-    {
-      val cast = Cast(map_notNull, MapType(StringType, BooleanType, valueContainsNull = true))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Map("a" -> true, "b" -> true, "c" -> false))
-    }
-    {
-      val cast = Cast(map_notNull, MapType(StringType, BooleanType, valueContainsNull = false))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Map("a" -> true, "b" -> true, "c" -> false))
-    }
-    {
-      val cast = Cast(map_notNull, MapType(IntegerType, StringType, valueContainsNull = true))
-      assert(cast.resolved === false)
-    }
-
-    {
-      val cast = Cast(map, IntegerType)
-      assert(cast.resolved === false)
-    }
-  }
-
-  test("struct casting") {
-    val struct = Literal.create(
-      Row("123", "abc", "", null),
-      StructType(Seq(
-        StructField("a", StringType, nullable = true),
-        StructField("b", StringType, nullable = true),
-        StructField("c", StringType, nullable = true),
-        StructField("d", StringType, nullable = true))))
-    val struct_notNull = Literal.create(
-      Row("123", "abc", ""),
-      StructType(Seq(
-        StructField("a", StringType, nullable = false),
-        StructField("b", StringType, nullable = false),
-        StructField("c", StringType, nullable = false))))
-
-    {
-      val cast = Cast(struct, StructType(Seq(
-        StructField("a", IntegerType, nullable = true),
-        StructField("b", IntegerType, nullable = true),
-        StructField("c", IntegerType, nullable = true),
-        StructField("d", IntegerType, nullable = true))))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Row(123, null, null, null))
-    }
-    {
-      val cast = Cast(struct, StructType(Seq(
-        StructField("a", IntegerType, nullable = true),
-        StructField("b", IntegerType, nullable = true),
-        StructField("c", IntegerType, nullable = false),
-        StructField("d", IntegerType, nullable = true))))
-      assert(cast.resolved === false)
-    }
-    {
-      val cast = Cast(struct, StructType(Seq(
-        StructField("a", BooleanType, nullable = true),
-        StructField("b", BooleanType, nullable = true),
-        StructField("c", BooleanType, nullable = true),
-        StructField("d", BooleanType, nullable = true))))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Row(true, true, false, null))
-    }
-    {
-      val cast = Cast(struct, StructType(Seq(
-        StructField("a", BooleanType, nullable = true),
-        StructField("b", BooleanType, nullable = true),
-        StructField("c", BooleanType, nullable = false),
-        StructField("d", BooleanType, nullable = true))))
-      assert(cast.resolved === false)
-    }
-
-    {
-      val cast = Cast(struct_notNull, StructType(Seq(
-        StructField("a", IntegerType, nullable = true),
-        StructField("b", IntegerType, nullable = true),
-        StructField("c", IntegerType, nullable = true))))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Row(123, null, null))
-    }
-    {
-      val cast = Cast(struct_notNull, StructType(Seq(
-        StructField("a", IntegerType, nullable = true),
-        StructField("b", IntegerType, nullable = true),
-        StructField("c", IntegerType, nullable = false))))
-      assert(cast.resolved === false)
-    }
-    {
-      val cast = Cast(struct_notNull, StructType(Seq(
-        StructField("a", BooleanType, nullable = true),
-        StructField("b", BooleanType, nullable = true),
-        StructField("c", BooleanType, nullable = true))))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Row(true, true, false))
-    }
-    {
-      val cast = Cast(struct_notNull, StructType(Seq(
-        StructField("a", BooleanType, nullable = true),
-        StructField("b", BooleanType, nullable = true),
-        StructField("c", BooleanType, nullable = false))))
-      assert(cast.resolved === true)
-      checkEvaluation(cast, Row(true, true, false))
-    }
-
-    {
-      val cast = Cast(struct, StructType(Seq(
-        StructField("a", StringType, nullable = true),
-        StructField("b", StringType, nullable = true),
-        StructField("c", StringType, nullable = true))))
-      assert(cast.resolved === false)
-    }
-    {
-      val cast = Cast(struct, IntegerType)
-      assert(cast.resolved === false)
-    }
-  }
-
-  test("complex casting") {
-    val complex = Literal.create(
-      Row(
-        Seq("123", "abc", ""),
-        Map("a" -> "123", "b" -> "abc", "c" -> ""),
-        Row(0)),
-      StructType(Seq(
-        StructField("a",
-          ArrayType(StringType, containsNull = false), nullable = true),
-        StructField("m",
-          MapType(StringType, StringType, valueContainsNull = false), nullable = true),
-        StructField("s",
-          StructType(Seq(
-            StructField("i", IntegerType, nullable = true)))))))
-
-    val cast = Cast(complex, StructType(Seq(
-      StructField("a",
-        ArrayType(IntegerType, containsNull = true), nullable = true),
-      StructField("m",
-        MapType(StringType, BooleanType, valueContainsNull = false), nullable = true),
-      StructField("s",
-        StructType(Seq(
-          StructField("l", LongType, nullable = true)))))))
-
-    assert(cast.resolved === true)
-    checkEvaluation(cast, Row(
-      Seq(123, null, null),
-      Map("a" -> true, "b" -> true, "c" -> false),
-      Row(0L)))
-  }
-
-  test("null checking") {
-    val row = create_row("^Ba*n", null, true, null)
-    val c1 = 'a.string.at(0)
-    val c2 = 'a.string.at(1)
-    val c3 = 'a.boolean.at(2)
-    val c4 = 'a.boolean.at(3)
-
-    checkEvaluation(c1.isNull, false, row)
-    checkEvaluation(c1.isNotNull, true, row)
-
-    checkEvaluation(c2.isNull, true, row)
-    checkEvaluation(c2.isNotNull, false, row)
-
-    checkEvaluation(Literal.create(1, ShortType).isNull, false)
-    checkEvaluation(Literal.create(1, ShortType).isNotNull, true)
-
-    checkEvaluation(Literal.create(null, ShortType).isNull, true)
-    checkEvaluation(Literal.create(null, ShortType).isNotNull, false)
-
-    checkEvaluation(Coalesce(c1 :: c2 :: Nil), "^Ba*n", row)
-    checkEvaluation(Coalesce(Literal.create(null, StringType) :: Nil), null, row)
-    checkEvaluation(Coalesce(Literal.create(null, StringType) :: c1 :: c2 :: Nil), "^Ba*n", row)
-
-    checkEvaluation(
-      If(c3, Literal.create("a", StringType), Literal.create("b", StringType)), "a", row)
-    checkEvaluation(If(c3, c1, c2), "^Ba*n", row)
-    checkEvaluation(If(c4, c2, c1), "^Ba*n", row)
-    checkEvaluation(If(Literal.create(null, BooleanType), c2, c1), "^Ba*n", row)
-    checkEvaluation(If(Literal.create(true, BooleanType), c1, c2), "^Ba*n", row)
-    checkEvaluation(If(Literal.create(false, BooleanType), c2, c1), "^Ba*n", row)
-    checkEvaluation(If(Literal.create(false, BooleanType),
-      Literal.create("a", StringType), Literal.create("b", StringType)), "b", row)
-
-    checkEvaluation(c1 in (c1, c2), true, row)
-    checkEvaluation(
-      Literal.create("^Ba*n", StringType) in (Literal.create("^Ba*n", StringType)), true, row)
-    checkEvaluation(
-      Literal.create("^Ba*n", StringType) in (Literal.create("^Ba*n", StringType), c2), true, row)
-  }
-
-  test("case when") {
-    val row = create_row(null, false, true, "a", "b", "c")
-    val c1 = 'a.boolean.at(0)
-    val c2 = 'a.boolean.at(1)
-    val c3 = 'a.boolean.at(2)
-    val c4 = 'a.string.at(3)
-    val c5 = 'a.string.at(4)
-    val c6 = 'a.string.at(5)
-
-    checkEvaluation(CaseWhen(Seq(c1, c4, c6)), "c", row)
-    checkEvaluation(CaseWhen(Seq(c2, c4, c6)), "c", row)
-    checkEvaluation(CaseWhen(Seq(c3, c4, c6)), "a", row)
-    checkEvaluation(CaseWhen(Seq(Literal.create(null, BooleanType), c4, c6)), "c", row)
-    checkEvaluation(CaseWhen(Seq(Literal.create(false, BooleanType), c4, c6)), "c", row)
-    checkEvaluation(CaseWhen(Seq(Literal.create(true, BooleanType), c4, c6)), "a", row)
-
-    checkEvaluation(CaseWhen(Seq(c3, c4, c2, c5, c6)), "a", row)
-    checkEvaluation(CaseWhen(Seq(c2, c4, c3, c5, c6)), "b", row)
-    checkEvaluation(CaseWhen(Seq(c1, c4, c2, c5, c6)), "c", row)
-    checkEvaluation(CaseWhen(Seq(c1, c4, c2, c5)), null, row)
-
-    assert(CaseWhen(Seq(c2, c4, c6)).nullable === true)
-    assert(CaseWhen(Seq(c2, c4, c3, c5, c6)).nullable === true)
-    assert(CaseWhen(Seq(c2, c4, c3, c5)).nullable === true)
-
-    val c4_notNull = 'a.boolean.notNull.at(3)
-    val c5_notNull = 'a.boolean.notNull.at(4)
-    val c6_notNull = 'a.boolean.notNull.at(5)
-
-    assert(CaseWhen(Seq(c2, c4_notNull, c6_notNull)).nullable === false)
-    assert(CaseWhen(Seq(c2, c4, c6_notNull)).nullable === true)
-    assert(CaseWhen(Seq(c2, c4_notNull, c6)).nullable === true)
-
-    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5_notNull, c6_notNull)).nullable === false)
-    assert(CaseWhen(Seq(c2, c4, c3, c5_notNull, c6_notNull)).nullable === true)
-    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5, c6_notNull)).nullable === true)
-    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5_notNull, c6)).nullable === true)
-
-    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5_notNull)).nullable === true)
-    assert(CaseWhen(Seq(c2, c4, c3, c5_notNull)).nullable === true)
-    assert(CaseWhen(Seq(c2, c4_notNull, c3, c5)).nullable === true)
-  }
-
-  test("case key when") {
-    val row = create_row(null, 1, 2, "a", "b", "c")
-    val c1 = 'a.int.at(0)
-    val c2 = 'a.int.at(1)
-    val c3 = 'a.int.at(2)
-    val c4 = 'a.string.at(3)
-    val c5 = 'a.string.at(4)
-    val c6 = 'a.string.at(5)
-
-    val literalNull = Literal.create(null, IntegerType)
-    val literalInt = Literal(1)
-    val literalString = Literal("a")
-
-    checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, c5)), "b", row)
-    checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, literalNull, c5, c6)), "b", row)
-    checkEvaluation(CaseKeyWhen(c2, Seq(literalInt, c4, c5)), "a", row)
-    checkEvaluation(CaseKeyWhen(c2, Seq(c1, c4, c5)), "b", row)
-    checkEvaluation(CaseKeyWhen(c4, Seq(literalString, c2, c3)), 1, row)
-    checkEvaluation(CaseKeyWhen(c4, Seq(c6, c3, c5, c2, Literal(3))), 3, row)
-
-    checkEvaluation(CaseKeyWhen(literalInt, Seq(c2, c4, c5)), "a", row)
-    checkEvaluation(CaseKeyWhen(literalString, Seq(c5, c2, c4, c3)), 2, row)
-    checkEvaluation(CaseKeyWhen(c6, Seq(c5, c2, c4, c3)), null, row)
-    checkEvaluation(CaseKeyWhen(literalNull, Seq(c2, c5, c1, c6)), "c", row)
-  }
-
-  test("complex type") {
-    val row = create_row(
-      "^Ba*n",                                // 0
-      null.asInstanceOf[UTF8String],          // 1
-      create_row("aa", "bb"),                 // 2
-      Map("aa"->"bb"),                        // 3
-      Seq("aa", "bb")                         // 4
-    )
-
-    val typeS = StructType(
-      StructField("a", StringType, true) :: StructField("b", StringType, true) :: Nil
-    )
-    val typeMap = MapType(StringType, StringType)
-    val typeArray = ArrayType(StringType)
-
-    checkEvaluation(GetMapValue(BoundReference(3, typeMap, true),
-      Literal("aa")), "bb", row)
-    checkEvaluation(GetMapValue(Literal.create(null, typeMap), Literal("aa")), null, row)
-    checkEvaluation(
-      GetMapValue(Literal.create(null, typeMap), Literal.create(null, StringType)), null, row)
-    checkEvaluation(GetMapValue(BoundReference(3, typeMap, true),
-      Literal.create(null, StringType)), null, row)
-
-    checkEvaluation(GetArrayItem(BoundReference(4, typeArray, true),
-      Literal(1)), "bb", row)
-    checkEvaluation(GetArrayItem(Literal.create(null, typeArray), Literal(1)), null, row)
-    checkEvaluation(
-      GetArrayItem(Literal.create(null, typeArray), Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(GetArrayItem(BoundReference(4, typeArray, true),
-      Literal.create(null, IntegerType)), null, row)
-
-    def getStructField(expr: Expression, fieldName: String): ExtractValue = {
-      expr.dataType match {
-        case StructType(fields) =>
-          val field = fields.find(_.name == fieldName).get
-          GetStructField(expr, field, fields.indexOf(field))
-      }
-    }
-
-    def quickResolve(u: UnresolvedExtractValue): ExtractValue = {
-      ExtractValue(u.child, u.extraction, _ == _)
-    }
-
-    checkEvaluation(getStructField(BoundReference(2, typeS, nullable = true), "a"), "aa", row)
-    checkEvaluation(getStructField(Literal.create(null, typeS), "a"), null, row)
-
-    val typeS_notNullable = StructType(
-      StructField("a", StringType, nullable = false)
-        :: StructField("b", StringType, nullable = false) :: Nil
-    )
-
-    assert(getStructField(BoundReference(2, typeS, nullable = true), "a").nullable === true)
-    assert(getStructField(BoundReference(2, typeS_notNullable, nullable = false), "a").nullable
-      === false)
-
-    assert(getStructField(Literal.create(null, typeS), "a").nullable === true)
-    assert(getStructField(Literal.create(null, typeS_notNullable), "a").nullable === true)
-
-    checkEvaluation(quickResolve('c.map(typeMap).at(3).getItem("aa")), "bb", row)
-    checkEvaluation(quickResolve('c.array(typeArray.elementType).at(4).getItem(1)), "bb", row)
-    checkEvaluation(quickResolve('c.struct(typeS).at(2).getField("a")), "aa", row)
-  }
-
-  test("error message of ExtractValue") {
-    val structType = StructType(StructField("a", StringType, true) :: Nil)
-    val arrayStructType = ArrayType(structType)
-    val arrayType = ArrayType(StringType)
-    val otherType = StringType
-
-    def checkErrorMessage(
-        childDataType: DataType,
-        fieldDataType: DataType,
-        errorMesage: String): Unit = {
-      val e = intercept[org.apache.spark.sql.AnalysisException] {
-        ExtractValue(
-          Literal.create(null, childDataType),
-          Literal.create(null, fieldDataType),
-          _ == _)
-      }
-      assert(e.getMessage().contains(errorMesage))
-    }
-
-    checkErrorMessage(structType, IntegerType, "Field name should be String Literal")
-    checkErrorMessage(arrayStructType, BooleanType, "Field name should be String Literal")
-    checkErrorMessage(arrayType, StringType, "Array index should be integral type")
-    checkErrorMessage(otherType, StringType, "Can't extract value from")
-  }
-
-  test("arithmetic") {
-    val row = create_row(1, 2, 3, null)
-    val c1 = 'a.int.at(0)
-    val c2 = 'a.int.at(1)
-    val c3 = 'a.int.at(2)
-    val c4 = 'a.int.at(3)
-
-    checkEvaluation(UnaryMinus(c1), -1, row)
-    checkEvaluation(UnaryMinus(Literal.create(100, IntegerType)), -100)
-
-    checkEvaluation(Add(c1, c4), null, row)
-    checkEvaluation(Add(c1, c2), 3, row)
-    checkEvaluation(Add(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(Add(Literal.create(null, IntegerType), c2), null, row)
-    checkEvaluation(
-      Add(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(-c1, -1, row)
-    checkEvaluation(c1 + c2, 3, row)
-    checkEvaluation(c1 - c2, -1, row)
-    checkEvaluation(c1 * c2, 2, row)
-    checkEvaluation(c1 / c2, 0, row)
-    checkEvaluation(c1 % c2, 1, row)
-  }
-
-  test("fractional arithmetic") {
-    val row = create_row(1.1, 2.0, 3.1, null)
-    val c1 = 'a.double.at(0)
-    val c2 = 'a.double.at(1)
-    val c3 = 'a.double.at(2)
-    val c4 = 'a.double.at(3)
-
-    checkEvaluation(UnaryMinus(c1), -1.1, row)
-    checkEvaluation(UnaryMinus(Literal.create(100.0, DoubleType)), -100.0)
-    checkEvaluation(Add(c1, c4), null, row)
-    checkEvaluation(Add(c1, c2), 3.1, row)
-    checkEvaluation(Add(c1, Literal.create(null, DoubleType)), null, row)
-    checkEvaluation(Add(Literal.create(null, DoubleType), c2), null, row)
-    checkEvaluation(
-      Add(Literal.create(null, DoubleType), Literal.create(null, DoubleType)), null, row)
-
-    checkEvaluation(-c1, -1.1, row)
-    checkEvaluation(c1 + c2, 3.1, row)
-    checkDoubleEvaluation(c1 - c2, (-0.9 +- 0.001), row)
-    checkDoubleEvaluation(c1 * c2, (2.2 +- 0.001), row)
-    checkDoubleEvaluation(c1 / c2, (0.55 +- 0.001), row)
-    checkDoubleEvaluation(c3 % c2, (1.1 +- 0.001), row)
-  }
-
-  test("BinaryComparison") {
-    val row = create_row(1, 2, 3, null, 3, null)
-    val c1 = 'a.int.at(0)
-    val c2 = 'a.int.at(1)
-    val c3 = 'a.int.at(2)
-    val c4 = 'a.int.at(3)
-    val c5 = 'a.int.at(4)
-    val c6 = 'a.int.at(5)
-
-    checkEvaluation(LessThan(c1, c4), null, row)
-    checkEvaluation(LessThan(c1, c2), true, row)
-    checkEvaluation(LessThan(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(LessThan(Literal.create(null, IntegerType), c2), null, row)
-    checkEvaluation(
-      LessThan(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(c1 < c2, true, row)
-    checkEvaluation(c1 <= c2, true, row)
-    checkEvaluation(c1 > c2, false, row)
-    checkEvaluation(c1 >= c2, false, row)
-    checkEvaluation(c1 === c2, false, row)
-    checkEvaluation(c1 !== c2, true, row)
-    checkEvaluation(c4 <=> c1, false, row)
-    checkEvaluation(c1 <=> c4, false, row)
-    checkEvaluation(c4 <=> c6, true, row)
-    checkEvaluation(c3 <=> c5, true, row)
-    checkEvaluation(Literal(true) <=> Literal.create(null, BooleanType), false, row)
-    checkEvaluation(Literal.create(null, BooleanType) <=> Literal(true), false, row)
-  }
-
-  test("StringComparison") {
-    val row = create_row("abc", null)
-    val c1 = 'a.string.at(0)
-    val c2 = 'a.string.at(1)
-
-    checkEvaluation(c1 contains "b", true, row)
-    checkEvaluation(c1 contains "x", false, row)
-    checkEvaluation(c2 contains "b", null, row)
-    checkEvaluation(c1 contains Literal.create(null, StringType), null, row)
-
-    checkEvaluation(c1 startsWith "a", true, row)
-    checkEvaluation(c1 startsWith "b", false, row)
-    checkEvaluation(c2 startsWith "a", null, row)
-    checkEvaluation(c1 startsWith Literal.create(null, StringType), null, row)
-
-    checkEvaluation(c1 endsWith "c", true, row)
-    checkEvaluation(c1 endsWith "b", false, row)
-    checkEvaluation(c2 endsWith "b", null, row)
-    checkEvaluation(c1 endsWith Literal.create(null, StringType), null, row)
-  }
-
-  test("Substring") {
-    val row = create_row("example", "example".toArray.map(_.toByte))
-
-    val s = 'a.string.at(0)
-
-    // substring from zero position with less-than-full length
-    checkEvaluation(
-      Substring(s, Literal.create(0, IntegerType), Literal.create(2, IntegerType)), "ex", row)
-    checkEvaluation(
-      Substring(s, Literal.create(1, IntegerType), Literal.create(2, IntegerType)), "ex", row)
-
-    // substring from zero position with full length
-    checkEvaluation(
-      Substring(s, Literal.create(0, IntegerType), Literal.create(7, IntegerType)), "example", row)
-    checkEvaluation(
-      Substring(s, Literal.create(1, IntegerType), Literal.create(7, IntegerType)), "example", row)
-
-    // substring from zero position with greater-than-full length
-    checkEvaluation(Substring(s, Literal.create(0, IntegerType), Literal.create(100, IntegerType)),
-      "example", row)
-    checkEvaluation(Substring(s, Literal.create(1, IntegerType), Literal.create(100, IntegerType)),
-      "example", row)
-
-    // substring from nonzero position with less-than-full length
-    checkEvaluation(Substring(s, Literal.create(2, IntegerType), Literal.create(2, IntegerType)),
-      "xa", row)
-
-    // substring from nonzero position with full length
-    checkEvaluation(Substring(s, Literal.create(2, IntegerType), Literal.create(6, IntegerType)),
-      "xample", row)
-
-    // substring from nonzero position with greater-than-full length
-    checkEvaluation(Substring(s, Literal.create(2, IntegerType), Literal.create(100, IntegerType)),
-      "xample", row)
-
-    // zero-length substring (within string bounds)
-    checkEvaluation(Substring(s, Literal.create(0, IntegerType), Literal.create(0, IntegerType)),
-      "", row)
-
-    // zero-length substring (beyond string bounds)
-    checkEvaluation(Substring(s, Literal.create(100, IntegerType), Literal.create(4, IntegerType)),
-      "", row)
-
-    // substring(null, _, _) -> null
-    checkEvaluation(Substring(s, Literal.create(100, IntegerType), Literal.create(4, IntegerType)),
-      null, create_row(null))
-
-    // substring(_, null, _) -> null
-    checkEvaluation(Substring(s, Literal.create(null, IntegerType), Literal.create(4, IntegerType)),
-      null, row)
-
-    // substring(_, _, null) -> null
-    checkEvaluation(
-      Substring(s, Literal.create(100, IntegerType), Literal.create(null, IntegerType)),
-      null,
-      row)
-
-    // 2-arg substring from zero position
-    checkEvaluation(
-      Substring(s, Literal.create(0, IntegerType), Literal.create(Integer.MAX_VALUE, IntegerType)),
-      "example",
-      row)
-    checkEvaluation(
-      Substring(s, Literal.create(1, IntegerType), Literal.create(Integer.MAX_VALUE, IntegerType)),
-      "example",
-      row)
-
-    // 2-arg substring from nonzero position
-    checkEvaluation(
-      Substring(s, Literal.create(2, IntegerType), Literal.create(Integer.MAX_VALUE, IntegerType)),
-      "xample",
-      row)
-
-    val s_notNull = 'a.string.notNull.at(0)
-
-    assert(Substring(s, Literal.create(0, IntegerType), Literal.create(2, IntegerType)).nullable
-      === true)
-    assert(
-      Substring(s_notNull, Literal.create(0, IntegerType), Literal.create(2, IntegerType)).nullable
-        === false)
-    assert(Substring(s_notNull,
-      Literal.create(null, IntegerType), Literal.create(2, IntegerType)).nullable === true)
-    assert(Substring(s_notNull,
-      Literal.create(0, IntegerType), Literal.create(null, IntegerType)).nullable === true)
-
-    checkEvaluation(s.substr(0, 2), "ex", row)
-    checkEvaluation(s.substr(0), "example", row)
-    checkEvaluation(s.substring(0, 2), "ex", row)
-    checkEvaluation(s.substring(0), "example", row)
-  }
-
-  test("SQRT") {
-    val inputSequence = (1 to (1<<24) by 511).map(_ * (1L<<24))
-    val expectedResults = inputSequence.map(l => math.sqrt(l.toDouble))
-    val rowSequence = inputSequence.map(l => create_row(l.toDouble))
-    val d = 'a.double.at(0)
-
-    for ((row, expected) <- rowSequence zip expectedResults) {
-      checkEvaluation(Sqrt(d), expected, row)
-    }
-
-    checkEvaluation(Sqrt(Literal.create(null, DoubleType)), null, create_row(null))
-    checkEvaluation(Sqrt(-1), null, EmptyRow)
-    checkEvaluation(Sqrt(-1.5), null, EmptyRow)
-  }
-
-  test("Bitwise operations") {
-    val row = create_row(1, 2, 3, null)
-    val c1 = 'a.int.at(0)
-    val c2 = 'a.int.at(1)
-    val c3 = 'a.int.at(2)
-    val c4 = 'a.int.at(3)
-
-    checkEvaluation(BitwiseAnd(c1, c4), null, row)
-    checkEvaluation(BitwiseAnd(c1, c2), 0, row)
-    checkEvaluation(BitwiseAnd(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(
-      BitwiseAnd(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(BitwiseOr(c1, c4), null, row)
-    checkEvaluation(BitwiseOr(c1, c2), 3, row)
-    checkEvaluation(BitwiseOr(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(
-      BitwiseOr(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(BitwiseXor(c1, c4), null, row)
-    checkEvaluation(BitwiseXor(c1, c2), 3, row)
-    checkEvaluation(BitwiseXor(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(
-      BitwiseXor(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(BitwiseNot(c4), null, row)
-    checkEvaluation(BitwiseNot(c1), -2, row)
-    checkEvaluation(BitwiseNot(Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(c1 & c2, 0, row)
-    checkEvaluation(c1 | c2, 3, row)
-    checkEvaluation(c1 ^ c2, 3, row)
-    checkEvaluation(~c1, -2, row)
-  }
-
-  /**
-   * Used for testing math functions for DataFrames.
-   * @param c The DataFrame function
-   * @param f The functions in scala.math
-   * @param domain The set of values to run the function with
-   * @param expectNull Whether the given values should return null or not
-   * @tparam T Generic type for primitives
-   */
-  def unaryMathFunctionEvaluation[@specialized(Int, Double, Float, Long) T](
-      c: Expression => Expression,
-      f: T => T,
-      domain: Iterable[T] = (-20 to 20).map(_ * 0.1),
-      expectNull: Boolean = false): Unit = {
-    if (expectNull) {
-      domain.foreach { value =>
-        checkEvaluation(c(Literal(value)), null, EmptyRow)
-      }
-    } else {
-      domain.foreach { value =>
-        checkEvaluation(c(Literal(value)), f(value), EmptyRow)
-      }
-    }
-    checkEvaluation(c(Literal.create(null, DoubleType)), null, create_row(null))
-  }
-
-  test("sin") {
-    unaryMathFunctionEvaluation(Sin, math.sin)
-  }
-
-  test("asin") {
-    unaryMathFunctionEvaluation(Asin, math.asin, (-10 to 10).map(_ * 0.1))
-    unaryMathFunctionEvaluation(Asin, math.asin, (11 to 20).map(_ * 0.1), true)
-  }
-
-  test("sinh") {
-    unaryMathFunctionEvaluation(Sinh, math.sinh)
-  }
-
-  test("cos") {
-    unaryMathFunctionEvaluation(Cos, math.cos)
-  }
-
-  test("acos") {
-    unaryMathFunctionEvaluation(Acos, math.acos, (-10 to 10).map(_ * 0.1))
-    unaryMathFunctionEvaluation(Acos, math.acos, (11 to 20).map(_ * 0.1), true)
-  }
-
-  test("cosh") {
-    unaryMathFunctionEvaluation(Cosh, math.cosh)
-  }
-
-  test("tan") {
-    unaryMathFunctionEvaluation(Tan, math.tan)
-  }
-
-  test("atan") {
-    unaryMathFunctionEvaluation(Atan, math.atan)
-  }
-
-  test("tanh") {
-    unaryMathFunctionEvaluation(Tanh, math.tanh)
-  }
-
-  test("toDegrees") {
-    unaryMathFunctionEvaluation(ToDegrees, math.toDegrees)
-  }
-
-  test("toRadians") {
-    unaryMathFunctionEvaluation(ToRadians, math.toRadians)
-  }
-
-  test("cbrt") {
-    unaryMathFunctionEvaluation(Cbrt, math.cbrt)
-  }
-
-  test("ceil") {
-    unaryMathFunctionEvaluation(Ceil, math.ceil)
-  }
-
-  test("floor") {
-    unaryMathFunctionEvaluation(Floor, math.floor)
-  }
-
-  test("rint") {
-    unaryMathFunctionEvaluation(Rint, math.rint)
-  }
-
-  test("exp") {
-    unaryMathFunctionEvaluation(Exp, math.exp)
-  }
-
-  test("expm1") {
-    unaryMathFunctionEvaluation(Expm1, math.expm1)
-  }
-
-  test("signum") {
-    unaryMathFunctionEvaluation[Double](Signum, math.signum)
-  }
-
-  test("log") {
-    unaryMathFunctionEvaluation(Log, math.log, (0 to 20).map(_ * 0.1))
-    unaryMathFunctionEvaluation(Log, math.log, (-5 to -1).map(_ * 0.1), true)
-  }
-
-  test("log10") {
-    unaryMathFunctionEvaluation(Log10, math.log10, (0 to 20).map(_ * 0.1))
-    unaryMathFunctionEvaluation(Log10, math.log10, (-5 to -1).map(_ * 0.1), true)
-  }
-
-  test("log1p") {
-    unaryMathFunctionEvaluation(Log1p, math.log1p, (-1 to 20).map(_ * 0.1))
-    unaryMathFunctionEvaluation(Log1p, math.log1p, (-10 to -2).map(_ * 1.0), true)
-  }
-
-  /**
-   * Used for testing math functions for DataFrames.
-   * @param c The DataFrame function
-   * @param f The functions in scala.math
-   * @param domain The set of values to run the function with
-   */
-  def binaryMathFunctionEvaluation(
-      c: (Expression, Expression) => Expression,
-      f: (Double, Double) => Double,
-      domain: Iterable[(Double, Double)] = (-20 to 20).map(v => (v * 0.1, v * -0.1)),
-      expectNull: Boolean = false): Unit = {
-    if (expectNull) {
-      domain.foreach { case (v1, v2) =>
-        checkEvaluation(c(v1, v2), null, create_row(null))
-      }
-    } else {
-      domain.foreach { case (v1, v2) =>
-        checkEvaluation(c(v1, v2), f(v1 + 0.0, v2 + 0.0), EmptyRow)
-        checkEvaluation(c(v2, v1), f(v2 + 0.0, v1 + 0.0), EmptyRow)
-      }
-    }
-    checkEvaluation(c(Literal.create(null, DoubleType), 1.0), null, create_row(null))
-    checkEvaluation(c(1.0, Literal.create(null, DoubleType)), null, create_row(null))
-  }
-
-  test("pow") {
-    binaryMathFunctionEvaluation(Pow, math.pow, (-5 to 5).map(v => (v * 1.0, v * 1.0)))
-    binaryMathFunctionEvaluation(Pow, math.pow, Seq((-1.0, 0.9), (-2.2, 1.7), (-2.2, -1.7)), true)
-  }
-
-  test("hypot") {
-    binaryMathFunctionEvaluation(Hypot, math.hypot)
-  }
-
-  test("atan2") {
-    binaryMathFunctionEvaluation(Atan2, math.atan2)
-  }
-}
-
-// TODO: Make the tests work with codegen.
-class ExpressionEvaluationWithoutCodeGenSuite extends ExpressionEvaluationBaseSuite {
-
-  override def checkEvaluation(
-      expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
-    checkEvaluationWithoutCodegen(expression, expected, inputRow)
-  }
-
-  test("CreateStruct") {
-    val row = Row(1, 2, 3)
-    val c1 = 'a.int.at(0).as("a")
-    val c3 = 'c.int.at(2).as("c")
-    checkEvaluation(CreateStruct(Seq(c1, c3)), Row(1, 3), row)
-  }
-}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
new file mode 100644
index 0000000000000..f44f55dfb92d1
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types.StringType
+
+
+class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  // TODO: Add tests for all data types.
+
+  test("boolean literals") {
+    checkEvaluation(Literal(true), true)
+    checkEvaluation(Literal(false), false)
+  }
+
+  test("int literals") {
+    checkEvaluation(Literal(1), 1)
+    checkEvaluation(Literal(0L), 0L)
+  }
+
+  test("double literals") {
+    List(0.0, -0.0, Double.NegativeInfinity, Double.PositiveInfinity).foreach {
+      d => {
+        checkEvaluation(Literal(d), d)
+        checkEvaluation(Literal(d.toFloat), d.toFloat)
+      }
+    }
+  }
+
+  test("string literals") {
+    checkEvaluation(Literal("test"), "test")
+    checkEvaluation(Literal.create(null, StringType), null)
+  }
+
+  test("sum two literals") {
+    checkEvaluation(Add(Literal(1), Literal(1)), 2)
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
new file mode 100644
index 0000000000000..25ebc70d095d8
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types.DoubleType
+
+class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  /**
+   * Used for testing unary math expressions.
+   *
+   * @param c expression
+   * @param f The functions in scala.math
+   * @param domain The set of values to run the function with
+   * @param expectNull Whether the given values should return null or not
+   * @tparam T Generic type for primitives
+   */
+  private def testUnary[T](
+      c: Expression => Expression,
+      f: T => T,
+      domain: Iterable[T] = (-20 to 20).map(_ * 0.1),
+      expectNull: Boolean = false): Unit = {
+    if (expectNull) {
+      domain.foreach { value =>
+        checkEvaluation(c(Literal(value)), null, EmptyRow)
+      }
+    } else {
+      domain.foreach { value =>
+        checkEvaluation(c(Literal(value)), f(value), EmptyRow)
+      }
+    }
+    checkEvaluation(c(Literal.create(null, DoubleType)), null, create_row(null))
+  }
+
+  /**
+   * Used for testing binary math expressions.
+   *
+   * @param c The DataFrame function
+   * @param f The functions in scala.math
+   * @param domain The set of values to run the function with
+   */
+  private def testBinary(
+      c: (Expression, Expression) => Expression,
+      f: (Double, Double) => Double,
+      domain: Iterable[(Double, Double)] = (-20 to 20).map(v => (v * 0.1, v * -0.1)),
+      expectNull: Boolean = false): Unit = {
+    if (expectNull) {
+      domain.foreach { case (v1, v2) =>
+        checkEvaluation(c(Literal(v1), Literal(v2)), null, create_row(null))
+      }
+    } else {
+      domain.foreach { case (v1, v2) =>
+        checkEvaluation(c(Literal(v1), Literal(v2)), f(v1 + 0.0, v2 + 0.0), EmptyRow)
+        checkEvaluation(c(Literal(v2), Literal(v1)), f(v2 + 0.0, v1 + 0.0), EmptyRow)
+      }
+    }
+    checkEvaluation(c(Literal.create(null, DoubleType), Literal(1.0)), null, create_row(null))
+    checkEvaluation(c(Literal(1.0), Literal.create(null, DoubleType)), null, create_row(null))
+  }
+
+  test("sin") {
+    testUnary(Sin, math.sin)
+  }
+
+  test("asin") {
+    testUnary(Asin, math.asin, (-10 to 10).map(_ * 0.1))
+    testUnary(Asin, math.asin, (11 to 20).map(_ * 0.1), expectNull = true)
+  }
+
+  test("sinh") {
+    testUnary(Sinh, math.sinh)
+  }
+
+  test("cos") {
+    testUnary(Cos, math.cos)
+  }
+
+  test("acos") {
+    testUnary(Acos, math.acos, (-10 to 10).map(_ * 0.1))
+    testUnary(Acos, math.acos, (11 to 20).map(_ * 0.1), expectNull = true)
+  }
+
+  test("cosh") {
+    testUnary(Cosh, math.cosh)
+  }
+
+  test("tan") {
+    testUnary(Tan, math.tan)
+  }
+
+  test("atan") {
+    testUnary(Atan, math.atan)
+  }
+
+  test("tanh") {
+    testUnary(Tanh, math.tanh)
+  }
+
+  test("toDegrees") {
+    testUnary(ToDegrees, math.toDegrees)
+  }
+
+  test("toRadians") {
+    testUnary(ToRadians, math.toRadians)
+  }
+
+  test("cbrt") {
+    testUnary(Cbrt, math.cbrt)
+  }
+
+  test("ceil") {
+    testUnary(Ceil, math.ceil)
+  }
+
+  test("floor") {
+    testUnary(Floor, math.floor)
+  }
+
+  test("rint") {
+    testUnary(Rint, math.rint)
+  }
+
+  test("exp") {
+    testUnary(Exp, math.exp)
+  }
+
+  test("expm1") {
+    testUnary(Expm1, math.expm1)
+  }
+
+  test("signum") {
+    testUnary[Double](Signum, math.signum)
+  }
+
+  test("log") {
+    testUnary(Log, math.log, (0 to 20).map(_ * 0.1))
+    testUnary(Log, math.log, (-5 to -1).map(_ * 0.1), expectNull = true)
+  }
+
+  test("log10") {
+    testUnary(Log10, math.log10, (0 to 20).map(_ * 0.1))
+    testUnary(Log10, math.log10, (-5 to -1).map(_ * 0.1), expectNull = true)
+  }
+
+  test("log1p") {
+    testUnary(Log1p, math.log1p, (-1 to 20).map(_ * 0.1))
+    testUnary(Log1p, math.log1p, (-10 to -2).map(_ * 1.0), expectNull = true)
+  }
+
+  test("pow") {
+    testBinary(Pow, math.pow, (-5 to 5).map(v => (v * 1.0, v * 1.0)))
+    testBinary(Pow, math.pow, Seq((-1.0, 0.9), (-2.2, 1.7), (-2.2, -1.7)), expectNull = true)
+  }
+
+  test("hypot") {
+    testBinary(Hypot, math.hypot)
+  }
+
+  test("atan2") {
+    testBinary(Atan2, math.atan2)
+  }
+
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
new file mode 100644
index 0000000000000..ccdada8b56f83
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.types.{BooleanType, StringType, ShortType}
+
+class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("null checking") {
+    val row = create_row("^Ba*n", null, true, null)
+    val c1 = 'a.string.at(0)
+    val c2 = 'a.string.at(1)
+    val c3 = 'a.boolean.at(2)
+    val c4 = 'a.boolean.at(3)
+
+    checkEvaluation(c1.isNull, false, row)
+    checkEvaluation(c1.isNotNull, true, row)
+
+    checkEvaluation(c2.isNull, true, row)
+    checkEvaluation(c2.isNotNull, false, row)
+
+    checkEvaluation(Literal.create(1, ShortType).isNull, false)
+    checkEvaluation(Literal.create(1, ShortType).isNotNull, true)
+
+    checkEvaluation(Literal.create(null, ShortType).isNull, true)
+    checkEvaluation(Literal.create(null, ShortType).isNotNull, false)
+
+    checkEvaluation(Coalesce(c1 :: c2 :: Nil), "^Ba*n", row)
+    checkEvaluation(Coalesce(Literal.create(null, StringType) :: Nil), null, row)
+    checkEvaluation(Coalesce(Literal.create(null, StringType) :: c1 :: c2 :: Nil), "^Ba*n", row)
+
+    checkEvaluation(
+      If(c3, Literal.create("a", StringType), Literal.create("b", StringType)), "a", row)
+    checkEvaluation(If(c3, c1, c2), "^Ba*n", row)
+    checkEvaluation(If(c4, c2, c1), "^Ba*n", row)
+    checkEvaluation(If(Literal.create(null, BooleanType), c2, c1), "^Ba*n", row)
+    checkEvaluation(If(Literal.create(true, BooleanType), c1, c2), "^Ba*n", row)
+    checkEvaluation(If(Literal.create(false, BooleanType), c2, c1), "^Ba*n", row)
+    checkEvaluation(If(Literal.create(false, BooleanType),
+      Literal.create("a", StringType), Literal.create("b", StringType)), "b", row)
+
+    checkEvaluation(c1 in (c1, c2), true, row)
+    checkEvaluation(
+      Literal.create("^Ba*n", StringType) in (Literal.create("^Ba*n", StringType)), true, row)
+    checkEvaluation(
+      Literal.create("^Ba*n", StringType) in (Literal.create("^Ba*n", StringType), c2), true, row)
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
new file mode 100644
index 0000000000000..b6261bfba0786
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import java.sql.{Date, Timestamp}
+
+import scala.collection.immutable.HashSet
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.util.DateUtils
+import org.apache.spark.sql.types.{IntegerType, BooleanType}
+
+
+class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  private def booleanLogicTest(
+    name: String,
+    op: (Expression, Expression) => Expression,
+    truthTable: Seq[(Any, Any, Any)]) {
+    test(s"3VL $name") {
+      truthTable.foreach {
+        case (l, r, answer) =>
+          val expr = op(Literal.create(l, BooleanType), Literal.create(r, BooleanType))
+          checkEvaluation(expr, answer)
+      }
+    }
+  }
+
+  // scalastyle:off
+  /**
+   * Checks for three-valued-logic.  Based on:
+   * http://en.wikipedia.org/wiki/Null_(SQL)#Comparisons_with_NULL_and_the_three-valued_logic_.283VL.29
+   * I.e. in flat cpo "False -> Unknown -> True",
+   *   OR is lowest upper bound,
+   *   AND is greatest lower bound.
+   * p       q       p OR q  p AND q  p = q
+   * True    True    True    True     True
+   * True    False   True    False    False
+   * True    Unknown True    Unknown  Unknown
+   * False   True    True    False    False
+   * False   False   False   False    True
+   * False   Unknown Unknown False    Unknown
+   * Unknown True    True    Unknown  Unknown
+   * Unknown False   Unknown False    Unknown
+   * Unknown Unknown Unknown Unknown  Unknown
+   *
+   * p       NOT p
+   * True    False
+   * False   True
+   * Unknown Unknown
+   */
+  // scalastyle:on
+  val notTrueTable =
+    (true, false) ::
+      (false, true) ::
+      (null, null) :: Nil
+
+  test("3VL Not") {
+    notTrueTable.foreach { case (v, answer) =>
+      checkEvaluation(Not(Literal.create(v, BooleanType)), answer)
+    }
+  }
+
+  booleanLogicTest("AND", And,
+    (true, true, true) ::
+      (true, false, false) ::
+      (true, null, null) ::
+      (false, true, false) ::
+      (false, false, false) ::
+      (false, null, false) ::
+      (null, true, null) ::
+      (null, false, false) ::
+      (null, null, null) :: Nil)
+
+  booleanLogicTest("OR", Or,
+    (true, true, true) ::
+      (true, false, true) ::
+      (true, null, true) ::
+      (false, true, true) ::
+      (false, false, false) ::
+      (false, null, null) ::
+      (null, true, true) ::
+      (null, false, null) ::
+      (null, null, null) :: Nil)
+
+  booleanLogicTest("=", EqualTo,
+    (true, true, true) ::
+      (true, false, false) ::
+      (true, null, null) ::
+      (false, true, false) ::
+      (false, false, true) ::
+      (false, null, null) ::
+      (null, true, null) ::
+      (null, false, null) ::
+      (null, null, null) :: Nil)
+
+  test("IN") {
+    checkEvaluation(In(Literal(1), Seq(Literal(1), Literal(2))), true)
+    checkEvaluation(In(Literal(2), Seq(Literal(1), Literal(2))), true)
+    checkEvaluation(In(Literal(3), Seq(Literal(1), Literal(2))), false)
+    checkEvaluation(
+      And(In(Literal(1), Seq(Literal(1), Literal(2))), In(Literal(2), Seq(Literal(1), Literal(2)))),
+      true)
+  }
+
+  test("INSET") {
+    val hS = HashSet[Any]() + 1 + 2
+    val nS = HashSet[Any]() + 1 + 2 + null
+    val one = Literal(1)
+    val two = Literal(2)
+    val three = Literal(3)
+    val nl = Literal(null)
+    val s = Seq(one, two)
+    val nullS = Seq(one, two, null)
+    checkEvaluation(InSet(one, hS), true)
+    checkEvaluation(InSet(two, hS), true)
+    checkEvaluation(InSet(two, nS), true)
+    checkEvaluation(InSet(nl, nS), true)
+    checkEvaluation(InSet(three, hS), false)
+    checkEvaluation(InSet(three, nS), false)
+    checkEvaluation(And(InSet(one, hS), InSet(two, hS)), true)
+  }
+
+
+  test("BinaryComparison") {
+    val row = create_row(1, 2, 3, null, 3, null)
+    val c1 = 'a.int.at(0)
+    val c2 = 'a.int.at(1)
+    val c3 = 'a.int.at(2)
+    val c4 = 'a.int.at(3)
+    val c5 = 'a.int.at(4)
+    val c6 = 'a.int.at(5)
+
+    checkEvaluation(LessThan(c1, c4), null, row)
+    checkEvaluation(LessThan(c1, c2), true, row)
+    checkEvaluation(LessThan(c1, Literal.create(null, IntegerType)), null, row)
+    checkEvaluation(LessThan(Literal.create(null, IntegerType), c2), null, row)
+    checkEvaluation(
+      LessThan(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
+
+    checkEvaluation(c1 < c2, true, row)
+    checkEvaluation(c1 <= c2, true, row)
+    checkEvaluation(c1 > c2, false, row)
+    checkEvaluation(c1 >= c2, false, row)
+    checkEvaluation(c1 === c2, false, row)
+    checkEvaluation(c1 !== c2, true, row)
+    checkEvaluation(c4 <=> c1, false, row)
+    checkEvaluation(c1 <=> c4, false, row)
+    checkEvaluation(c4 <=> c6, true, row)
+    checkEvaluation(c3 <=> c5, true, row)
+    checkEvaluation(Literal(true) <=> Literal.create(null, BooleanType), false, row)
+    checkEvaluation(Literal.create(null, BooleanType) <=> Literal(true), false, row)
+
+    val d1 = DateUtils.fromJavaDate(Date.valueOf("1970-01-01"))
+    val d2 = DateUtils.fromJavaDate(Date.valueOf("1970-01-02"))
+    checkEvaluation(Literal(d1) < Literal(d2), true)
+
+    val ts1 = new Timestamp(12)
+    val ts2 = new Timestamp(123)
+    checkEvaluation(Literal("ab") < Literal("abc"), true)
+    checkEvaluation(Literal(ts1) < Literal(ts2), true)
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
new file mode 100644
index 0000000000000..2e81296c4e623
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.types.{IntegerType, StringType}
+
+
+class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("StringComparison") {
+    val row = create_row("abc", null)
+    val c1 = 'a.string.at(0)
+    val c2 = 'a.string.at(1)
+
+    checkEvaluation(c1 contains "b", true, row)
+    checkEvaluation(c1 contains "x", false, row)
+    checkEvaluation(c2 contains "b", null, row)
+    checkEvaluation(c1 contains Literal.create(null, StringType), null, row)
+
+    checkEvaluation(c1 startsWith "a", true, row)
+    checkEvaluation(c1 startsWith "b", false, row)
+    checkEvaluation(c2 startsWith "a", null, row)
+    checkEvaluation(c1 startsWith Literal.create(null, StringType), null, row)
+
+    checkEvaluation(c1 endsWith "c", true, row)
+    checkEvaluation(c1 endsWith "b", false, row)
+    checkEvaluation(c2 endsWith "b", null, row)
+    checkEvaluation(c1 endsWith Literal.create(null, StringType), null, row)
+  }
+
+  test("Substring") {
+    val row = create_row("example", "example".toArray.map(_.toByte))
+
+    val s = 'a.string.at(0)
+
+    // substring from zero position with less-than-full length
+    checkEvaluation(
+      Substring(s, Literal.create(0, IntegerType), Literal.create(2, IntegerType)), "ex", row)
+    checkEvaluation(
+      Substring(s, Literal.create(1, IntegerType), Literal.create(2, IntegerType)), "ex", row)
+
+    // substring from zero position with full length
+    checkEvaluation(
+      Substring(s, Literal.create(0, IntegerType), Literal.create(7, IntegerType)), "example", row)
+    checkEvaluation(
+      Substring(s, Literal.create(1, IntegerType), Literal.create(7, IntegerType)), "example", row)
+
+    // substring from zero position with greater-than-full length
+    checkEvaluation(Substring(s, Literal.create(0, IntegerType), Literal.create(100, IntegerType)),
+      "example", row)
+    checkEvaluation(Substring(s, Literal.create(1, IntegerType), Literal.create(100, IntegerType)),
+      "example", row)
+
+    // substring from nonzero position with less-than-full length
+    checkEvaluation(Substring(s, Literal.create(2, IntegerType), Literal.create(2, IntegerType)),
+      "xa", row)
+
+    // substring from nonzero position with full length
+    checkEvaluation(Substring(s, Literal.create(2, IntegerType), Literal.create(6, IntegerType)),
+      "xample", row)
+
+    // substring from nonzero position with greater-than-full length
+    checkEvaluation(Substring(s, Literal.create(2, IntegerType), Literal.create(100, IntegerType)),
+      "xample", row)
+
+    // zero-length substring (within string bounds)
+    checkEvaluation(Substring(s, Literal.create(0, IntegerType), Literal.create(0, IntegerType)),
+      "", row)
+
+    // zero-length substring (beyond string bounds)
+    checkEvaluation(Substring(s, Literal.create(100, IntegerType), Literal.create(4, IntegerType)),
+      "", row)
+
+    // substring(null, _, _) -> null
+    checkEvaluation(Substring(s, Literal.create(100, IntegerType), Literal.create(4, IntegerType)),
+      null, create_row(null))
+
+    // substring(_, null, _) -> null
+    checkEvaluation(Substring(s, Literal.create(null, IntegerType), Literal.create(4, IntegerType)),
+      null, row)
+
+    // substring(_, _, null) -> null
+    checkEvaluation(
+      Substring(s, Literal.create(100, IntegerType), Literal.create(null, IntegerType)),
+      null,
+      row)
+
+    // 2-arg substring from zero position
+    checkEvaluation(
+      Substring(s, Literal.create(0, IntegerType), Literal.create(Integer.MAX_VALUE, IntegerType)),
+      "example",
+      row)
+    checkEvaluation(
+      Substring(s, Literal.create(1, IntegerType), Literal.create(Integer.MAX_VALUE, IntegerType)),
+      "example",
+      row)
+
+    // 2-arg substring from nonzero position
+    checkEvaluation(
+      Substring(s, Literal.create(2, IntegerType), Literal.create(Integer.MAX_VALUE, IntegerType)),
+      "xample",
+      row)
+
+    val s_notNull = 'a.string.notNull.at(0)
+
+    assert(Substring(s, Literal.create(0, IntegerType), Literal.create(2, IntegerType)).nullable
+      === true)
+    assert(
+      Substring(s_notNull, Literal.create(0, IntegerType), Literal.create(2, IntegerType)).nullable
+        === false)
+    assert(Substring(s_notNull,
+      Literal.create(null, IntegerType), Literal.create(2, IntegerType)).nullable === true)
+    assert(Substring(s_notNull,
+      Literal.create(0, IntegerType), Literal.create(null, IntegerType)).nullable === true)
+
+    checkEvaluation(s.substr(0, 2), "ex", row)
+    checkEvaluation(s.substr(0), "example", row)
+    checkEvaluation(s.substring(0, 2), "ex", row)
+    checkEvaluation(s.substring(0), "example", row)
+  }
+
+  test("LIKE literal Regular Expression") {
+    checkEvaluation(Literal.create(null, StringType).like("a"), null)
+    checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
+    checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null)
+    checkEvaluation("abdef" like "abdef", true)
+    checkEvaluation("a_%b" like "a\\__b", true)
+    checkEvaluation("addb" like "a_%b", true)
+    checkEvaluation("addb" like "a\\__b", false)
+    checkEvaluation("addb" like "a%\\%b", false)
+    checkEvaluation("a_%b" like "a%\\%b", true)
+    checkEvaluation("addb" like "a%", true)
+    checkEvaluation("addb" like "**", false)
+    checkEvaluation("abc" like "a%", true)
+    checkEvaluation("abc"  like "b%", false)
+    checkEvaluation("abc"  like "bc%", false)
+    checkEvaluation("a\nb" like "a_b", true)
+    checkEvaluation("ab" like "a%b", true)
+    checkEvaluation("a\nb" like "a%b", true)
+  }
+
+  test("LIKE Non-literal Regular Expression") {
+    val regEx = 'a.string.at(0)
+    checkEvaluation("abcd" like regEx, null, create_row(null))
+    checkEvaluation("abdef" like regEx, true, create_row("abdef"))
+    checkEvaluation("a_%b" like regEx, true, create_row("a\\__b"))
+    checkEvaluation("addb" like regEx, true, create_row("a_%b"))
+    checkEvaluation("addb" like regEx, false, create_row("a\\__b"))
+    checkEvaluation("addb" like regEx, false, create_row("a%\\%b"))
+    checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b"))
+    checkEvaluation("addb" like regEx, true, create_row("a%"))
+    checkEvaluation("addb" like regEx, false, create_row("**"))
+    checkEvaluation("abc" like regEx, true, create_row("a%"))
+    checkEvaluation("abc" like regEx, false, create_row("b%"))
+    checkEvaluation("abc" like regEx, false, create_row("bc%"))
+    checkEvaluation("a\nb" like regEx, true, create_row("a_b"))
+    checkEvaluation("ab" like regEx, true, create_row("a%b"))
+    checkEvaluation("a\nb" like regEx, true, create_row("a%b"))
+
+    checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%"))
+  }
+
+  test("RLIKE literal Regular Expression") {
+    checkEvaluation(Literal.create(null, StringType) rlike "abdef", null)
+    checkEvaluation("abdef" rlike Literal.create(null, StringType), null)
+    checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null)
+    checkEvaluation("abdef" rlike "abdef", true)
+    checkEvaluation("abbbbc" rlike "a.*c", true)
+
+    checkEvaluation("fofo" rlike "^fo", true)
+    checkEvaluation("fo\no" rlike "^fo\no$", true)
+    checkEvaluation("Bn" rlike "^Ba*n", true)
+    checkEvaluation("afofo" rlike "fo", true)
+    checkEvaluation("afofo" rlike "^fo", false)
+    checkEvaluation("Baan" rlike "^Ba?n", false)
+    checkEvaluation("axe" rlike "pi|apa", false)
+    checkEvaluation("pip" rlike "^(pi)*$", false)
+
+    checkEvaluation("abc"  rlike "^ab", true)
+    checkEvaluation("abc"  rlike "^bc", false)
+    checkEvaluation("abc"  rlike "^ab", true)
+    checkEvaluation("abc"  rlike "^bc", false)
+
+    intercept[java.util.regex.PatternSyntaxException] {
+      evaluate("abbbbc" rlike "**")
+    }
+  }
+
+  test("RLIKE Non-literal Regular Expression") {
+    val regEx = 'a.string.at(0)
+    checkEvaluation("abdef" rlike regEx, true, create_row("abdef"))
+    checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c"))
+    checkEvaluation("fofo" rlike regEx, true, create_row("^fo"))
+    checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$"))
+    checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n"))
+
+    intercept[java.util.regex.PatternSyntaxException] {
+      evaluate("abbbbc" rlike regEx, create_row("**"))
+    }
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ExpressionOptimizationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ExpressionOptimizationSuite.scala
index a4a3a66b8b229..f33a18d53b1a9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ExpressionOptimizationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ExpressionOptimizationSuite.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 
@@ -24,7 +25,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
  * Overrides our expression evaluation tests and reruns them after optimization has occured.  This
  * is to ensure that constant folding and other optimizations do not break anything.
  */
-class ExpressionOptimizationSuite extends ExpressionEvaluationSuite {
+class ExpressionOptimizationSuite extends SparkFunSuite with ExpressionEvalHelper {
   override def checkEvaluation(
       expression: Expression,
       expected: Any,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 77327f2b84eaa..454af47913bf1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -24,7 +24,6 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, Star}
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.expressions.mathfuncs._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 

From 72ba0fc4fd441f4bf25f19bed59ba0a39dd04b65 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 7 Jun 2015 23:16:19 -0700
Subject: [PATCH 406/525] [SPARK-8154][SQL] Remove Term/Code type aliases in
 code generation.

From my perspective as a code reviewer, I find them more confusing than using String directly.

Author: Reynold Xin <rxin@databricks.com>

Closes #6694 from rxin/SPARK-8154 and squashes the following commits:

4e5056c [Reynold Xin] [SPARK-8154][SQL] Remove Term/Code type aliases in code generation.
---
 .../catalyst/expressions/BoundAttribute.scala |  4 ++--
 .../spark/sql/catalyst/expressions/Cast.scala |  4 ++--
 .../sql/catalyst/expressions/Expression.scala | 10 +++++-----
 .../sql/catalyst/expressions/arithmetic.scala | 16 +++++++--------
 .../sql/catalyst/expressions/bitwise.scala    | 10 ++++++++--
 .../expressions/codegen/CodeGenerator.scala   | 20 +++++++++----------
 .../expressions/codegen/package.scala         |  3 ---
 .../catalyst/expressions/conditionals.scala   |  6 +++---
 .../expressions/decimalFunctions.scala        |  6 +++---
 .../sql/catalyst/expressions/literals.scala   |  4 ++--
 .../spark/sql/catalyst/expressions/math.scala |  8 ++++----
 .../catalyst/expressions/nullFunctions.scala  | 10 +++++-----
 .../sql/catalyst/expressions/predicates.scala | 16 +++++++--------
 .../spark/sql/catalyst/expressions/sets.scala |  8 ++++----
 .../expressions/stringOperations.scala        | 10 +++++-----
 15 files changed, 69 insertions(+), 66 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 005de3166095f..fcadf9595e768 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.errors.attachTree
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.catalyst.trees
 
@@ -43,7 +43,7 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
 
   override def exprId: ExprId = throw new UnsupportedOperationException
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     s"""
         boolean ${ev.isNull} = i.isNullAt($ordinal);
         ${ctx.javaType(dataType)} ${ev.primitive} = ${ev.isNull} ?
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 2a1f96409daf4..18102d1acb5b3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -21,7 +21,7 @@ import java.sql.{Date, Timestamp}
 import java.text.{DateFormat, SimpleDateFormat}
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 
@@ -435,7 +435,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     if (evaluated == null) null else cast(evaluated)
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     // TODO(cg): Add support for more data types.
     (child.dataType, dataType) match {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 432d65eee54fb..a9a9c0cfb7027 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedAttribute}
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext, Term}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types._
@@ -76,7 +76,7 @@ abstract class Expression extends TreeNode[Expression] {
    * @param ev an [[GeneratedExpressionCode]] with unique terms.
    * @return Java source code
    */
-  protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     ctx.references += this
     val objectTerm = ctx.freshName("obj")
     s"""
@@ -166,7 +166,7 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
   protected def defineCodeGen(
       ctx: CodeGenContext,
       ev: GeneratedExpressionCode,
-      f: (Term, Term) => Code): String = {
+      f: (String, String) => String): String = {
     // TODO: Right now some timestamp tests fail if we enforce this...
     if (left.dataType != right.dataType) {
       // log.warn(s"${left.dataType} != ${right.dataType}")
@@ -182,7 +182,7 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
       if (!${ev.isNull}) {
         ${eval2.code}
-        if(!${eval2.isNull}) {
+        if (!${eval2.isNull}) {
           ${ev.primitive} = $resultCode;
         } else {
           ${ev.isNull} = true;
@@ -217,7 +217,7 @@ abstract class UnaryExpression extends Expression with trees.UnaryNode[Expressio
   protected def defineCodeGen(
       ctx: CodeGenContext,
       ev: GeneratedExpressionCode,
-      f: Term => Code): Code = {
+      f: String => String): String = {
     val eval = child.gen(ctx)
     // reuse the previous isNull
     ev.isNull = eval.isNull
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index d4efda2e04c29..124274c94203c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.expressions.codegen.{Code, GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
@@ -50,7 +50,7 @@ case class UnaryMinus(child: Expression) extends UnaryArithmetic {
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = dataType match {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
     case dt: DecimalType => defineCodeGen(ctx, ev, c => s"c.unary_$$minus()")
     case dt: NumericType => defineCodeGen(ctx, ev, c => s"-($c)")
   }
@@ -74,7 +74,7 @@ case class Sqrt(child: Expression) extends UnaryArithmetic {
     else math.sqrt(value)
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval = child.gen(ctx)
     eval.code + s"""
       boolean ${ev.isNull} = ${eval.isNull};
@@ -138,7 +138,7 @@ abstract class BinaryArithmetic extends BinaryExpression {
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = dataType match {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
     case dt: DecimalType =>
       defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.$decimalMethod($eval2)")
     // byte and short are casted into int when add, minus, times or divide
@@ -236,7 +236,7 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
   /**
    * Special case handling due to division by 0 => null.
    */
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval1 = left.gen(ctx)
     val eval2 = right.gen(ctx)
     val test = if (left.dataType.isInstanceOf[DecimalType]) {
@@ -296,7 +296,7 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
   /**
    * Special case handling for x % 0 ==> null.
    */
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval1 = left.gen(ctx)
     val eval2 = right.gen(ctx)
     val test = if (left.dataType.isInstanceOf[DecimalType]) {
@@ -346,7 +346,7 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     if (ctx.isNativeType(left.dataType)) {
       val eval1 = left.gen(ctx)
       val eval2 = right.gen(ctx)
@@ -400,7 +400,7 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     if (ctx.isNativeType(left.dataType)) {
 
       val eval1 = left.gen(ctx)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
index ef34586261e70..9002dda7bf4d0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
@@ -25,6 +25,8 @@ import org.apache.spark.sql.types._
 
 /**
  * A function that calculates bitwise and(&) of two numbers.
+ *
+ * Code generation inherited from BinaryArithmetic.
  */
 case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "&"
@@ -48,6 +50,8 @@ case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithme
 
 /**
  * A function that calculates bitwise or(|) of two numbers.
+ *
+ * Code generation inherited from BinaryArithmetic.
  */
 case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "|"
@@ -71,6 +75,8 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
 
 /**
  * A function that calculates bitwise xor of two numbers.
+ *
+ * Code generation inherited from BinaryArithmetic.
  */
 case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "^"
@@ -112,8 +118,8 @@ case class BitwiseNot(child: Expression) extends UnaryArithmetic {
       ((evalE: Long) => ~evalE).asInstanceOf[(Any) => Any]
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
-    defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dataType)})~($c)")
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dataType)}) ~($c)")
   }
 
   protected override def evalInternal(evalE: Any) = not(evalE)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index c8d0aaf79f5f2..e95682f952a7b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -40,7 +40,7 @@ class LongHashSet extends org.apache.spark.util.collection.OpenHashSet[Long]
  * @param primitive A term for a possible primitive value of the result of the evaluation. Not
  *                      valid if `isNull` is set to `true`.
  */
-case class GeneratedExpressionCode(var code: Code, var isNull: Term, var primitive: Term)
+case class GeneratedExpressionCode(var code: String, var isNull: String, var primitive: String)
 
 /**
  * A context for codegen, which is used to bookkeeping the expressions those are not supported
@@ -65,14 +65,14 @@ class CodeGenContext {
    * (Since we aren't in a macro context we do not seem to have access to the built in `freshName`
    * function.)
    */
-  def freshName(prefix: String): Term = {
+  def freshName(prefix: String): String = {
     s"$prefix${curId.getAndIncrement}"
   }
 
   /**
    * Return the code to access a column for given DataType
    */
-  def getColumn(dataType: DataType, ordinal: Int): Code = {
+  def getColumn(dataType: DataType, ordinal: Int): String = {
     if (isNativeType(dataType)) {
       s"i.${accessorForType(dataType)}($ordinal)"
     } else {
@@ -83,7 +83,7 @@ class CodeGenContext {
   /**
    * Return the code to update a column in Row for given DataType
    */
-  def setColumn(dataType: DataType, ordinal: Int, value: Term): Code = {
+  def setColumn(dataType: DataType, ordinal: Int, value: String): String = {
     if (isNativeType(dataType)) {
       s"${mutatorForType(dataType)}($ordinal, $value)"
     } else {
@@ -94,7 +94,7 @@ class CodeGenContext {
   /**
    * Return the name of accessor in Row for a DataType
    */
-  def accessorForType(dt: DataType): Term = dt match {
+  def accessorForType(dt: DataType): String = dt match {
     case IntegerType => "getInt"
     case other => s"get${boxedType(dt)}"
   }
@@ -102,7 +102,7 @@ class CodeGenContext {
   /**
    * Return the name of mutator in Row for a DataType
    */
-  def mutatorForType(dt: DataType): Term = dt match {
+  def mutatorForType(dt: DataType): String = dt match {
     case IntegerType => "setInt"
     case other => s"set${boxedType(dt)}"
   }
@@ -110,7 +110,7 @@ class CodeGenContext {
   /**
    * Return the Java type for a DataType
    */
-  def javaType(dt: DataType): Term = dt match {
+  def javaType(dt: DataType): String = dt match {
     case IntegerType => "int"
     case LongType => "long"
     case ShortType => "short"
@@ -131,7 +131,7 @@ class CodeGenContext {
   /**
    * Return the boxed type in Java
    */
-  def boxedType(dt: DataType): Term = dt match {
+  def boxedType(dt: DataType): String = dt match {
     case IntegerType => "Integer"
     case LongType => "Long"
     case ShortType => "Short"
@@ -146,7 +146,7 @@ class CodeGenContext {
   /**
    * Return the representation of default value for given DataType
    */
-  def defaultValue(dt: DataType): Term = dt match {
+  def defaultValue(dt: DataType): String = dt match {
     case BooleanType => "false"
     case FloatType => "-1.0f"
     case ShortType => "(short)-1"
@@ -161,7 +161,7 @@ class CodeGenContext {
   /**
    * Returns a function to generate equal expression in Java
    */
-  def equalFunc(dataType: DataType): ((Term, Term) => Code) = dataType match {
+  def equalFunc(dataType: DataType): ((String, String) => String) = dataType match {
     case BinaryType => { case (eval1, eval2) =>
       s"java.util.Arrays.equals($eval1, $eval2)" }
     case IntegerType | BooleanType | LongType | DoubleType | FloatType | ShortType | ByteType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
index 6f9589d20445e..7f1b12cdd5800 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
@@ -27,9 +27,6 @@ import org.apache.spark.util.Utils
  */
 package object codegen {
 
-  type Term = String
-  type Code = String
-
   /** Canonicalizes an expression so those that differ only by names can reuse the same code. */
   object ExpressionCanonicalizer extends rules.RuleExecutor[Expression] {
     val batches =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 3aa86edd7ab20..1a5cde26c9b13 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -50,7 +50,7 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val condEval = predicate.gen(ctx)
     val trueEval = trueValue.gen(ctx)
     val falseEval = falseValue.gen(ctx)
@@ -155,7 +155,7 @@ case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike {
     return res
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val len = branchesArr.length
     val got = ctx.freshName("got")
 
@@ -248,7 +248,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
     return res
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val keyEval = key.gen(ctx)
     val len = branchesArr.length
     val got = ctx.freshName("got")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
index ddfadf314f838..8ab6d977dd3a6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types._
 
 /** Return the unscaled Long value of a Decimal, assuming it fits in a Long */
@@ -37,7 +37,7 @@ case class UnscaledValue(child: Expression) extends UnaryExpression {
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c => s"$c.toUnscaledLong()")
   }
 }
@@ -59,7 +59,7 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval = child.gen(ctx)
     eval.code + s"""
       boolean ${ev.isNull} = ${eval.isNull};
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 3a9271678bc9c..297b35b4da94c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import java.sql.{Date, Timestamp}
 
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
-import org.apache.spark.sql.catalyst.expressions.codegen.{Code, CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 
@@ -88,7 +88,7 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
 
   override def eval(input: Row): Any = value
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     // change the isNull and primitive to consts, to inline them
     if (value == null) {
       ev.isNull = "true"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index a18067e4a58f1..7dacb6a9b47b6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -48,7 +48,7 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
   // name of function in java.lang.Math
   def funcName: String = name.toLowerCase
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval = child.gen(ctx)
     eval.code + s"""
       boolean ${ev.isNull} = ${eval.isNull};
@@ -93,7 +93,7 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.${name.toLowerCase}($c1, $c2)")
   }
 }
@@ -180,7 +180,7 @@ case class Atan2(left: Expression, right: Expression)
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.atan2($c1 + 0.0, $c2 + 0.0)") + s"""
       if (Double.valueOf(${ev.primitive}).isNaN()) {
         ${ev.isNull} = true;
@@ -194,7 +194,7 @@ case class Hypot(left: Expression, right: Expression)
 
 case class Pow(left: Expression, right: Expression)
   extends BinaryMathExpression(math.pow, "POWER") {
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.pow($c1, $c2)") + s"""
       if (Double.valueOf(${ev.primitive}).isNaN()) {
         ${ev.isNull} = true;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index 9ecfb3ccc262f..c2d1a4eadae29 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.types.DataType
@@ -53,7 +53,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
     result
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     s"""
       boolean ${ev.isNull} = true;
       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
@@ -81,7 +81,7 @@ case class IsNull(child: Expression) extends Predicate with trees.UnaryNode[Expr
     child.eval(input) == null
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval = child.gen(ctx)
     ev.isNull = "false"
     ev.primitive = eval.isNull
@@ -100,7 +100,7 @@ case class IsNotNull(child: Expression) extends Predicate with trees.UnaryNode[E
     child.eval(input) != null
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval = child.gen(ctx)
     ev.isNull = "false"
     ev.primitive = s"(!(${eval.isNull}))"
@@ -130,7 +130,7 @@ case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate
     numNonNulls >= n
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val nonnull = ctx.freshName("nonnull")
     val code = children.map { e =>
       val eval = e.gen(ctx)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 5edcf3bd77d20..3cbdfdfb13847 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.util.TypeUtils
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.types._
 
@@ -84,7 +84,7 @@ case class Not(child: Expression) extends UnaryExpression with Predicate with Ex
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c => s"!($c)")
   }
 }
@@ -147,7 +147,7 @@ case class And(left: Expression, right: Expression)
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval1 = left.gen(ctx)
     val eval2 = right.gen(ctx)
 
@@ -155,7 +155,7 @@ case class And(left: Expression, right: Expression)
     s"""
       ${eval1.code}
       boolean ${ev.isNull} = false;
-      boolean ${ev.primitive}  = false;
+      boolean ${ev.primitive} = false;
 
       if (!${eval1.isNull} && !${eval1.primitive}) {
       } else {
@@ -196,7 +196,7 @@ case class Or(left: Expression, right: Expression)
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval1 = left.gen(ctx)
     val eval2 = right.gen(ctx)
 
@@ -249,7 +249,7 @@ abstract class BinaryComparison extends BinaryExpression with Predicate {
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     left.dataType match {
       case dt: NumericType if ctx.isNativeType(dt) => defineCodeGen (ctx, ev, {
         (c1, c3) => s"$c1 $symbol $c3"
@@ -280,7 +280,7 @@ case class EqualTo(left: Expression, right: Expression) extends BinaryComparison
     if (left.dataType != BinaryType) l == r
     else java.util.Arrays.equals(l.asInstanceOf[Array[Byte]], r.asInstanceOf[Array[Byte]])
   }
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, ctx.equalFunc(left.dataType))
   }
 }
@@ -304,7 +304,7 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval1 = left.gen(ctx)
     val eval2 = right.gen(ctx)
     val equalCode = ctx.equalFunc(left.dataType)(eval1.primitive, eval2.primitive)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
index b39349b988389..2bcb960e9177e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, Code, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
@@ -61,7 +61,7 @@ case class NewSet(elementType: DataType) extends LeafExpression {
     new OpenHashSet[Any]()
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     elementType match {
       case IntegerType | LongType =>
         ev.isNull = "false"
@@ -103,7 +103,7 @@ case class AddItemToSet(item: Expression, set: Expression) extends Expression {
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val elementType = set.dataType.asInstanceOf[OpenHashSetUDT].elementType
     elementType match {
       case IntegerType | LongType =>
@@ -154,7 +154,7 @@ case class CombineSets(left: Expression, right: Expression) extends BinaryExpres
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val elementType = left.dataType.asInstanceOf[OpenHashSetUDT].elementType
     elementType match {
       case IntegerType | LongType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 78adb509b470b..aae122a981e47 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -139,7 +139,7 @@ case class Upper(child: Expression) extends UnaryExpression with CaseConversionE
 
   override def toString: String = s"Upper($child)"
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c => s"($c).toUpperCase()")
   }
 }
@@ -153,7 +153,7 @@ case class Lower(child: Expression) extends UnaryExpression with CaseConversionE
 
   override def toString: String = s"Lower($child)"
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c => s"($c).toLowerCase()")
   }
 }
@@ -190,7 +190,7 @@ trait StringComparison extends ExpectsInputTypes {
 case class Contains(left: Expression, right: Expression)
     extends BinaryExpression with Predicate with StringComparison {
   override def compare(l: UTF8String, r: UTF8String): Boolean = l.contains(r)
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, (c1, c2) => s"($c1).contains($c2)")
   }
 }
@@ -201,7 +201,7 @@ case class Contains(left: Expression, right: Expression)
 case class StartsWith(left: Expression, right: Expression)
     extends BinaryExpression with Predicate with StringComparison {
   override def compare(l: UTF8String, r: UTF8String): Boolean = l.startsWith(r)
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, (c1, c2) => s"($c1).startsWith($c2)")
   }
 }
@@ -212,7 +212,7 @@ case class StartsWith(left: Expression, right: Expression)
 case class EndsWith(left: Expression, right: Expression)
     extends BinaryExpression with Predicate with StringComparison {
   override def compare(l: UTF8String, r: UTF8String): Boolean = l.endsWith(r)
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): Code = {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, (c1, c2) => s"($c1).endsWith($c2)")
   }
 }

From 10fc2f6f51819f263eec941bdc1db22c554f9118 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Mon, 8 Jun 2015 01:07:50 -0700
Subject: [PATCH 407/525] [SPARK-4761] [DOC] [SQL] kryo default setting in SQL
 Thrift server

this is a follow up of #3621

/cc liancheng pwendell

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #6639 from adrian-wang/kryodoc and squashes the following commits:

3c4b1cf [Daoyuan Wang] [DOC] kryo default setting in SQL Thrift server
---
 docs/configuration.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 9667cebe0b87c..3960e7e78bde1 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -618,7 +618,7 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 <tr>
   <td><code>spark.kryo.referenceTracking</code></td>
-  <td>true</td>
+  <td>true (false when using Spark SQL Thrift Server)</td>
   <td>
     Whether to track references to the same object when serializing data with Kryo, which is
     necessary if your object graphs have loops and useful for efficiency if they contain multiple
@@ -679,7 +679,10 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 <tr>
   <td><code>spark.serializer</code></td>
-  <td>org.apache.spark.serializer.<br />JavaSerializer</td>
+  <td>
+    org.apache.spark.serializer.<br />JavaSerializer (org.apache.spark.serializer.<br />
+    KryoSerializer when using Spark SQL Thrift Server)
+  </td>
   <td>
     Class to use for serializing objects that will be sent over the network or need to be cached
     in serialized form. The default of Java serialization works with any Serializable Java object

From eacd4a929bf5d697c33b1b705dcf958651cd20f4 Mon Sep 17 00:00:00 2001
From: linweizhong <linweizhong@huawei.com>
Date: Mon, 8 Jun 2015 09:34:16 +0100
Subject: [PATCH 408/525] [SPARK-7705] [YARN] Cleanup of .sparkStaging
 directory fails if application is killed

As I have tested, if we cancel or kill the app then the final status may be undefined, killed or succeeded, so clean up staging directory when appMaster exit at any final application status.

Author: linweizhong <linweizhong@huawei.com>

Closes #6409 from Sephiroth-Lin/SPARK-7705 and squashes the following commits:

3a5a0a5 [linweizhong] Update
83dc274 [linweizhong] Update
923d44d [linweizhong] Update
0dd7c2d [linweizhong] Update
b76a102 [linweizhong] Update code style
7846b69 [linweizhong] Update
bd6cf0d [linweizhong] Refactor
aed9f18 [linweizhong] Clean up stagingDir when launch app on yarn
95595c3 [linweizhong] Cleanup of .sparkStaging directory when AppMaster exit at any final application status
---
 .../org/apache/spark/deploy/yarn/Client.scala | 34 ++++++++++++-------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 234051eb7d3bb..f4d43214b08ca 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -121,24 +121,31 @@ private[spark] class Client(
     } catch {
       case e: Throwable =>
         if (appId != null) {
-          val appStagingDir = getAppStagingDir(appId)
-          try {
-            val preserveFiles = sparkConf.getBoolean("spark.yarn.preserve.staging.files", false)
-            val stagingDirPath = new Path(appStagingDir)
-            val fs = FileSystem.get(hadoopConf)
-            if (!preserveFiles && fs.exists(stagingDirPath)) {
-              logInfo("Deleting staging directory " + stagingDirPath)
-              fs.delete(stagingDirPath, true)
-            }
-          } catch {
-            case ioe: IOException =>
-              logWarning("Failed to cleanup staging dir " + appStagingDir, ioe)
-          }
+          cleanupStagingDir(appId)
         }
         throw e
     }
   }
 
+  /**
+   * Cleanup application staging directory.
+   */
+  private def cleanupStagingDir(appId: ApplicationId): Unit = {
+    val appStagingDir = getAppStagingDir(appId)
+    try {
+      val preserveFiles = sparkConf.getBoolean("spark.yarn.preserve.staging.files", false)
+      val stagingDirPath = new Path(appStagingDir)
+      val fs = FileSystem.get(hadoopConf)
+      if (!preserveFiles && fs.exists(stagingDirPath)) {
+        logInfo("Deleting staging directory " + stagingDirPath)
+        fs.delete(stagingDirPath, true)
+      }
+    } catch {
+      case ioe: IOException =>
+        logWarning("Failed to cleanup staging dir " + appStagingDir, ioe)
+    }
+  }
+
   /**
    * Set up the context for submitting our ApplicationMaster.
    * This uses the YarnClientApplication not available in the Yarn alpha API.
@@ -782,6 +789,7 @@ private[spark] class Client(
       if (state == YarnApplicationState.FINISHED ||
         state == YarnApplicationState.FAILED ||
         state == YarnApplicationState.KILLED) {
+        cleanupStagingDir(appId)
         return (state, report.getFinalApplicationStatus)
       }
 

From 03ef6be9ce61a13dcd9d8c71298fb4be39119411 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Mon, 8 Jun 2015 17:50:38 +0800
Subject: [PATCH 409/525] [SPARK-7939] [SQL] Add conf to enable/disable
 partition column type inference

JIRA: https://issues.apache.org/jira/browse/SPARK-7939

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6503 from viirya/disable_partition_type_inference and squashes the following commits:

3e90470 [Liang-Chi Hsieh] Default to enable type inference and update docs.
455edb1 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into disable_partition_type_inference
9a57933 [Liang-Chi Hsieh] Add conf to enable/disable partition column type inference.
---
 docs/sql-programming-guide.md                 |  6 +-
 .../scala/org/apache/spark/sql/SQLConf.scala  |  6 ++
 .../spark/sql/sources/PartitioningUtils.scala | 48 ++++++-----
 .../apache/spark/sql/sources/interfaces.scala |  4 +-
 .../ParquetPartitionDiscoverySuite.scala      | 79 ++++++++++++++++++-
 5 files changed, 119 insertions(+), 24 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index cde5830c733e0..40e33f757d693 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1102,7 +1102,11 @@ root
 {% endhighlight %}
 
 Notice that the data types of the partitioning columns are automatically inferred.  Currently,
-numeric data types and string type are supported.
+numeric data types and string type are supported. Sometimes users may not want to automatically
+infer the data types of the partitioning columns. For these use cases, the automatic type inference
+can be configured by `spark.sql.sources.partitionColumnTypeInference.enabled`, which is default to
+`true`. When type inference is disabled, string type will be used for the partitioning columns.
+
 
 ### Schema merging
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 77c6af27d1007..c778889045d02 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -71,6 +71,9 @@ private[spark] object SQLConf {
   // Whether to perform partition discovery when loading external data sources.  Default to true.
   val PARTITION_DISCOVERY_ENABLED = "spark.sql.sources.partitionDiscovery.enabled"
 
+  // Whether to perform partition column type inference. Default to true.
+  val PARTITION_COLUMN_TYPE_INFERENCE = "spark.sql.sources.partitionColumnTypeInference.enabled"
+
   // The output committer class used by FSBasedRelation. The specified class needs to be a
   // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
   val OUTPUT_COMMITTER_CLASS = "spark.sql.sources.outputCommitterClass"
@@ -250,6 +253,9 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
   private[spark] def partitionDiscoveryEnabled() =
     getConf(SQLConf.PARTITION_DISCOVERY_ENABLED, "true").toBoolean
 
+  private[spark] def partitionColumnTypeInferenceEnabled() =
+    getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE, "true").toBoolean
+
   // Do not use a value larger than 4000 as the default value of this property.
   // See the comments of SCHEMA_STRING_LENGTH_THRESHOLD above for more information.
   private[spark] def schemaStringLengthThreshold: Int =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
index c4c99de5a38dc..9f6ec2ed8fc8d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
@@ -72,10 +72,11 @@ private[sql] object PartitioningUtils {
    */
   private[sql] def parsePartitions(
       paths: Seq[Path],
-      defaultPartitionName: String): PartitionSpec = {
+      defaultPartitionName: String,
+      typeInference: Boolean): PartitionSpec = {
     // First, we need to parse every partition's path and see if we can find partition values.
     val pathsWithPartitionValues = paths.flatMap { path =>
-      parsePartition(path, defaultPartitionName).map(path -> _)
+      parsePartition(path, defaultPartitionName, typeInference).map(path -> _)
     }
 
     if (pathsWithPartitionValues.isEmpty) {
@@ -124,7 +125,8 @@ private[sql] object PartitioningUtils {
    */
   private[sql] def parsePartition(
       path: Path,
-      defaultPartitionName: String): Option[PartitionValues] = {
+      defaultPartitionName: String,
+      typeInference: Boolean): Option[PartitionValues] = {
     val columns = ArrayBuffer.empty[(String, Literal)]
     // Old Hadoop versions don't have `Path.isRoot`
     var finished = path.getParent == null
@@ -137,7 +139,7 @@ private[sql] object PartitioningUtils {
         return None
       }
 
-      val maybeColumn = parsePartitionColumn(chopped.getName, defaultPartitionName)
+      val maybeColumn = parsePartitionColumn(chopped.getName, defaultPartitionName, typeInference)
       maybeColumn.foreach(columns += _)
       chopped = chopped.getParent
       finished = maybeColumn.isEmpty || chopped.getParent == null
@@ -153,7 +155,8 @@ private[sql] object PartitioningUtils {
 
   private def parsePartitionColumn(
       columnSpec: String,
-      defaultPartitionName: String): Option[(String, Literal)] = {
+      defaultPartitionName: String,
+      typeInference: Boolean): Option[(String, Literal)] = {
     val equalSignIndex = columnSpec.indexOf('=')
     if (equalSignIndex == -1) {
       None
@@ -164,7 +167,7 @@ private[sql] object PartitioningUtils {
       val rawColumnValue = columnSpec.drop(equalSignIndex + 1)
       assert(rawColumnValue.nonEmpty, s"Empty partition column value in '$columnSpec'")
 
-      val literal = inferPartitionColumnValue(rawColumnValue, defaultPartitionName)
+      val literal = inferPartitionColumnValue(rawColumnValue, defaultPartitionName, typeInference)
       Some(columnName -> literal)
     }
   }
@@ -211,19 +214,28 @@ private[sql] object PartitioningUtils {
    */
   private[sql] def inferPartitionColumnValue(
       raw: String,
-      defaultPartitionName: String): Literal = {
-    // First tries integral types
-    Try(Literal.create(Integer.parseInt(raw), IntegerType))
-      .orElse(Try(Literal.create(JLong.parseLong(raw), LongType)))
-      // Then falls back to fractional types
-      .orElse(Try(Literal.create(JFloat.parseFloat(raw), FloatType)))
-      .orElse(Try(Literal.create(JDouble.parseDouble(raw), DoubleType)))
-      .orElse(Try(Literal.create(new JBigDecimal(raw), DecimalType.Unlimited)))
-      // Then falls back to string
-      .getOrElse {
-        if (raw == defaultPartitionName) Literal.create(null, NullType)
-        else Literal.create(unescapePathName(raw), StringType)
+      defaultPartitionName: String,
+      typeInference: Boolean): Literal = {
+    if (typeInference) {
+      // First tries integral types
+      Try(Literal.create(Integer.parseInt(raw), IntegerType))
+        .orElse(Try(Literal.create(JLong.parseLong(raw), LongType)))
+        // Then falls back to fractional types
+        .orElse(Try(Literal.create(JFloat.parseFloat(raw), FloatType)))
+        .orElse(Try(Literal.create(JDouble.parseDouble(raw), DoubleType)))
+        .orElse(Try(Literal.create(new JBigDecimal(raw), DecimalType.Unlimited)))
+        // Then falls back to string
+        .getOrElse {
+          if (raw == defaultPartitionName) Literal.create(null, NullType)
+          else Literal.create(unescapePathName(raw), StringType)
+        }
+    } else {
+      if (raw == defaultPartitionName) {
+        Literal.create(null, NullType)
+      } else {
+        Literal.create(unescapePathName(raw), StringType)
       }
+    }
   }
 
   private val upCastingOrder: Seq[DataType] =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 25887ba9a15b0..d1547fb1e4abb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -491,9 +491,11 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   }
 
   private def discoverPartitions(): PartitionSpec = {
+    val typeInference = sqlContext.conf.partitionColumnTypeInferenceEnabled()
     // We use leaf dirs containing data files to discover the schema.
     val leafDirs = fileStatusCache.leafDirToChildrenFiles.keys.toSeq
-    PartitioningUtils.parsePartitions(leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME)
+    PartitioningUtils.parsePartitions(leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME,
+      typeInference)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index d9a010a9815a1..c2f1cc8ffd1fb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -48,7 +48,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
 
   test("column type inference") {
     def check(raw: String, literal: Literal): Unit = {
-      assert(inferPartitionColumnValue(raw, defaultPartitionName) === literal)
+      assert(inferPartitionColumnValue(raw, defaultPartitionName, true) === literal)
     }
 
     check("10", Literal.create(10, IntegerType))
@@ -60,12 +60,12 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
 
   test("parse partition") {
     def check(path: String, expected: Option[PartitionValues]): Unit = {
-      assert(expected === parsePartition(new Path(path), defaultPartitionName))
+      assert(expected === parsePartition(new Path(path), defaultPartitionName, true))
     }
 
     def checkThrows[T <: Throwable: Manifest](path: String, expected: String): Unit = {
       val message = intercept[T] {
-        parsePartition(new Path(path), defaultPartitionName).get
+        parsePartition(new Path(path), defaultPartitionName, true).get
       }.getMessage
 
       assert(message.contains(expected))
@@ -105,7 +105,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
 
   test("parse partitions") {
     def check(paths: Seq[String], spec: PartitionSpec): Unit = {
-      assert(parsePartitions(paths.map(new Path(_)), defaultPartitionName) === spec)
+      assert(parsePartitions(paths.map(new Path(_)), defaultPartitionName, true) === spec)
     }
 
     check(Seq(
@@ -174,6 +174,77 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       PartitionSpec.emptySpec)
   }
 
+  test("parse partitions with type inference disabled") {
+    def check(paths: Seq[String], spec: PartitionSpec): Unit = {
+      assert(parsePartitions(paths.map(new Path(_)), defaultPartitionName, false) === spec)
+    }
+
+    check(Seq(
+      "hdfs://host:9000/path/a=10/b=hello"),
+      PartitionSpec(
+        StructType(Seq(
+          StructField("a", StringType),
+          StructField("b", StringType))),
+        Seq(Partition(Row("10", "hello"), "hdfs://host:9000/path/a=10/b=hello"))))
+
+    check(Seq(
+      "hdfs://host:9000/path/a=10/b=20",
+      "hdfs://host:9000/path/a=10.5/b=hello"),
+      PartitionSpec(
+        StructType(Seq(
+          StructField("a", StringType),
+          StructField("b", StringType))),
+        Seq(
+          Partition(Row("10", "20"), "hdfs://host:9000/path/a=10/b=20"),
+          Partition(Row("10.5", "hello"), "hdfs://host:9000/path/a=10.5/b=hello"))))
+
+    check(Seq(
+      "hdfs://host:9000/path/_temporary",
+      "hdfs://host:9000/path/a=10/b=20",
+      "hdfs://host:9000/path/a=10.5/b=hello",
+      "hdfs://host:9000/path/a=10.5/_temporary",
+      "hdfs://host:9000/path/a=10.5/_TeMpOrArY",
+      "hdfs://host:9000/path/a=10.5/b=hello/_temporary",
+      "hdfs://host:9000/path/a=10.5/b=hello/_TEMPORARY",
+      "hdfs://host:9000/path/_temporary/path",
+      "hdfs://host:9000/path/a=11/_temporary/path",
+      "hdfs://host:9000/path/a=10.5/b=world/_temporary/path"),
+      PartitionSpec(
+        StructType(Seq(
+          StructField("a", StringType),
+          StructField("b", StringType))),
+        Seq(
+          Partition(Row("10", "20"), "hdfs://host:9000/path/a=10/b=20"),
+          Partition(Row("10.5", "hello"), "hdfs://host:9000/path/a=10.5/b=hello"))))
+
+    check(Seq(
+      s"hdfs://host:9000/path/a=10/b=20",
+      s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello"),
+      PartitionSpec(
+        StructType(Seq(
+          StructField("a", StringType),
+          StructField("b", StringType))),
+        Seq(
+          Partition(Row("10", "20"), s"hdfs://host:9000/path/a=10/b=20"),
+          Partition(Row(null, "hello"), s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello"))))
+
+    check(Seq(
+      s"hdfs://host:9000/path/a=10/b=$defaultPartitionName",
+      s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"),
+      PartitionSpec(
+        StructType(Seq(
+          StructField("a", StringType),
+          StructField("b", StringType))),
+        Seq(
+          Partition(Row("10", null), s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"),
+          Partition(Row("10.5", null), s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"))))
+
+    check(Seq(
+      s"hdfs://host:9000/path1",
+      s"hdfs://host:9000/path2"),
+      PartitionSpec.emptySpec)
+  }
+
   test("read partitioned table - normal case") {
     withTempDir { base =>
       for {

From a1d9e5cc60d317ecf8fe390b66b623ae39c4534d Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Mon, 8 Jun 2015 15:37:28 +0100
Subject: [PATCH 410/525] [SPARK-8126] [BUILD] Use custom temp directory during
 build.

Even with all the efforts to cleanup the temp directories created by
unit tests, Spark leaves a lot of garbage in /tmp after a test run.
This change overrides java.io.tmpdir to place those files under the
build directory instead.

After an sbt full unit test run, I was left with > 400 MB of temp
files. Since they're now under the build dir, it's much easier to
clean them up.

Also make a slight change to a unit test to make it not pollute the
source directory with test data.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6674 from vanzin/SPARK-8126 and squashes the following commits:

0f8ad41 [Marcelo Vanzin] Make sure tmp dir exists when tests run.
643e916 [Marcelo Vanzin] [MINOR] [BUILD] Use custom temp directory during build.
---
 .../spark/deploy/SparkSubmitUtilsSuite.scala  | 22 +++++++++--------
 pom.xml                                       | 24 ++++++++++++++++++-
 project/SparkBuild.scala                      |  6 +++++
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index 8fda5c8b472c9..07d261cc428c4 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -28,9 +28,12 @@ import org.apache.ivy.plugins.resolver.IBiblioResolver
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
+import org.apache.spark.util.Utils
 
 class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
 
+  private var tempIvyPath: String = _
+
   private val noOpOutputStream = new OutputStream {
     def write(b: Int) = {}
   }
@@ -47,6 +50,7 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
     super.beforeAll()
     // We don't want to write logs during testing
     SparkSubmitUtils.printStream = new BufferPrintStream
+    tempIvyPath = Utils.createTempDir(namePrefix = "ivy").getAbsolutePath()
   }
 
   test("incorrect maven coordinate throws error") {
@@ -90,21 +94,20 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
   }
 
   test("ivy path works correctly") {
-    val ivyPath = "dummy" + File.separator +  "ivy"
     val md = SparkSubmitUtils.getModuleDescriptor
     val artifacts = for (i <- 0 until 3) yield new MDArtifact(md, s"jar-$i", "jar", "jar")
-    var jPaths = SparkSubmitUtils.resolveDependencyPaths(artifacts.toArray, new File(ivyPath))
+    var jPaths = SparkSubmitUtils.resolveDependencyPaths(artifacts.toArray, new File(tempIvyPath))
     for (i <- 0 until 3) {
-      val index = jPaths.indexOf(ivyPath)
+      val index = jPaths.indexOf(tempIvyPath)
       assert(index >= 0)
-      jPaths = jPaths.substring(index + ivyPath.length)
+      jPaths = jPaths.substring(index + tempIvyPath.length)
     }
     val main = MavenCoordinate("my.awesome.lib", "mylib", "0.1")
     IvyTestUtils.withRepository(main, None, None) { repo =>
       // end to end
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, Option(repo),
-        Option(ivyPath), true)
-      assert(jarPath.indexOf(ivyPath) >= 0, "should use non-default ivy path")
+        Option(tempIvyPath), true)
+      assert(jarPath.indexOf(tempIvyPath) >= 0, "should use non-default ivy path")
     }
   }
 
@@ -123,13 +126,12 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
     }
     // Local ivy repository with modified home
-    val dummyIvyPath = "dummy" + File.separator + "ivy"
-    val dummyIvyLocal = new File(dummyIvyPath, "local" + File.separator)
+    val dummyIvyLocal = new File(tempIvyPath, "local" + File.separator)
     IvyTestUtils.withRepository(main, None, Some(dummyIvyLocal), true) { repo =>
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None,
-        Some(dummyIvyPath), true)
+        Some(tempIvyPath), true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
-      assert(jarPath.indexOf(dummyIvyPath) >= 0, "should be in new ivy path")
+      assert(jarPath.indexOf(tempIvyPath) >= 0, "should be in new ivy path")
     }
   }
 
diff --git a/pom.xml b/pom.xml
index 67b6375f576d3..5a5d183e3dcca 100644
--- a/pom.xml
+++ b/pom.xml
@@ -179,7 +179,7 @@
     <parquet.deps.scope>compile</parquet.deps.scope>
 
     <!--
-      Overridable test home. So that you can call individual pom files directory without
+      Overridable test home. So that you can call individual pom files directly without
       things breaking.
     -->
     <spark.test.home>${session.executionRootDirectory}</spark.test.home>
@@ -1256,6 +1256,7 @@
             <systemProperties>
               <derby.system.durability>test</derby.system.durability>
               <java.awt.headless>true</java.awt.headless>
+              <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir>
               <spark.test.home>${spark.test.home}</spark.test.home>
               <spark.testing>1</spark.testing>
               <spark.ui.enabled>false</spark.ui.enabled>
@@ -1289,6 +1290,7 @@
             <systemProperties>
               <derby.system.durability>test</derby.system.durability>
               <java.awt.headless>true</java.awt.headless>
+              <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir>
               <spark.test.home>${spark.test.home}</spark.test.home>
               <spark.testing>1</spark.testing>
               <spark.ui.enabled>false</spark.ui.enabled>
@@ -1548,6 +1550,26 @@
           </execution>
         </executions>
       </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-antrun-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>create-tmp-dir</id>
+            <phase>generate-test-resources</phase>
+            <goals>
+              <goal>run</goal>
+            </goals>
+            <configuration>
+              <target>
+                <mkdir dir="${project.build.directory}/tmp" />
+              </target>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+
       <!-- Enable surefire and scalatest in all children, in one place: -->
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index ef3a175bac209..d7e374558c5e2 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -51,6 +51,11 @@ object BuildCommons {
   // Root project.
   val spark = ProjectRef(buildLocation, "spark")
   val sparkHome = buildLocation
+
+  val testTempDir = s"$sparkHome/target/tmp"
+  if (!new File(testTempDir).isDirectory()) {
+    require(new File(testTempDir).mkdirs())
+  }
 }
 
 object SparkBuild extends PomBuild {
@@ -496,6 +501,7 @@ object TestSettings {
       "SPARK_DIST_CLASSPATH" ->
         (fullClasspath in Test).value.files.map(_.getAbsolutePath).mkString(":").stripSuffix(":"),
       "JAVA_HOME" -> sys.env.get("JAVA_HOME").getOrElse(sys.props("java.home"))),
+    javaOptions in Test += s"-Djava.io.tmpdir=$testTempDir",
     javaOptions in Test += "-Dspark.test.home=" + sparkHome,
     javaOptions in Test += "-Dspark.testing=1",
     javaOptions in Test += "-Dspark.port.maxRetries=100",

From e3e9c70384028cc0c322ccea14f19d3b6d6b39eb Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Mon, 8 Jun 2015 15:45:12 +0100
Subject: [PATCH 411/525] [SPARK-8140] [MLLIB] Remove empty model check in
 StreamingLinearAlgorithm

1. Prevent creating a map of data to find numFeatures
2. If model is empty, then initialize with a zero vector of numFeature

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6684 from MechCoder/spark-8140 and squashes the following commits:

7fbf5f9 [MechCoder] [SPARK-8140] Remove empty model check in StreamingLinearAlgorithm And other minor cosmits
---
 .../apache/spark/mllib/optimization/GradientDescent.scala   | 2 +-
 .../spark/mllib/regression/GeneralizedLinearAlgorithm.scala | 6 +++---
 .../spark/mllib/regression/StreamingLinearAlgorithm.scala   | 3 ---
 .../mllib/regression/StreamingLinearRegressionWithSGD.scala | 2 +-
 4 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index 4b7d0589c973b..06e45e10c5bf4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -179,7 +179,7 @@ object GradientDescent extends Logging {
      * if it's L2 updater; for L1 updater, the same logic is followed.
      */
     var regVal = updater.compute(
-      weights, Vectors.dense(new Array[Double](weights.size)), 0, 1, regParam)._2
+      weights, Vectors.zeros(weights.size), 0, 1, regParam)._2
 
     for (i <- 1 to numIterations) {
       val bcWeights = data.context.broadcast(weights)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 26be30ff9d6fd..6709bd79bc820 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -195,11 +195,11 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
      */
     val initialWeights = {
       if (numOfLinearPredictor == 1) {
-        Vectors.dense(new Array[Double](numFeatures))
+        Vectors.zeros(numFeatures)
       } else if (addIntercept) {
-        Vectors.dense(new Array[Double]((numFeatures + 1) * numOfLinearPredictor))
+        Vectors.zeros((numFeatures + 1) * numOfLinearPredictor)
       } else {
-        Vectors.dense(new Array[Double](numFeatures * numOfLinearPredictor))
+        Vectors.zeros(numFeatures * numOfLinearPredictor)
       }
     }
     run(input, initialWeights)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
index cea8f3f47307b..39308e5ae1dde 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
@@ -87,9 +87,6 @@ abstract class StreamingLinearAlgorithm[
         model match {
           case Some(m) =>
             m.weights
-          case None =>
-            val numFeatures = rdd.first().features.size
-            Vectors.dense(numFeatures)
         }
       model = Some(algorithm.run(rdd, initialWeights))
       logInfo("Model updated at time %s".format(time.toString))
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
index a49153bf73c0d..235e043c7754b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
@@ -79,7 +79,7 @@ class StreamingLinearRegressionWithSGD private[mllib] (
     this
   }
 
-  /** Set the initial weights. Default: [0.0, 0.0]. */
+  /** Set the initial weights. */
   def setInitialWeights(initialWeights: Vector): this.type = {
     this.model = Some(algorithm.createModel(initialWeights, 0.0))
     this

From 149d1b28e899177ed170292fd2af30aad5a610e0 Mon Sep 17 00:00:00 2001
From: Mingfei <mingfei.shi@intel.com>
Date: Mon, 8 Jun 2015 16:23:43 +0100
Subject: [PATCH 412/525] [SMALL FIX] Return null if catch EOFException

Return null if catch EOFException, just like function "asKeyValueIterator" in this class

Author: Mingfei <mingfei.shi@intel.com>

Closes #6703 from shimingfei/returnNull and squashes the following commits:

205deec [Mingfei] return null if catch EOFException
---
 core/src/main/scala/org/apache/spark/serializer/Serializer.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
index f1bdff96d3df1..bd2704dc81871 100644
--- a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
@@ -182,6 +182,7 @@ abstract class DeserializationStream {
       } catch {
         case eof: EOFException =>
           finished = true
+          null
       }
     }
 

From 49f19b954b32d57d03ca0e25ea4205d01e794d48 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Mon, 8 Jun 2015 09:41:06 -0700
Subject: [PATCH 413/525] [MINOR] change new Exception to
 IllegalArgumentException

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #6434 from adrian-wang/joinerr and squashes the following commits:

ee1b64f [Daoyuan Wang] break line
f7c53e9 [Daoyuan Wang] to IllegalArgumentException
f8dea2d [Daoyuan Wang] sys.err to IllegalStateException
be82259 [Daoyuan Wang] change new exception to sys.err
---
 .../apache/spark/sql/execution/joins/HashOuterJoin.scala  | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index 45574392996ca..c21a453115292 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -48,7 +48,8 @@ case class HashOuterJoin(
     case LeftOuter => left.outputPartitioning
     case RightOuter => right.outputPartitioning
     case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions)
-    case x => throw new Exception(s"HashOuterJoin should not take $x as the JoinType")
+    case x =>
+      throw new IllegalArgumentException(s"HashOuterJoin should not take $x as the JoinType")
   }
 
   override def requiredChildDistribution: Seq[ClusteredDistribution] =
@@ -63,7 +64,7 @@ case class HashOuterJoin(
       case FullOuter =>
         left.output.map(_.withNullability(true)) ++ right.output.map(_.withNullability(true))
       case x =>
-        throw new Exception(s"HashOuterJoin should not take $x as the JoinType")
+        throw new IllegalArgumentException(s"HashOuterJoin should not take $x as the JoinType")
     }
   }
 
@@ -216,7 +217,8 @@ case class HashOuterJoin(
               rightHashTable.getOrElse(key, EMPTY_LIST), joinedRow)
           }
 
-        case x => throw new Exception(s"HashOuterJoin should not take $x as the JoinType")
+        case x =>
+          throw new IllegalArgumentException(s"HashOuterJoin should not take $x as the JoinType")
       }
     }
   }

From ed5c2dccd0397c4c4b0008c437e6845dd583c9c2 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Mon, 8 Jun 2015 11:06:27 -0700
Subject: [PATCH 414/525] [SPARK-8158] [SQL] several fix for HiveShim

1. explicitly import implicit conversion support.
2. use .nonEmpty instead of .size > 0
3. use val instead of var
4. comment indention

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #6700 from adrian-wang/shimsimprove and squashes the following commits:

d22e108 [Daoyuan Wang] several fix for HiveShim
---
 .../org/apache/spark/sql/hive/HiveShim.scala  | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
index fa5409f602444..d08c594151654 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
@@ -20,6 +20,11 @@ package org.apache.spark.sql.hive
 import java.io.{InputStream, OutputStream}
 import java.rmi.server.UID
 
+/* Implicit conversions */
+import scala.collection.JavaConversions._
+import scala.language.implicitConversions
+import scala.reflect.ClassTag
+
 import com.esotericsoftware.kryo.Kryo
 import com.esotericsoftware.kryo.io.{Input, Output}
 import org.apache.hadoop.conf.Configuration
@@ -35,10 +40,6 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.types.Decimal
 import org.apache.spark.util.Utils
 
-/* Implicit conversions */
-import scala.collection.JavaConversions._
-import scala.reflect.ClassTag
-
 private[hive] object HiveShim {
   // Precision and scale to pass for unlimited decimals; these are the same as the precision and
   // scale Hive 0.13 infers for BigDecimals from sources that don't specify them (e.g. UDFs)
@@ -68,10 +69,10 @@ private[hive] object HiveShim {
    * Cannot use ColumnProjectionUtils.appendReadColumns directly, if ids is null or empty
    */
   def appendReadColumns(conf: Configuration, ids: Seq[Integer], names: Seq[String]) {
-    if (ids != null && ids.size > 0) {
+    if (ids != null && ids.nonEmpty) {
       ColumnProjectionUtils.appendReadColumns(conf, ids)
     }
-    if (names != null && names.size > 0) {
+    if (names != null && names.nonEmpty) {
       appendReadColumnNames(conf, names)
     }
   }
@@ -197,11 +198,11 @@ private[hive] object HiveShim {
   }
 
   /*
- * Bug introduced in hive-0.13. FileSinkDesc is serializable, but its member path is not.
- * Fix it through wrapper.
- * */
+   * Bug introduced in hive-0.13. FileSinkDesc is serializable, but its member path is not.
+   * Fix it through wrapper.
+   */
   implicit def wrapperToFileSinkDesc(w: ShimFileSinkDesc): FileSinkDesc = {
-    var f = new FileSinkDesc(new Path(w.dir), w.tableInfo, w.compressed)
+    val f = new FileSinkDesc(new Path(w.dir), w.tableInfo, w.compressed)
     f.setCompressCodec(w.compressCodec)
     f.setCompressType(w.compressType)
     f.setTableInfo(w.tableInfo)

From bbdfc0a40fb39760c122e7b9ce80aa1e340e55ee Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Mon, 8 Jun 2015 11:34:18 -0700
Subject: [PATCH 415/525] [SPARK-8121] [SQL] Fixes InsertIntoHadoopFsRelation
 job initialization for Hadoop 1.x

For Hadoop 1.x, `TaskAttemptContext` constructor clones the `Configuration` argument, thus configurations done in `HadoopFsRelation.prepareForWriteJob()` are not populated to *driver* side `TaskAttemptContext` (executor side configurations are properly populated). Currently this should only affect Parquet output committer class configuration.

Author: Cheng Lian <lian@databricks.com>

Closes #6669 from liancheng/spark-8121 and squashes the following commits:

73819e8 [Cheng Lian] Minor logging fix
fce089c [Cheng Lian] Adds more logging
b6f78a6 [Cheng Lian] Fixes compilation error introduced while rebasing
963a1aa [Cheng Lian] Addresses @yhuai's comment
c3a0b1a [Cheng Lian] Fixes InsertIntoHadoopFsRelation job initialization
---
 .../scala/org/apache/spark/sql/SQLConf.scala  |  1 +
 .../apache/spark/sql/parquet/newParquet.scala |  7 +++
 .../apache/spark/sql/sources/commands.scala   | 18 +++++--
 .../spark/sql/parquet/ParquetIOSuite.scala    | 52 ++++++++++++++++---
 4 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index c778889045d02..be786f9b7f49e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -76,6 +76,7 @@ private[spark] object SQLConf {
 
   // The output committer class used by FSBasedRelation. The specified class needs to be a
   // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
+  // NOTE: This property should be set in Hadoop `Configuration` rather than Spark `SQLConf`
   val OUTPUT_COMMITTER_CLASS = "spark.sql.sources.outputCommitterClass"
 
   // Whether to perform eager analysis when constructing a dataframe.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 5dda440240e60..7af4eb1ca4716 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -212,6 +212,13 @@ private[sql] class ParquetRelation2(
         classOf[ParquetOutputCommitter],
         classOf[ParquetOutputCommitter])
 
+    if (conf.get("spark.sql.parquet.output.committer.class") == null) {
+      logInfo("Using default output committer for Parquet: " +
+        classOf[ParquetOutputCommitter].getCanonicalName)
+    } else {
+      logInfo("Using user defined output committer for Parquet: " + committerClass.getCanonicalName)
+    }
+
     conf.setClass(
       SQLConf.OUTPUT_COMMITTER_CLASS,
       committerClass,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index bd3aad6631748..c94199bfcd233 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -297,12 +297,16 @@ private[sql] abstract class BaseWriterContainer(
   def driverSideSetup(): Unit = {
     setupIDs(0, 0, 0)
     setupConf()
-    taskAttemptContext = newTaskAttemptContext(serializableConf.value, taskAttemptId)
 
-    // This preparation must happen before initializing output format and output committer, since
-    // their initialization involves the job configuration, which can be potentially decorated in
-    // `relation.prepareJobForWrite`.
+    // Order of the following two lines is important.  For Hadoop 1, TaskAttemptContext constructor
+    // clones the Configuration object passed in.  If we initialize the TaskAttemptContext first,
+    // configurations made in prepareJobForWrite(job) are not populated into the TaskAttemptContext.
+    //
+    // Also, the `prepareJobForWrite` call must happen before initializing output format and output
+    // committer, since their initialization involve the job configuration, which can be potentially
+    // decorated in `prepareJobForWrite`.
     outputWriterFactory = relation.prepareJobForWrite(job)
+    taskAttemptContext = newTaskAttemptContext(serializableConf.value, taskAttemptId)
 
     outputFormatClass = job.getOutputFormatClass
     outputCommitter = newOutputCommitter(taskAttemptContext)
@@ -331,6 +335,8 @@ private[sql] abstract class BaseWriterContainer(
       SQLConf.OUTPUT_COMMITTER_CLASS, null, classOf[OutputCommitter])
 
     Option(committerClass).map { clazz =>
+      logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}")
+
       // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat
       // has an associated output committer. To override this output committer,
       // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS.
@@ -350,7 +356,9 @@ private[sql] abstract class BaseWriterContainer(
     }.getOrElse {
       // If output committer class is not set, we will use the one associated with the
       // file output format.
-      outputFormatClass.newInstance().getOutputCommitter(context)
+      val outputCommitter = outputFormatClass.newInstance().getOutputCommitter(context)
+      logInfo(s"Using output committer class ${outputCommitter.getClass.getCanonicalName}")
+      outputCommitter
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index 2b6a27032e637..46b25859d9a68 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -23,16 +23,18 @@ import scala.reflect.runtime.universe.TypeTag
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
-import org.scalatest.BeforeAndAfterAll
+import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 import org.apache.parquet.example.data.simple.SimpleGroup
 import org.apache.parquet.example.data.{Group, GroupWriter}
 import org.apache.parquet.hadoop.api.WriteSupport
 import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
-import org.apache.parquet.hadoop.metadata.{ParquetMetadata, FileMetaData, CompressionCodecName}
-import org.apache.parquet.hadoop.{Footer, ParquetFileWriter, ParquetWriter}
+import org.apache.parquet.hadoop.metadata.{CompressionCodecName, FileMetaData, ParquetMetadata}
+import org.apache.parquet.hadoop.{Footer, ParquetFileWriter, ParquetOutputCommitter, ParquetWriter}
 import org.apache.parquet.io.api.RecordConsumer
 import org.apache.parquet.schema.{MessageType, MessageTypeParser}
+import org.scalatest.BeforeAndAfterAll
 
+import org.apache.spark.SparkException
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.catalyst.util.DateUtils
@@ -196,7 +198,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
 
     withParquetDataFrame(allNulls :: Nil) { df =>
       val rows = df.collect()
-      assert(rows.size === 1)
+      assert(rows.length === 1)
       assert(rows.head === Row(Seq.fill(5)(null): _*))
     }
   }
@@ -209,7 +211,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
 
     withParquetDataFrame(allNones :: Nil) { df =>
       val rows = df.collect()
-      assert(rows.size === 1)
+      assert(rows.length === 1)
       assert(rows.head === Row(Seq.fill(3)(null): _*))
     }
   }
@@ -379,6 +381,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
   }
 
   test("SPARK-6352 DirectParquetOutputCommitter") {
+    val clonedConf = new Configuration(configuration)
+
     // Write to a parquet file and let it fail.
     // _temporary should be missing if direct output committer works.
     try {
@@ -393,14 +397,46 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
         val fs = path.getFileSystem(configuration)
         assert(!fs.exists(path))
       }
+    } finally {
+      // Hadoop 1 doesn't have `Configuration.unset`
+      configuration.clear()
+      clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue))
     }
-    finally {
-      configuration.set("spark.sql.parquet.output.committer.class",
-        "org.apache.parquet.hadoop.ParquetOutputCommitter")
+  }
+
+  test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overriden") {
+    withTempPath { dir =>
+      val clonedConf = new Configuration(configuration)
+
+      configuration.set(
+        SQLConf.OUTPUT_COMMITTER_CLASS, classOf[ParquetOutputCommitter].getCanonicalName)
+
+      configuration.set(
+        "spark.sql.parquet.output.committer.class",
+        classOf[BogusParquetOutputCommitter].getCanonicalName)
+
+      try {
+        val message = intercept[SparkException] {
+          sqlContext.range(0, 1).write.parquet(dir.getCanonicalPath)
+        }.getCause.getMessage
+        assert(message === "Intentional exception for testing purposes")
+      } finally {
+        // Hadoop 1 doesn't have `Configuration.unset`
+        configuration.clear()
+        clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue))
+      }
     }
   }
 }
 
+class BogusParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
+  extends ParquetOutputCommitter(outputPath, context) {
+
+  override def commitJob(jobContext: JobContext): Unit = {
+    sys.error("Intentional exception for testing purposes")
+  }
+}
+
 class ParquetDataSourceOnIOSuite extends ParquetIOSuiteBase with BeforeAndAfterAll {
   private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 

From fe7669d3072b72954ad0c3f2f8846a0fde839ead Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 8 Jun 2015 11:52:02 -0700
Subject: [PATCH 416/525] [SQL][minor] remove duplicated cases in
 `DecimalPrecision`

We already have a rule to do type coercion for fixed decimal and unlimited decimal in `WidenTypes`, so we don't need to handle them in `DecimalPrecision`.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6698 from cloud-fan/fix and squashes the following commits:

413ad4a [Wenchen Fan] remove duplicated cases
---
 .../spark/sql/catalyst/analysis/HiveTypeCoercion.scala      | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index a42ffce0d26fa..737905c3582ba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -445,12 +445,6 @@ trait HiveTypeCoercion {
                                   e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 =>
           val resultType = DecimalType(max(p1, p2), max(s1, s2))
           b.makeCopy(Array(Cast(e1, resultType), Cast(e2, resultType)))
-        case b @ BinaryComparison(e1 @ DecimalType.Fixed(_, _), e2)
-          if e2.dataType == DecimalType.Unlimited =>
-          b.makeCopy(Array(Cast(e1, DecimalType.Unlimited), e2))
-        case b @ BinaryComparison(e1, e2 @ DecimalType.Fixed(_, _))
-          if e1.dataType == DecimalType.Unlimited =>
-          b.makeCopy(Array(e1, Cast(e2, DecimalType.Unlimited)))
 
         // Promote integers inside a binary expression with fixed-precision decimals to decimals,
         // and fixed-precision decimals in an expression with floats / doubles to doubles

From 51853891686f353dc9decc31066b0de01ed8b49e Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 8 Jun 2015 13:15:44 -0700
Subject: [PATCH 417/525] [SPARK-8148] Do not use FloatType in partition column
 inference.

Use DoubleType instead to be more stable and robust.

Author: Reynold Xin <rxin@databricks.com>

Closes #6692 from rxin/SPARK-8148 and squashes the following commits:

6742ecc [Reynold Xin] [SPARK-8148] Do not use FloatType in partition column inference.
---
 .../spark/sql/sources/PartitioningUtils.scala    | 16 +++++++++-------
 .../parquet/ParquetPartitionDiscoverySuite.scala | 12 ++++++------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
index 9f6ec2ed8fc8d..7a2b5b949dd4e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.sources
 
-import java.lang.{Double => JDouble, Float => JFloat, Long => JLong}
+import java.lang.{Double => JDouble, Float => JFloat, Integer => JInteger, Long => JLong}
 import java.math.{BigDecimal => JBigDecimal}
 
 import scala.collection.mutable.ArrayBuffer
@@ -178,7 +178,7 @@ private[sql] object PartitioningUtils {
    * {{{
    *   NullType ->
    *   IntegerType -> LongType ->
-   *   FloatType -> DoubleType -> DecimalType.Unlimited ->
+   *   DoubleType -> DecimalType.Unlimited ->
    *   StringType
    * }}}
    */
@@ -208,8 +208,8 @@ private[sql] object PartitioningUtils {
   }
 
   /**
-   * Converts a string to a `Literal` with automatic type inference.  Currently only supports
-   * [[IntegerType]], [[LongType]], [[FloatType]], [[DoubleType]], [[DecimalType.Unlimited]], and
+   * Converts a string to a [[Literal]] with automatic type inference.  Currently only supports
+   * [[IntegerType]], [[LongType]], [[DoubleType]], [[DecimalType.Unlimited]], and
    * [[StringType]].
    */
   private[sql] def inferPartitionColumnValue(
@@ -221,13 +221,15 @@ private[sql] object PartitioningUtils {
       Try(Literal.create(Integer.parseInt(raw), IntegerType))
         .orElse(Try(Literal.create(JLong.parseLong(raw), LongType)))
         // Then falls back to fractional types
-        .orElse(Try(Literal.create(JFloat.parseFloat(raw), FloatType)))
         .orElse(Try(Literal.create(JDouble.parseDouble(raw), DoubleType)))
         .orElse(Try(Literal.create(new JBigDecimal(raw), DecimalType.Unlimited)))
         // Then falls back to string
         .getOrElse {
-          if (raw == defaultPartitionName) Literal.create(null, NullType)
-          else Literal.create(unescapePathName(raw), StringType)
+          if (raw == defaultPartitionName) {
+            Literal.create(null, NullType)
+          } else {
+            Literal.create(unescapePathName(raw), StringType)
+          }
         }
     } else {
       if (raw == defaultPartitionName) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index c2f1cc8ffd1fb..3240079483545 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -53,7 +53,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
 
     check("10", Literal.create(10, IntegerType))
     check("1000000000000000", Literal.create(1000000000000000L, LongType))
-    check("1.5", Literal.create(1.5f, FloatType))
+    check("1.5", Literal.create(1.5, DoubleType))
     check("hello", Literal.create("hello", StringType))
     check(defaultPartitionName, Literal.create(null, NullType))
   }
@@ -83,13 +83,13 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         ArrayBuffer(
           Literal.create(10, IntegerType),
           Literal.create("hello", StringType),
-          Literal.create(1.5f, FloatType)))
+          Literal.create(1.5, DoubleType)))
     })
 
     check("file://path/a=10/b_hello/c=1.5", Some {
       PartitionValues(
         ArrayBuffer("c"),
-        ArrayBuffer(Literal.create(1.5f, FloatType)))
+        ArrayBuffer(Literal.create(1.5, DoubleType)))
     })
 
     check("file:///", None)
@@ -121,7 +121,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       "hdfs://host:9000/path/a=10.5/b=hello"),
       PartitionSpec(
         StructType(Seq(
-          StructField("a", FloatType),
+          StructField("a", DoubleType),
           StructField("b", StringType))),
         Seq(
           Partition(Row(10, "20"), "hdfs://host:9000/path/a=10/b=20"),
@@ -140,7 +140,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       "hdfs://host:9000/path/a=10.5/b=world/_temporary/path"),
       PartitionSpec(
         StructType(Seq(
-          StructField("a", FloatType),
+          StructField("a", DoubleType),
           StructField("b", StringType))),
         Seq(
           Partition(Row(10, "20"), "hdfs://host:9000/path/a=10/b=20"),
@@ -162,7 +162,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"),
       PartitionSpec(
         StructType(Seq(
-          StructField("a", FloatType),
+          StructField("a", DoubleType),
           StructField("b", StringType))),
         Seq(
           Partition(Row(10, null), s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"),

From f3eec92ce7e13cc461d2f0404f26730259210f12 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Mon, 8 Jun 2015 18:09:21 -0700
Subject: [PATCH 418/525] [SPARK-8162] [HOTFIX] Fix NPE in spark-shell

This was caused by this commit: f271347

This patch does not attempt to fix the root cause of why the `VisibleForTesting` annotation causes a NPE in the shell. We should find a way to fix that separately.

Author: Andrew Or <andrew@databricks.com>

Closes #6711 from andrewor14/fix-spark-shell and squashes the following commits:

bf62ecc [Andrew Or] Prevent NPE in spark-shell
---
 .../scala/org/apache/spark/ui/jobs/JobProgressListener.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
index 730f9806e518e..0c854f04890b6 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
@@ -539,11 +539,11 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging {
   /**
    * For testing only. Wait until at least `numExecutors` executors are up, or throw
    * `TimeoutException` if the waiting time elapsed before `numExecutors` executors up.
+   * Exposed for testing.
    *
    * @param numExecutors the number of executors to wait at least
    * @param timeout time to wait in milliseconds
    */
-  @VisibleForTesting
   private[spark] def waitUntilExecutorsUp(numExecutors: Int, timeout: Long): Unit = {
     val finishTime = System.currentTimeMillis() + timeout
     while (System.currentTimeMillis() < finishTime) {

From 82870d507dfaeeaf315d6766ca1496205c6216d3 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Mon, 8 Jun 2015 21:33:47 -0700
Subject: [PATCH 419/525] [SPARK-8168] [MLLIB] Add Python friendly constructor
 to PipelineModel

This makes the constructor callable in Python. dbtsai

Author: Xiangrui Meng <meng@databricks.com>

Closes #6709 from mengxr/SPARK-8168 and squashes the following commits:

f871de4 [Xiangrui Meng] Add Python friendly constructor to PipelineModel
---
 .../scala/org/apache/spark/ml/Pipeline.scala    |  8 ++++++++
 .../org/apache/spark/ml/PipelineSuite.scala     | 17 +++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index 11a4722722ea1..a9bd28df71ee1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark.ml
 
+import java.{util => ju}
+
+import scala.collection.JavaConverters._
 import scala.collection.mutable.ListBuffer
 
 import org.apache.spark.Logging
@@ -175,6 +178,11 @@ class PipelineModel private[ml] (
     val stages: Array[Transformer])
   extends Model[PipelineModel] with Logging {
 
+  /** A Java/Python-friendly auxiliary constructor. */
+  private[ml] def this(uid: String, stages: ju.List[Transformer]) = {
+    this(uid, stages.asScala.toArray)
+  }
+
   override def validateParams(): Unit = {
     super.validateParams()
     stages.foreach(_.validateParams())
diff --git a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
index 05bf58e63abaf..29394fefcbc43 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.ml
 
+import scala.collection.JavaConverters._
+
 import org.mockito.Matchers.{any, eq => meq}
 import org.mockito.Mockito.when
 import org.scalatest.mock.MockitoSugar.mock
@@ -81,4 +83,19 @@ class PipelineSuite extends SparkFunSuite {
       pipeline.fit(dataset)
     }
   }
+
+  test("pipeline model constructors") {
+    val transform0 = mock[Transformer]
+    val model1 = mock[MyModel]
+
+    val stages = Array(transform0, model1)
+    val pipelineModel0 = new PipelineModel("pipeline0", stages)
+    assert(pipelineModel0.uid === "pipeline0")
+    assert(pipelineModel0.stages === stages)
+
+    val stagesAsList = stages.toList.asJava
+    val pipelineModel1 = new PipelineModel("pipeline1", stagesAsList)
+    assert(pipelineModel1.uid === "pipeline1")
+    assert(pipelineModel1.stages === stages)
+  }
 }

From a5c52c1a3488b69bec19e460d2d1fdb0c9ada58d Mon Sep 17 00:00:00 2001
From: hqzizania <qian.huang@intel.com>
Date: Mon, 8 Jun 2015 21:40:12 -0700
Subject: [PATCH 420/525] [SPARK-6820] [SPARKR] Convert NAs to null type in
 SparkR DataFrames

Author: hqzizania <qian.huang@intel.com>

Closes #6190 from hqzizania/R and squashes the following commits:

1641f9e [hqzizania] fixes and add test units
bb3411a [hqzizania] Convert NAs to null type in SparkR DataFrames
---
 R/pkg/R/serialize.R              |  8 +++++++
 R/pkg/inst/tests/test_sparkSQL.R | 37 ++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
index 2081786e6f833..3169d7968f8fe 100644
--- a/R/pkg/R/serialize.R
+++ b/R/pkg/R/serialize.R
@@ -37,6 +37,14 @@ writeObject <- function(con, object, writeType = TRUE) {
   # passing in vectors as arrays and instead require arrays to be passed
   # as lists.
   type <- class(object)[[1]]  # class of POSIXlt is c("POSIXlt", "POSIXt")
+  # Checking types is needed here, since ‘is.na’ only handles atomic vectors,
+  # lists and pairlists
+  if (type %in% c("integer", "character", "logical", "double", "numeric")) {
+    if (is.na(object)) {
+      object <- NULL
+      type <- "NULL"
+    }
+  }
   if (writeType) {
     writeType(con, type)
   }
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 30edfc8a7bd94..8946348ef801c 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -101,6 +101,43 @@ test_that("create DataFrame from RDD", {
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
 })
 
+test_that("convert NAs to null type in DataFrames", {
+  rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L)))
+  df <- createDataFrame(sqlContext, rdd, list("a", "b"))
+  expect_true(is.na(collect(df)[2, "a"]))
+  expect_equal(collect(df)[2, "b"], 4L)
+
+  l <- data.frame(x = 1L, y = c(1L, NA_integer_, 3L))
+  df <- createDataFrame(sqlContext, l)
+  expect_equal(collect(df)[2, "x"], 1L)
+  expect_true(is.na(collect(df)[2, "y"]))
+
+  rdd <- parallelize(sc, list(list(1, 2), list(NA, 4)))
+  df <- createDataFrame(sqlContext, rdd, list("a", "b"))
+  expect_true(is.na(collect(df)[2, "a"]))
+  expect_equal(collect(df)[2, "b"], 4)
+
+  l <- data.frame(x = 1, y = c(1, NA_real_, 3))
+  df <- createDataFrame(sqlContext, l)
+  expect_equal(collect(df)[2, "x"], 1)
+  expect_true(is.na(collect(df)[2, "y"]))
+
+  l <- list("a", "b", NA, "d")
+  df <- createDataFrame(sqlContext, l)
+  expect_true(is.na(collect(df)[3, "_1"]))
+  expect_equal(collect(df)[4, "_1"], "d")
+
+  l <- list("a", "b", NA_character_, "d")
+  df <- createDataFrame(sqlContext, l)
+  expect_true(is.na(collect(df)[3, "_1"]))
+  expect_equal(collect(df)[4, "_1"], "d")
+
+  l <- list(TRUE, FALSE, NA, TRUE)
+  df <- createDataFrame(sqlContext, l)
+  expect_true(is.na(collect(df)[3, "_1"]))
+  expect_equal(collect(df)[4, "_1"], TRUE)
+})
+
 test_that("toDF", {
   rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
   df <- toDF(rdd, list("a", "b"))

From 7658eb28a2ea28c06e3b5a26f7734a7dc36edc19 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Mon, 8 Jun 2015 23:27:05 -0700
Subject: [PATCH 421/525] [SPARK-7990][SQL] Add methods to facilitate equi-join
 on multiple joining keys

JIRA: https://issues.apache.org/jira/browse/SPARK-7990

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6616 from viirya/multi_keys_equi_join and squashes the following commits:

cd5c888 [Liang-Chi Hsieh] Import reduce in python3.
c43722c [Liang-Chi Hsieh] For comments.
0400e89 [Liang-Chi Hsieh] Fix scala style.
cc90015 [Liang-Chi Hsieh] Add methods to facilitate equi-join on multiple joining keys.
---
 python/pyspark/sql/dataframe.py               | 45 +++++++++++++------
 .../org/apache/spark/sql/DataFrame.scala      | 40 ++++++++++++++---
 .../apache/spark/sql/DataFrameJoinSuite.scala |  9 ++++
 3 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 2d8c59518b35a..e9dd05e2d0c7a 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -22,6 +22,7 @@
 if sys.version >= '3':
     basestring = unicode = str
     long = int
+    from functools import reduce
 else:
     from itertools import imap as map
 
@@ -503,36 +504,52 @@ def alias(self, alias):
 
     @ignore_unicode_prefix
     @since(1.3)
-    def join(self, other, joinExprs=None, joinType=None):
+    def join(self, other, on=None, how=None):
         """Joins with another :class:`DataFrame`, using the given join expression.
 
         The following performs a full outer join between ``df1`` and ``df2``.
 
         :param other: Right side of the join
-        :param joinExprs: a string for join column name, or a join expression (Column).
-            If joinExprs is a string indicating the name of the join column,
-            the column must exist on both sides, and this performs an inner equi-join.
-        :param joinType: str, default 'inner'.
+        :param on: a string for join column name, a list of column names,
+            , a join expression (Column) or a list of Columns.
+            If `on` is a string or a list of string indicating the name of the join column(s),
+            the column(s) must exist on both sides, and this performs an inner equi-join.
+        :param how: str, default 'inner'.
             One of `inner`, `outer`, `left_outer`, `right_outer`, `semijoin`.
 
         >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect()
         [Row(name=None, height=80), Row(name=u'Alice', height=None), Row(name=u'Bob', height=85)]
 
+        >>> cond = [df.name == df3.name, df.age == df3.age]
+        >>> df.join(df3, cond, 'outer').select(df.name, df3.age).collect()
+        [Row(name=u'Bob', age=5), Row(name=u'Alice', age=2)]
+
         >>> df.join(df2, 'name').select(df.name, df2.height).collect()
         [Row(name=u'Bob', height=85)]
+
+        >>> df.join(df4, ['name', 'age']).select(df.name, df.age).collect()
+        [Row(name=u'Bob', age=5)]
         """
 
-        if joinExprs is None:
+        if on is not None and not isinstance(on, list):
+            on = [on]
+
+        if on is None or len(on) == 0:
             jdf = self._jdf.join(other._jdf)
-        elif isinstance(joinExprs, basestring):
-            jdf = self._jdf.join(other._jdf, joinExprs)
+
+        if isinstance(on[0], basestring):
+            jdf = self._jdf.join(other._jdf, self._jseq(on))
         else:
-            assert isinstance(joinExprs, Column), "joinExprs should be Column"
-            if joinType is None:
-                jdf = self._jdf.join(other._jdf, joinExprs._jc)
+            assert isinstance(on[0], Column), "on should be Column or list of Column"
+            if len(on) > 1:
+                on = reduce(lambda x, y: x.__and__(y), on)
+            else:
+                on = on[0]
+            if how is None:
+                jdf = self._jdf.join(other._jdf, on._jc, "inner")
             else:
-                assert isinstance(joinType, basestring), "joinType should be basestring"
-                jdf = self._jdf.join(other._jdf, joinExprs._jc, joinType)
+                assert isinstance(how, basestring), "how should be basestring"
+                jdf = self._jdf.join(other._jdf, on._jc, how)
         return DataFrame(jdf, self.sql_ctx)
 
     @ignore_unicode_prefix
@@ -1315,6 +1332,8 @@ def _test():
         .toDF(StructType([StructField('age', IntegerType()),
                           StructField('name', StringType())]))
     globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF()
+    globs['df3'] = sc.parallelize([Row(name='Alice', age=2),
+                                   Row(name='Bob', age=5)]).toDF()
     globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80),
                                   Row(name='Bob', age=5, height=None),
                                   Row(name='Tom', age=None, height=None),
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 4a224153e1a37..59f64dd4bc648 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -395,22 +395,50 @@ class DataFrame private[sql](
    * @since 1.4.0
    */
   def join(right: DataFrame, usingColumn: String): DataFrame = {
+    join(right, Seq(usingColumn))
+  }
+
+  /**
+   * Inner equi-join with another [[DataFrame]] using the given columns.
+   *
+   * Different from other join functions, the join columns will only appear once in the output,
+   * i.e. similar to SQL's `JOIN USING` syntax.
+   *
+   * {{{
+   *   // Joining df1 and df2 using the columns "user_id" and "user_name"
+   *   df1.join(df2, Seq("user_id", "user_name"))
+   * }}}
+   *
+   * Note that if you perform a self-join using this function without aliasing the input
+   * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since
+   * there is no way to disambiguate which side of the join you would like to reference.
+   *
+   * @param right Right side of the join operation.
+   * @param usingColumns Names of the columns to join on. This columns must exist on both sides.
+   * @group dfops
+   * @since 1.4.0
+   */
+  def join(right: DataFrame, usingColumns: Seq[String]): DataFrame = {
     // Analyze the self join. The assumption is that the analyzer will disambiguate left vs right
     // by creating a new instance for one of the branch.
     val joined = sqlContext.executePlan(
       Join(logicalPlan, right.logicalPlan, joinType = Inner, None)).analyzed.asInstanceOf[Join]
 
-    // Project only one of the join column.
-    val joinedCol = joined.right.resolve(usingColumn)
+    // Project only one of the join columns.
+    val joinedCols = usingColumns.map(col => joined.right.resolve(col))
+    val condition = usingColumns.map { col =>
+      catalyst.expressions.EqualTo(joined.left.resolve(col), joined.right.resolve(col))
+    }.reduceLeftOption[catalyst.expressions.BinaryExpression] { (cond, eqTo) =>
+      catalyst.expressions.And(cond, eqTo)
+    }
+
     Project(
-      joined.output.filterNot(_ == joinedCol),
+      joined.output.filterNot(joinedCols.contains(_)),
       Join(
         joined.left,
         joined.right,
         joinType = Inner,
-        Some(catalyst.expressions.EqualTo(
-          joined.left.resolve(usingColumn),
-          joined.right.resolve(usingColumn))))
+        condition)
     )
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
index 051d13e9a544f..6165764632c29 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
@@ -34,6 +34,15 @@ class DataFrameJoinSuite extends QueryTest {
       Row(1, "1", "2") :: Row(2, "2", "3") :: Row(3, "3", "4") :: Nil)
   }
 
+  test("join - join using multiple columns") {
+    val df = Seq(1, 2, 3).map(i => (i, i + 1, i.toString)).toDF("int", "int2", "str")
+    val df2 = Seq(1, 2, 3).map(i => (i, i + 1, (i + 1).toString)).toDF("int", "int2", "str")
+
+    checkAnswer(
+      df.join(df2, Seq("int", "int2")),
+      Row(1, 2, "1", "2") :: Row(2, 3, "2", "3") :: Row(3, 4, "3", "4") :: Nil)
+  }
+
   test("join - join using self join") {
     val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str")
 

From 0902a11940e550e85a53e110b490fe90e16ddaf4 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Tue, 9 Jun 2015 08:00:04 +0100
Subject: [PATCH 422/525] [SPARK-8101] [CORE] Upgrade netty to avoid memory
 leak accord to netty #3837 issues

Update to Netty 4.0.28-Final

Author: Sean Owen <sowen@cloudera.com>

Closes #6701 from srowen/SPARK-8101 and squashes the following commits:

f3b6369 [Sean Owen] Update to Netty 4.0.28-Final
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 5a5d183e3dcca..e9700a5d7b149 100644
--- a/pom.xml
+++ b/pom.xml
@@ -587,7 +587,7 @@
       <dependency>
         <groupId>io.netty</groupId>
         <artifactId>netty-all</artifactId>
-        <version>4.0.23.Final</version>
+        <version>4.0.28.Final</version>
       </dependency>
       <dependency>
         <groupId>org.apache.derby</groupId>

From 1b499993ad185b04dd5065facb565cbe7e249521 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 9 Jun 2015 16:24:38 +0800
Subject: [PATCH 423/525] [SPARK-7886] Add built-in expressions to
 FunctionRegistry.

This patch switches to using FunctionRegistry for built-in expressions. It is based on #6463, but with some work to simplify it along with unit tests.

TODOs for future pull requests:
- Use static registration so we don't need to register all functions every time we start a new SQLContext
- Switch to using this in HiveContext

Author: Reynold Xin <rxin@databricks.com>
Author: Santiago M. Mola <santi@mola.io>

Closes #6710 from rxin/udf-registry and squashes the following commits:

6930822 [Reynold Xin] Fixed Python test.
b802c9a [Reynold Xin] Made UDF case insensitive.
e60d815 [Reynold Xin] Made UDF case insensitive.
852f9c0 [Reynold Xin] Fixed style violation.
e76a3c1 [Reynold Xin] Fixed parser.
52ddaba [Reynold Xin] Fixed compilation.
ee7854f [Reynold Xin] Improved error reporting.
ff906f2 [Reynold Xin] More robust constructor calling.
77b46f1 [Reynold Xin] Simplified the code.
2a2a149 [Reynold Xin] Merge pull request #6463 from smola/SPARK-7886
8616924 [Santiago M. Mola] [SPARK-7886] Add built-in expressions to FunctionRegistry.
---
 python/pyspark/sql/dataframe.py               |   2 +-
 .../apache/spark/sql/catalyst/SqlParser.scala |  75 +++++------
 .../sql/catalyst/analysis/Analyzer.scala      |   4 +-
 .../catalyst/analysis/FunctionRegistry.scala  | 127 +++++++++++++-----
 .../sql/catalyst/expressions/Expression.scala |   9 ++
 .../sql/catalyst/expressions/random.scala     |  23 +++-
 .../expressions/stringOperations.scala        |   7 +
 .../sql/catalyst/util/StringKeyHashMap.scala  |  44 ++++++
 .../org/apache/spark/sql/SQLContext.scala     |   6 +-
 .../scala/org/apache/spark/sql/UDFSuite.scala |  42 ++++++
 .../apache/spark/sql/hive/HiveContext.scala   |   9 +-
 .../org/apache/spark/sql/hive/hiveUdfs.scala  |  14 +-
 12 files changed, 269 insertions(+), 93 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringKeyHashMap.scala

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index e9dd05e2d0c7a..9615e576497cd 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -746,7 +746,7 @@ def selectExpr(self, *expr):
         This is a variant of :func:`select` that accepts SQL expressions.
 
         >>> df.selectExpr("age * 2", "abs(age)").collect()
-        [Row((age * 2)=4, Abs(age)=2), Row((age * 2)=10, Abs(age)=5)]
+        [Row((age * 2)=4, 'abs(age)=2), Row((age * 2)=10, 'abs(age)=5)]
         """
         if len(expr) == 1 and isinstance(expr[0], list):
             expr = expr[0]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index e85312aee7d16..f74c17d583359 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst
 
 import scala.language.implicitConversions
 
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans._
@@ -48,26 +49,21 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
 
   // Keyword is a convention with AbstractSparkSQLParser, which will scan all of the `Keyword`
   // properties via reflection the class in runtime for constructing the SqlLexical object
-  protected val ABS = Keyword("ABS")
   protected val ALL = Keyword("ALL")
   protected val AND = Keyword("AND")
   protected val APPROXIMATE = Keyword("APPROXIMATE")
   protected val AS = Keyword("AS")
   protected val ASC = Keyword("ASC")
-  protected val AVG = Keyword("AVG")
   protected val BETWEEN = Keyword("BETWEEN")
   protected val BY = Keyword("BY")
   protected val CASE = Keyword("CASE")
   protected val CAST = Keyword("CAST")
-  protected val COALESCE = Keyword("COALESCE")
-  protected val COUNT = Keyword("COUNT")
   protected val DESC = Keyword("DESC")
   protected val DISTINCT = Keyword("DISTINCT")
   protected val ELSE = Keyword("ELSE")
   protected val END = Keyword("END")
   protected val EXCEPT = Keyword("EXCEPT")
   protected val FALSE = Keyword("FALSE")
-  protected val FIRST = Keyword("FIRST")
   protected val FROM = Keyword("FROM")
   protected val FULL = Keyword("FULL")
   protected val GROUP = Keyword("GROUP")
@@ -80,13 +76,9 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
   protected val INTO = Keyword("INTO")
   protected val IS = Keyword("IS")
   protected val JOIN = Keyword("JOIN")
-  protected val LAST = Keyword("LAST")
   protected val LEFT = Keyword("LEFT")
   protected val LIKE = Keyword("LIKE")
   protected val LIMIT = Keyword("LIMIT")
-  protected val LOWER = Keyword("LOWER")
-  protected val MAX = Keyword("MAX")
-  protected val MIN = Keyword("MIN")
   protected val NOT = Keyword("NOT")
   protected val NULL = Keyword("NULL")
   protected val ON = Keyword("ON")
@@ -100,15 +92,10 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
   protected val RLIKE = Keyword("RLIKE")
   protected val SELECT = Keyword("SELECT")
   protected val SEMI = Keyword("SEMI")
-  protected val SQRT = Keyword("SQRT")
-  protected val SUBSTR = Keyword("SUBSTR")
-  protected val SUBSTRING = Keyword("SUBSTRING")
-  protected val SUM = Keyword("SUM")
   protected val TABLE = Keyword("TABLE")
   protected val THEN = Keyword("THEN")
   protected val TRUE = Keyword("TRUE")
   protected val UNION = Keyword("UNION")
-  protected val UPPER = Keyword("UPPER")
   protected val WHEN = Keyword("WHEN")
   protected val WHERE = Keyword("WHERE")
   protected val WITH = Keyword("WITH")
@@ -277,25 +264,36 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
       )
 
   protected lazy val function: Parser[Expression] =
-    ( SUM   ~> "(" ~> expression             <~ ")" ^^ { case exp => Sum(exp) }
-    | SUM   ~> "(" ~> DISTINCT ~> expression <~ ")" ^^ { case exp => SumDistinct(exp) }
-    | COUNT ~  "(" ~> "*"                    <~ ")" ^^ { case _ => Count(Literal(1)) }
-    | COUNT ~  "(" ~> expression             <~ ")" ^^ { case exp => Count(exp) }
-    | COUNT ~> "(" ~> DISTINCT ~> repsep(expression, ",") <~ ")" ^^
-      { case exps => CountDistinct(exps) }
-    | APPROXIMATE ~ COUNT ~ "(" ~ DISTINCT ~> expression <~ ")" ^^
-      { case exp => ApproxCountDistinct(exp) }
-    | APPROXIMATE ~> "(" ~> floatLit ~ ")" ~ COUNT ~ "(" ~ DISTINCT ~ expression <~ ")" ^^
-      { case s ~ _ ~ _ ~ _ ~ _ ~ e => ApproxCountDistinct(e, s.toDouble) }
-    | FIRST ~ "(" ~> expression <~ ")" ^^ { case exp => First(exp) }
-    | LAST  ~ "(" ~> expression <~ ")" ^^ { case exp => Last(exp) }
-    | AVG   ~ "(" ~> expression <~ ")" ^^ { case exp => Average(exp) }
-    | MIN   ~ "(" ~> expression <~ ")" ^^ { case exp => Min(exp) }
-    | MAX   ~ "(" ~> expression <~ ")" ^^ { case exp => Max(exp) }
-    | UPPER ~ "(" ~> expression <~ ")" ^^ { case exp => Upper(exp) }
-    | LOWER ~ "(" ~> expression <~ ")" ^^ { case exp => Lower(exp) }
-    | IF ~ "(" ~> expression ~ ("," ~> expression) ~ ("," ~> expression) <~ ")" ^^
-      { case c ~ t ~ f => If(c, t, f) }
+    ( ident <~ ("(" ~ "*" ~ ")") ^^ { case udfName =>
+      if (lexical.normalizeKeyword(udfName) == "count") {
+        Count(Literal(1))
+      } else {
+        throw new AnalysisException(s"invalid expression $udfName(*)")
+      }
+    }
+    | ident ~ ("(" ~> repsep(expression, ",")) <~ ")" ^^
+      { case udfName ~ exprs => UnresolvedFunction(udfName, exprs) }
+    | ident ~ ("(" ~ DISTINCT ~> repsep(expression, ",")) <~ ")" ^^ { case udfName ~ exprs =>
+      lexical.normalizeKeyword(udfName) match {
+        case "sum" => SumDistinct(exprs.head)
+        case "count" => CountDistinct(exprs)
+      }
+    }
+    | APPROXIMATE ~> ident ~ ("(" ~ DISTINCT ~> expression <~ ")") ^^ { case udfName ~ exp =>
+      if (lexical.normalizeKeyword(udfName) == "count") {
+        ApproxCountDistinct(exp)
+      } else {
+        throw new AnalysisException(s"invalid function approximate $udfName")
+      }
+    }
+    | APPROXIMATE ~> "(" ~> floatLit ~ ")" ~ ident ~ "(" ~ DISTINCT ~ expression <~ ")" ^^
+      { case s ~ _ ~ udfName ~ _ ~ _ ~ exp =>
+        if (lexical.normalizeKeyword(udfName) == "count") {
+          ApproxCountDistinct(exp, s.toDouble)
+        } else {
+          throw new AnalysisException(s"invalid function approximate($floatLit) $udfName")
+        }
+      }
     | CASE ~> expression.? ~ rep1(WHEN ~> expression ~ (THEN ~> expression)) ~
         (ELSE ~> expression).? <~ END ^^ {
           case casePart ~ altPart ~ elsePart =>
@@ -304,16 +302,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
             } ++ elsePart
             casePart.map(CaseKeyWhen(_, branches)).getOrElse(CaseWhen(branches))
         }
-    | (SUBSTR | SUBSTRING) ~ "(" ~> expression ~ ("," ~> expression) <~ ")" ^^
-      { case s ~ p => Substring(s, p, Literal(Integer.MAX_VALUE)) }
-    | (SUBSTR | SUBSTRING) ~ "(" ~> expression ~ ("," ~> expression) ~ ("," ~> expression) <~ ")" ^^
-      { case s ~ p ~ l => Substring(s, p, l) }
-    | COALESCE ~ "(" ~> repsep(expression, ",") <~ ")" ^^ { case exprs => Coalesce(exprs) }
-    | SQRT  ~ "(" ~> expression <~ ")" ^^ { case exp => Sqrt(exp) }
-    | ABS   ~ "(" ~> expression <~ ")" ^^ { case exp => Abs(exp) }
-    | ident ~ ("(" ~> repsep(expression, ",")) <~ ")" ^^
-      { case udfName ~ exprs => UnresolvedFunction(udfName, exprs) }
-    )
+      )
 
   protected lazy val cast: Parser[Expression] =
     CAST ~ "(" ~> expression ~ (AS ~> dataType) <~ ")" ^^ {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 5883d938b676d..02b10c444d1a7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -461,7 +461,9 @@ class Analyzer(
       case q: LogicalPlan =>
         q transformExpressions {
           case u @ UnresolvedFunction(name, children) if u.childrenResolved =>
-            registry.lookupFunction(name, children)
+            withPosition(u) {
+              registry.lookupFunction(name, children)
+            }
         }
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 0849faa9bfa7b..406f6fad8413b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -17,24 +17,27 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
-import org.apache.spark.sql.catalyst.CatalystConf
-import org.apache.spark.sql.catalyst.expressions.Expression
-import scala.collection.mutable
+import scala.reflect.ClassTag
+import scala.util.{Failure, Success, Try}
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.util.StringKeyHashMap
+
 
 /** A catalog for looking up user defined functions, used by an [[Analyzer]]. */
 trait FunctionRegistry {
-  type FunctionBuilder = Seq[Expression] => Expression
 
   def registerFunction(name: String, builder: FunctionBuilder): Unit
 
+  @throws[AnalysisException]("If function does not exist")
   def lookupFunction(name: String, children: Seq[Expression]): Expression
-
-  def conf: CatalystConf
 }
 
 trait OverrideFunctionRegistry extends FunctionRegistry {
 
-  val functionBuilders = StringKeyHashMap[FunctionBuilder](conf.caseSensitiveAnalysis)
+  private val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive = false)
 
   override def registerFunction(name: String, builder: FunctionBuilder): Unit = {
     functionBuilders.put(name, builder)
@@ -45,16 +48,19 @@ trait OverrideFunctionRegistry extends FunctionRegistry {
   }
 }
 
-class SimpleFunctionRegistry(val conf: CatalystConf) extends FunctionRegistry {
+class SimpleFunctionRegistry extends FunctionRegistry {
 
-  val functionBuilders = StringKeyHashMap[FunctionBuilder](conf.caseSensitiveAnalysis)
+  private val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive = false)
 
   override def registerFunction(name: String, builder: FunctionBuilder): Unit = {
     functionBuilders.put(name, builder)
   }
 
   override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
-    functionBuilders(name)(children)
+    val func = functionBuilders.get(name).getOrElse {
+      throw new AnalysisException(s"undefined function $name")
+    }
+    func(children)
   }
 }
 
@@ -70,30 +76,89 @@ object EmptyFunctionRegistry extends FunctionRegistry {
   override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
     throw new UnsupportedOperationException
   }
-
-  override def conf: CatalystConf = throw new UnsupportedOperationException
 }
 
-/**
- * Build a map with String type of key, and it also supports either key case
- * sensitive or insensitive.
- * TODO move this into util folder?
- */
-object StringKeyHashMap {
-  def apply[T](caseSensitive: Boolean): StringKeyHashMap[T] = caseSensitive match {
-    case false => new StringKeyHashMap[T](_.toLowerCase)
-    case true => new StringKeyHashMap[T](identity)
-  }
-}
 
-class StringKeyHashMap[T](normalizer: (String) => String) {
-  private val base = new collection.mutable.HashMap[String, T]()
+object FunctionRegistry {
 
-  def apply(key: String): T = base(normalizer(key))
+  type FunctionBuilder = Seq[Expression] => Expression
 
-  def get(key: String): Option[T] = base.get(normalizer(key))
-  def put(key: String, value: T): Option[T] = base.put(normalizer(key), value)
-  def remove(key: String): Option[T] = base.remove(normalizer(key))
-  def iterator: Iterator[(String, T)] = base.toIterator
+  val expressions: Map[String, FunctionBuilder] = Map(
+    // Non aggregate functions
+    expression[Abs]("abs"),
+    expression[CreateArray]("array"),
+    expression[Coalesce]("coalesce"),
+    expression[Explode]("explode"),
+    expression[Lower]("lower"),
+    expression[Substring]("substr"),
+    expression[Substring]("substring"),
+    expression[Rand]("rand"),
+    expression[Randn]("randn"),
+    expression[CreateStruct]("struct"),
+    expression[Sqrt]("sqrt"),
+    expression[Upper]("upper"),
+
+    // Math functions
+    expression[Acos]("acos"),
+    expression[Asin]("asin"),
+    expression[Atan]("atan"),
+    expression[Atan2]("atan2"),
+    expression[Cbrt]("cbrt"),
+    expression[Ceil]("ceil"),
+    expression[Cos]("cos"),
+    expression[Exp]("exp"),
+    expression[Expm1]("expm1"),
+    expression[Floor]("floor"),
+    expression[Hypot]("hypot"),
+    expression[Log]("log"),
+    expression[Log10]("log10"),
+    expression[Log1p]("log1p"),
+    expression[Pow]("pow"),
+    expression[Rint]("rint"),
+    expression[Signum]("signum"),
+    expression[Sin]("sin"),
+    expression[Sinh]("sinh"),
+    expression[Tan]("tan"),
+    expression[Tanh]("tanh"),
+    expression[ToDegrees]("todegrees"),
+    expression[ToRadians]("toradians"),
+
+    // aggregate functions
+    expression[Average]("avg"),
+    expression[Count]("count"),
+    expression[First]("first"),
+    expression[Last]("last"),
+    expression[Max]("max"),
+    expression[Min]("min"),
+    expression[Sum]("sum")
+  )
+
+  /** See usage above. */
+  private def expression[T <: Expression](name: String)
+      (implicit tag: ClassTag[T]): (String, FunctionBuilder) = {
+    // Use the companion class to find apply methods.
+    val objectClass = Class.forName(tag.runtimeClass.getName + "$")
+    val companionObj = objectClass.getDeclaredField("MODULE$").get(null)
+
+    // See if we can find an apply that accepts Seq[Expression]
+    val varargApply = Try(objectClass.getDeclaredMethod("apply", classOf[Seq[_]])).toOption
+
+    val builder = (expressions: Seq[Expression]) => {
+      if (varargApply.isDefined) {
+        // If there is an apply method that accepts Seq[Expression], use that one.
+        varargApply.get.invoke(companionObj, expressions).asInstanceOf[Expression]
+      } else {
+        // Otherwise, find an apply method that matches the number of arguments, and use that.
+        val params = Seq.fill(expressions.size)(classOf[Expression])
+        val f = Try(objectClass.getDeclaredMethod("apply", params : _*)) match {
+          case Success(e) =>
+            e
+          case Failure(e) =>
+            throw new AnalysisException(s"Invalid number of arguments for function $name")
+        }
+        f.invoke(companionObj, expressions : _*).asInstanceOf[Expression]
+      }
+    }
+    (name, builder)
+  }
 }
-
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index a9a9c0cfb7027..f2ed1f0929987 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -23,6 +23,15 @@ import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types._
 
+
+/**
+ * For Catalyst to work correctly, concrete implementations of [[Expression]]s must be case classes
+ * whose constructor arguments are all Expressions types. In addition, if we want to support more
+ * than one constructor, define those constructors explicitly as apply methods in the companion
+ * object.
+ *
+ * See [[Substring]] for an example.
+ */
 abstract class Expression extends TreeNode[Expression] {
   self: Product =>
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index b2647124c4e49..6e4e9cb1be090 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.TaskContext
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.types.{DataType, DoubleType}
 import org.apache.spark.util.Utils
 import org.apache.spark.util.random.XORShiftRandom
@@ -46,11 +47,29 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
 }
 
 /** Generate a random column with i.i.d. uniformly distributed values in [0, 1). */
-case class Rand(seed: Long = Utils.random.nextLong()) extends RDG(seed) {
+case class Rand(seed: Long) extends RDG(seed) {
   override def eval(input: Row): Double = rng.nextDouble()
 }
 
+object Rand {
+  def apply(): Rand = apply(Utils.random.nextLong())
+
+  def apply(seed: Expression): Rand = apply(seed match {
+    case IntegerLiteral(s) => s
+    case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
+  })
+}
+
 /** Generate a random column with i.i.d. gaussian random distribution. */
-case class Randn(seed: Long = Utils.random.nextLong()) extends RDG(seed) {
+case class Randn(seed: Long) extends RDG(seed) {
   override def eval(input: Row): Double = rng.nextGaussian()
 }
+
+object Randn {
+  def apply(): Randn = apply(Utils.random.nextLong())
+
+  def apply(seed: Expression): Randn = apply(seed match {
+    case IntegerLiteral(s) => s
+    case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
+  })
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index aae122a981e47..856f56488c7a5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -227,6 +227,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
   override def foldable: Boolean = str.foldable && pos.foldable && len.foldable
 
   override  def nullable: Boolean = str.nullable || pos.nullable || len.nullable
+
   override def dataType: DataType = {
     if (!resolved) {
       throw new UnresolvedException(this, s"Cannot resolve since $children are not resolved")
@@ -287,3 +288,9 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
     case _ => s"SUBSTR($str, $pos, $len)"
   }
 }
+
+object Substring {
+  def apply(str: Expression, pos: Expression): Substring = {
+    apply(str, pos, Literal(Integer.MAX_VALUE))
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringKeyHashMap.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringKeyHashMap.scala
new file mode 100644
index 0000000000000..191d5e6399fc9
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringKeyHashMap.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+/**
+ * Build a map with String type of key, and it also supports either key case
+ * sensitive or insensitive.
+ */
+object StringKeyHashMap {
+  def apply[T](caseSensitive: Boolean): StringKeyHashMap[T] = caseSensitive match {
+    case false => new StringKeyHashMap[T](_.toLowerCase)
+    case true => new StringKeyHashMap[T](identity)
+  }
+}
+
+
+class StringKeyHashMap[T](normalizer: (String) => String) {
+  private val base = new collection.mutable.HashMap[String, T]()
+
+  def apply(key: String): T = base(normalizer(key))
+
+  def get(key: String): Option[T] = base.get(normalizer(key))
+
+  def put(key: String, value: T): Option[T] = base.put(normalizer(key), value)
+
+  def remove(key: String): Option[T] = base.remove(normalizer(key))
+
+  def iterator: Iterator[(String, T)] = base.toIterator
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index ddb54025baa24..8cad3885b7d46 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -120,7 +120,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   // TODO how to handle the temp function per user session?
   @transient
-  protected[sql] lazy val functionRegistry: FunctionRegistry = new SimpleFunctionRegistry(conf)
+  protected[sql] lazy val functionRegistry: FunctionRegistry = {
+    val fr = new SimpleFunctionRegistry
+    FunctionRegistry.expressions.foreach { case (name, func) => fr.registerFunction(name, func) }
+    fr
+  }
 
   @transient
   protected[sql] lazy val analyzer: Analyzer =
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index 064c040d2b771..703a34c47ec20 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -25,6 +25,48 @@ class UDFSuite extends QueryTest {
   private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
   import ctx.implicits._
 
+  test("built-in fixed arity expressions") {
+    val df = ctx.emptyDataFrame
+    df.selectExpr("rand()", "randn()", "rand(5)", "randn(50)")
+  }
+
+  test("built-in vararg expressions") {
+    val df = Seq((1, 2)).toDF("a", "b")
+    df.selectExpr("array(a, b)")
+    df.selectExpr("struct(a, b)")
+  }
+
+  test("built-in expressions with multiple constructors") {
+    val df = Seq(("abcd", 2)).toDF("a", "b")
+    df.selectExpr("substr(a, 2)", "substr(a, 2, 3)").collect()
+  }
+
+  test("count") {
+    val df = Seq(("abcd", 2)).toDF("a", "b")
+    df.selectExpr("count(a)")
+  }
+
+  test("count distinct") {
+    val df = Seq(("abcd", 2)).toDF("a", "b")
+    df.selectExpr("count(distinct a)")
+  }
+
+  test("error reporting for incorrect number of arguments") {
+    val df = ctx.emptyDataFrame
+    val e = intercept[AnalysisException] {
+      df.selectExpr("substr('abcd', 2, 3, 4)")
+    }
+    assert(e.getMessage.contains("arguments"))
+  }
+
+  test("error reporting for undefined functions") {
+    val df = ctx.emptyDataFrame
+    val e = intercept[AnalysisException] {
+      df.selectExpr("a_function_that_does_not_exist()")
+    }
+    assert(e.getMessage.contains("undefined function"))
+  }
+
   test("Simple UDF") {
     ctx.udf.register("strLenScala", (_: String).length)
     assert(ctx.sql("SELECT strLenScala('test')").head().getInt(0) === 4)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index b8f294c262af7..3b8cafb4a6c37 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -39,13 +39,12 @@ import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable}
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateSubQueries, OverrideCatalog, OverrideFunctionRegistry}
+import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, SetCommand}
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand}
 import org.apache.spark.sql.sources.DataSourceStrategy
-import org.apache.spark.sql.catalyst.CatalystConf
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
@@ -374,10 +373,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
 
   // Note that HiveUDFs will be overridden by functions registered in this context.
   @transient
-  override protected[sql] lazy val functionRegistry =
-    new HiveFunctionRegistry with OverrideFunctionRegistry {
-      override def conf: CatalystConf = currentSession().conf
-    }
+  override protected[sql] lazy val functionRegistry: FunctionRegistry =
+    new HiveFunctionRegistry with OverrideFunctionRegistry
 
   /* An analyzer that uses the Hive metastore. */
   @transient
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index 01f47352b2313..6e6ac987b668a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -17,11 +17,8 @@
 
 package org.apache.spark.sql.hive
 
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils.ConversionHelper
-import org.apache.spark.sql.AnalysisException
-
 import scala.collection.mutable.ArrayBuffer
+import scala.collection.JavaConversions._
 
 import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ConstantObjectInspector}
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions
@@ -30,8 +27,11 @@ import org.apache.hadoop.hive.ql.exec._
 import org.apache.hadoop.hive.ql.udf.{UDFType => HiveUDFType}
 import org.apache.hadoop.hive.ql.udf.generic._
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF._
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils.ConversionHelper
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.analysis
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
@@ -40,20 +40,18 @@ import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.hive.HiveShim._
 import org.apache.spark.sql.types._
 
-/* Implicit conversions */
-import scala.collection.JavaConversions._
 
 private[hive] abstract class HiveFunctionRegistry
   extends analysis.FunctionRegistry with HiveInspectors {
 
   def getFunctionInfo(name: String): FunctionInfo = FunctionRegistry.getFunctionInfo(name)
 
-  def lookupFunction(name: String, children: Seq[Expression]): Expression = {
+  override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
     // We only look it up to see if it exists, but do not include it in the HiveUDF since it is
     // not always serializable.
     val functionInfo: FunctionInfo =
       Option(FunctionRegistry.getFunctionInfo(name.toLowerCase)).getOrElse(
-        sys.error(s"Couldn't find function $name"))
+        throw new AnalysisException(s"undefined function $name"))
 
     val functionClassName = functionInfo.getFunctionClass.getName
 

From e6fb6cedf3ecbde6f01d4753d7d05d0c52827fce Mon Sep 17 00:00:00 2001
From: Kousuke Saruta <sarutak@oss.nttdata.co.jp>
Date: Tue, 9 Jun 2015 12:19:01 +0100
Subject: [PATCH 424/525] [STREAMING] [DOC] Remove duplicated description about
 WAL

I noticed there is a duplicated description about WAL.

```
To ensure zero-data loss, you have to additionally enable Write Ahead Logs in Spark Streaming. To ensure zero data loss, enable the Write Ahead Logs (introduced in Spark 1.2).
```

Let's remove the duplication.

I don't file this issue in JIRA because it's minor.

Author: Kousuke Saruta <sarutak@oss.nttdata.co.jp>

Closes #6719 from sarutak/remove-multiple-description and squashes the following commits:

cc9bb21 [Kousuke Saruta] Removed duplicated description about WAL
---
 docs/streaming-kafka-integration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/streaming-kafka-integration.md b/docs/streaming-kafka-integration.md
index d6d5605948a5a..998c8c994e4b4 100644
--- a/docs/streaming-kafka-integration.md
+++ b/docs/streaming-kafka-integration.md
@@ -7,7 +7,7 @@ title: Spark Streaming + Kafka Integration Guide
 ## Approach 1: Receiver-based Approach
 This approach uses a Receiver to receive the data. The Received is implemented using the Kafka high-level consumer API. As with all receivers, the data received from Kafka through a Receiver is stored in Spark executors, and then jobs launched by Spark Streaming processes the data. 
 
-However, under default configuration, this approach can lose data under failures (see [receiver reliability](streaming-programming-guide.html#receiver-reliability). To ensure zero-data loss, you have to additionally enable Write Ahead Logs in Spark Streaming. To ensure zero data loss, enable the Write Ahead Logs (introduced in Spark 1.2). This synchronously saves all the received Kafka data into write ahead logs on a distributed file system (e.g HDFS), so that all the data can be recovered on failure. See [Deploying section](streaming-programming-guide.html#deploying-applications) in the streaming programming guide for more details on Write Ahead Logs.
+However, under default configuration, this approach can lose data under failures (see [receiver reliability](streaming-programming-guide.html#receiver-reliability). To ensure zero-data loss, you have to additionally enable Write Ahead Logs in Spark Streaming (introduced in Spark 1.2). This synchronously saves all the received Kafka data into write ahead logs on a distributed file system (e.g HDFS), so that all the data can be recovered on failure. See [Deploying section](streaming-programming-guide.html#deploying-applications) in the streaming programming guide for more details on Write Ahead Logs.
 
 Next, we discuss how to use this approach in your streaming application.
 

From 6c1723abeb4e0580efec05a655343f46521fc265 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 9 Jun 2015 15:00:35 +0100
Subject: [PATCH 425/525] [SPARK-8140] [MLLIB] Remove construct to get weights
 in StreamingLinearAlgorithm

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6720 from MechCoder/empty_model_check and squashes the following commits:

3a07de5 [MechCoder] Remove construct to get weights in StreamingLinearAlgorithm
---
 .../spark/mllib/regression/StreamingLinearAlgorithm.scala  | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
index 39308e5ae1dde..aee51bf22d8d0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
@@ -83,12 +83,7 @@ abstract class StreamingLinearAlgorithm[
       throw new IllegalArgumentException("Model must be initialized before starting training.")
     }
     data.foreachRDD { (rdd, time) =>
-      val initialWeights =
-        model match {
-          case Some(m) =>
-            m.weights
-        }
-      model = Some(algorithm.run(rdd, initialWeights))
+      model = Some(algorithm.run(rdd, model.get.weights))
       logInfo("Model updated at time %s".format(time.toString))
       val display = model.get.weights.size match {
         case x if x > 100 => model.get.weights.toArray.take(100).mkString("[", ",", "...")

From 490d5a72ec1e5105f030fd7110acf62534e05f5a Mon Sep 17 00:00:00 2001
From: FavioVazquez <favio.vazquezp@gmail.com>
Date: Tue, 9 Jun 2015 15:02:18 +0100
Subject: [PATCH 426/525] [SPARK-8274] [DOCUMENTATION-MLLIB] Fix wrong URLs in
 MLlib Frequent Pattern Mining Documentation

There is a mistake in the URLs of the Scala section of FP-Growth in the MLlib Frequent Pattern Mining documentation. The URL points to https://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/fpm/FPGrowth.html which is the Java's API, the link should point to the Scala API https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowth

There's another mistake in the FP-GrowthModel in the same section, the link points, again, to the Java's API https://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/fpm/FPGrowthModel.html, the link should point to the Scala API https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowthModel

Author: FavioVazquez <favio.vazquezp@gmail.com>

Closes #6722 from FavioVazquez/fix-wrog-urls-mllib-fpgrowth and squashes the following commits:

e1ca54d [FavioVazquez] - Fixed wrong URLs in MLlib Frequent Pattern Mining, FP-Growth Scala section
ad882a3 [FavioVazquez] Merge remote-tracking branch 'upstream/master'
f27a20b [FavioVazquez] Merge remote-tracking branch 'upstream/master'
9af7074 [FavioVazquez] Merge remote-tracking branch 'upstream/master'
edab1ef [FavioVazquez] Merge remote-tracking branch 'upstream/master'
b2e2f8c [FavioVazquez] Merge remote-tracking branch 'upstream/master'
---
 docs/mllib-frequent-pattern-mining.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md
index 9fd9be0dd01b1..bcc066a185526 100644
--- a/docs/mllib-frequent-pattern-mining.md
+++ b/docs/mllib-frequent-pattern-mining.md
@@ -39,11 +39,11 @@ MLlib's FP-growth implementation takes the following (hyper-)parameters:
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
 
-[`FPGrowth`](api/java/org/apache/spark/mllib/fpm/FPGrowth.html) implements the
+[`FPGrowth`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowth) implements the
 FP-growth algorithm.
 It take a `JavaRDD` of transactions, where each transaction is an `Iterable` of items of a generic type.
 Calling `FPGrowth.run` with transactions returns an
-[`FPGrowthModel`](api/java/org/apache/spark/mllib/fpm/FPGrowthModel.html)
+[`FPGrowthModel`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowthModel)
 that stores the frequent itemsets with their frequencies.
 
 {% highlight scala %}

From 0d5892dc723d203e7d892d3beacbaa97aedb1a24 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Tue, 9 Jun 2015 15:44:02 -0700
Subject: [PATCH 427/525] [MINOR] [UI] DAG visualization: trim whitespace from
 input

Just as a safeguard against DOM rewriting.

Author: Andrew Or <andrew@databricks.com>

Closes #6732 from andrewor14/dag-viz-trim and squashes the following commits:

7e9bacb [Andrew Or] [MINOR] [UI] DAG visualization: trim whitespace from input
---
 .../main/resources/org/apache/spark/ui/static/spark-dag-viz.js  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
index e96af8768daa0..7a0dec2a3eaec 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
@@ -235,7 +235,7 @@ function renderDagVizForJob(svgContainer) {
     // them separately later. Note that we cannot draw them now because we need to
     // put these edges in a separate container that is on top of all stage graphs.
     metadata.selectAll(".incoming-edge").each(function(v) {
-      var edge = d3.select(this).text().split(","); // e.g. 3,4 => [3, 4]
+      var edge = d3.select(this).text().trim().split(","); // e.g. 3,4 => [3, 4]
       crossStageEdges.push(edge);
     });
   });

From 6e4fb0c9e8f03cf068c422777cfce82a89e8e738 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Tue, 9 Jun 2015 16:14:21 -0700
Subject: [PATCH 428/525] [SPARK-6511] [DOCUMENTATION] Explain how to use
 Hadoop provided builds

This provides preliminary documentation pointing out how to use the
Hadoop free builds. I am hoping over time this list can grow to
include most of the popular Hadoop distributions.

Getting more people using these builds will help us long term reduce
the number of binaries we build.

Author: Patrick Wendell <patrick@databricks.com>

Closes #6729 from pwendell/hadoop-provided and squashes the following commits:

1113b76 [Patrick Wendell] [SPARK-6511] [Documentation] Explain how to use Hadoop provided builds
---
 docs/hadoop-provided.md | 26 ++++++++++++++++++++++++++
 docs/index.md           | 10 +++++++---
 2 files changed, 33 insertions(+), 3 deletions(-)
 create mode 100644 docs/hadoop-provided.md

diff --git a/docs/hadoop-provided.md b/docs/hadoop-provided.md
new file mode 100644
index 0000000000000..0ba5a58051abc
--- /dev/null
+++ b/docs/hadoop-provided.md
@@ -0,0 +1,26 @@
+---
+layout: global
+displayTitle: Using Spark's "Hadoop Free" Build
+title: Using Spark's "Hadoop Free" Build
+---
+
+Spark uses Hadoop client libraries for HDFS and YARN. Starting in version Spark 1.4, the project packages "Hadoop free" builds that lets you more easily connect a single Spark binary to any Hadoop version. To use these builds, you need to modify `SPARK_DIST_CLASSPATH` to include Hadoop's package jars. The most convenient place to do this is by adding an entry in `conf/spark-env.sh`.
+
+This page describes how to connect Spark to Hadoop for different types of distributions.
+
+# Apache Hadoop
+For Apache distributions, you can use Hadoop's 'classpath' command. For instance:
+
+{% highlight bash %}
+### in conf/spark-env.sh ###
+
+# If 'hadoop' binary is on your PATH
+export SPARK_DIST_CLASSPATH=$(hadoop classpath)
+
+# With explicit path to 'hadoop' binary
+export SPARK_DIST_CLASSPATH=$(/path/to/hadoop/bin/hadoop classpath)
+
+# Passing a Hadoop configuration directory
+export SPARK_DIST_CLASSPATH=$(hadoop classpath --config /path/to/configs)
+
+{% endhighlight %}
diff --git a/docs/index.md b/docs/index.md
index 7939657915fc9..d85cf12defefd 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -12,9 +12,13 @@ It also supports a rich set of higher-level tools including [Spark SQL](sql-prog
 
 # Downloading
 
-Get Spark from the [downloads page](http://spark.apache.org/downloads.html) of the project website. This documentation is for Spark version {{site.SPARK_VERSION}}. The downloads page 
-contains Spark packages for many popular HDFS versions. If you'd like to build Spark from 
-scratch, visit [Building Spark](building-spark.html).
+Get Spark from the [downloads page](http://spark.apache.org/downloads.html) of the project website. This documentation is for Spark version {{site.SPARK_VERSION}}. Spark uses Hadoop's client libraries for HDFS and YARN. Downloads are pre-packaged for a handful of popular Hadoop versions.
+Users can also download a "Hadoop free" binary and run Spark with any Hadoop version
+[by augmenting Spark's classpath](hadoop-provided.html). 
+
+If you'd like to build Spark from 
+source, visit [Building Spark](building-spark.html).
+
 
 Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS). It's easy to run
 locally on one machine --- all you need is to have `java` installed on your system `PATH`,

From 778f3ca81f8d90faec0775509632fe68f1399dc4 Mon Sep 17 00:00:00 2001
From: "navis.ryu" <navis@apache.org>
Date: Tue, 9 Jun 2015 19:33:00 -0700
Subject: [PATCH 429/525] [SPARK-7792] [SQL] HiveContext registerTempTable not
 thread safe

Just replaced mutable.HashMap to ConcurrentHashMap

Author: navis.ryu <navis@apache.org>

Closes #6699 from navis/SPARK-7792 and squashes the following commits:

f03654a [navis.ryu] [SPARK-7792] [SQL] HiveContext registerTempTable not thread safe
---
 .../spark/sql/catalyst/analysis/Catalog.scala | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
index 3e240fd55e250..1541491608b24 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
@@ -17,7 +17,11 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import java.util.concurrent.ConcurrentHashMap
+
+import scala.collection.JavaConversions._
 import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.sql.catalyst.CatalystConf
 import org.apache.spark.sql.catalyst.EmptyConf
@@ -81,18 +85,18 @@ trait Catalog {
 }
 
 class SimpleCatalog(val conf: CatalystConf) extends Catalog {
-  val tables = new mutable.HashMap[String, LogicalPlan]()
+  val tables = new ConcurrentHashMap[String, LogicalPlan]
 
   override def registerTable(
       tableIdentifier: Seq[String],
       plan: LogicalPlan): Unit = {
     val tableIdent = processTableIdentifier(tableIdentifier)
-    tables += ((getDbTableName(tableIdent), plan))
+    tables.put(getDbTableName(tableIdent), plan)
   }
 
   override def unregisterTable(tableIdentifier: Seq[String]): Unit = {
     val tableIdent = processTableIdentifier(tableIdentifier)
-    tables -= getDbTableName(tableIdent)
+    tables.remove(getDbTableName(tableIdent))
   }
 
   override def unregisterAllTables(): Unit = {
@@ -101,10 +105,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog {
 
   override def tableExists(tableIdentifier: Seq[String]): Boolean = {
     val tableIdent = processTableIdentifier(tableIdentifier)
-    tables.get(getDbTableName(tableIdent)) match {
-      case Some(_) => true
-      case None => false
-    }
+    tables.containsKey(getDbTableName(tableIdent))
   }
 
   override def lookupRelation(
@@ -112,7 +113,10 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog {
       alias: Option[String] = None): LogicalPlan = {
     val tableIdent = processTableIdentifier(tableIdentifier)
     val tableFullName = getDbTableName(tableIdent)
-    val table = tables.getOrElse(tableFullName, sys.error(s"Table Not Found: $tableFullName"))
+    val table = tables.get(tableFullName)
+    if (table == null) {
+      sys.error(s"Table Not Found: $tableFullName")
+    }
     val tableWithQualifiers = Subquery(tableIdent.last, table)
 
     // If an alias was specified by the lookup, wrap the plan in a subquery so that attributes are
@@ -121,9 +125,11 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog {
   }
 
   override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = {
-    tables.map {
-      case (name, _) => (name, true)
-    }.toSeq
+    val result = ArrayBuffer.empty[(String, Boolean)]
+    for (name <- tables.keySet()) {
+      result += ((name, true))
+    }
+    result
   }
 
   override def refreshTable(databaseName: String, tableName: String): Unit = {

From 57c60c5be7aa731ca1a6966f4285eb02f481eb71 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 10 Jun 2015 00:36:16 -0700
Subject: [PATCH 430/525] [SPARK-7886] Use FunctionRegistry for built-in
 expressions in HiveContext.

This builds on #6710 and also uses FunctionRegistry for function lookup in HiveContext.

Author: Reynold Xin <rxin@databricks.com>

Closes #6712 from rxin/udf-registry-hive and squashes the following commits:

f4c2df0 [Reynold Xin] Fixed style violation.
0bd4127 [Reynold Xin] Fixed Python UDFs.
f9a0378 [Reynold Xin] Disable one more test.
5609494 [Reynold Xin] Disable some failing tests.
4efea20 [Reynold Xin] Don't check children resolved for UDF resolution.
2ebe549 [Reynold Xin] Removed more hardcoded functions.
aadce78 [Reynold Xin] [SPARK-7886] Use FunctionRegistry for built-in expressions in HiveContext.
---
 .../apache/spark/sql/catalyst/SqlParser.scala |  2 +-
 .../sql/catalyst/analysis/Analyzer.scala      |  9 +--
 .../catalyst/analysis/FunctionRegistry.scala  | 12 +++-
 .../sql/catalyst/expressions/Expression.scala |  8 +--
 .../org/apache/spark/sql/SQLContext.scala     |  7 +-
 .../spark/sql/execution/pythonUdfs.scala      | 66 ++++++++++---------
 .../execution/HiveCompatibilitySuite.scala    | 10 +--
 .../apache/spark/sql/hive/HiveContext.scala   |  2 +-
 .../org/apache/spark/sql/hive/HiveQl.scala    | 30 ---------
 .../org/apache/spark/sql/hive/hiveUdfs.scala  | 51 +++++++-------
 10 files changed, 92 insertions(+), 105 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index f74c17d583359..da3a717f90058 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -68,7 +68,6 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
   protected val FULL = Keyword("FULL")
   protected val GROUP = Keyword("GROUP")
   protected val HAVING = Keyword("HAVING")
-  protected val IF = Keyword("IF")
   protected val IN = Keyword("IN")
   protected val INNER = Keyword("INNER")
   protected val INSERT = Keyword("INSERT")
@@ -277,6 +276,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
       lexical.normalizeKeyword(udfName) match {
         case "sum" => SumDistinct(exprs.head)
         case "count" => CountDistinct(exprs)
+        case _ => throw new AnalysisException(s"function $udfName does not support DISTINCT")
       }
     }
     | APPROXIMATE ~> ident ~ ("(" ~ DISTINCT ~> expression <~ ")") ^^ { case udfName ~ exp =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 02b10c444d1a7..c4f12cfe87993 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -460,7 +460,7 @@ class Analyzer(
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
       case q: LogicalPlan =>
         q transformExpressions {
-          case u @ UnresolvedFunction(name, children) if u.childrenResolved =>
+          case u @ UnresolvedFunction(name, children) =>
             withPosition(u) {
               registry.lookupFunction(name, children)
             }
@@ -494,20 +494,21 @@ class Analyzer(
   object UnresolvedHavingClauseAttributes extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
       case filter @ Filter(havingCondition, aggregate @ Aggregate(_, originalAggExprs, _))
-          if aggregate.resolved && containsAggregate(havingCondition) => {
+          if aggregate.resolved && containsAggregate(havingCondition) =>
+
         val evaluatedCondition = Alias(havingCondition, "havingCondition")()
         val aggExprsWithHaving = evaluatedCondition +: originalAggExprs
 
         Project(aggregate.output,
           Filter(evaluatedCondition.toAttribute,
             aggregate.copy(aggregateExpressions = aggExprsWithHaving)))
-      }
     }
 
-    protected def containsAggregate(condition: Expression): Boolean =
+    protected def containsAggregate(condition: Expression): Boolean = {
       condition
         .collect { case ae: AggregateExpression => ae }
         .nonEmpty
+    }
   }
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 406f6fad8413b..936ffc7d5ff55 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -35,7 +35,7 @@ trait FunctionRegistry {
   def lookupFunction(name: String, children: Seq[Expression]): Expression
 }
 
-trait OverrideFunctionRegistry extends FunctionRegistry {
+class OverrideFunctionRegistry(underlying: FunctionRegistry) extends FunctionRegistry {
 
   private val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive = false)
 
@@ -43,8 +43,8 @@ trait OverrideFunctionRegistry extends FunctionRegistry {
     functionBuilders.put(name, builder)
   }
 
-  abstract override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
-    functionBuilders.get(name).map(_(children)).getOrElse(super.lookupFunction(name, children))
+  override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
+    functionBuilders.get(name).map(_(children)).getOrElse(underlying.lookupFunction(name, children))
   }
 }
 
@@ -133,6 +133,12 @@ object FunctionRegistry {
     expression[Sum]("sum")
   )
 
+  val builtin: FunctionRegistry = {
+    val fr = new SimpleFunctionRegistry
+    expressions.foreach { case (name, builder) => fr.registerFunction(name, builder) }
+    fr
+  }
+
   /** See usage above. */
   private def expression[T <: Expression](name: String)
       (implicit tag: ClassTag[T]): (String, FunctionBuilder) = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index f2ed1f0929987..a05794f1dbd86 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -25,10 +25,10 @@ import org.apache.spark.sql.types._
 
 
 /**
- * For Catalyst to work correctly, concrete implementations of [[Expression]]s must be case classes
- * whose constructor arguments are all Expressions types. In addition, if we want to support more
- * than one constructor, define those constructors explicitly as apply methods in the companion
- * object.
+ * If an expression wants to be exposed in the function registry (so users can call it with
+ * "name(arguments...)", the concrete implementation must be a case class whose constructor
+ * arguments are all Expressions types. In addition, if it needs to support more than one
+ * constructor, define those constructors explicitly as apply methods in the companion object.
  *
  * See [[Substring]] for an example.
  */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 8cad3885b7d46..5f758adf3dfc6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -120,11 +120,8 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   // TODO how to handle the temp function per user session?
   @transient
-  protected[sql] lazy val functionRegistry: FunctionRegistry = {
-    val fr = new SimpleFunctionRegistry
-    FunctionRegistry.expressions.foreach { case (name, func) => fr.registerFunction(name, func) }
-    fr
-  }
+  protected[sql] lazy val functionRegistry: FunctionRegistry =
+    new OverrideFunctionRegistry(FunctionRegistry.builtin)
 
   @transient
   protected[sql] lazy val analyzer: Analyzer =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
index 55f3ff4709013..342587904789a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
@@ -57,7 +57,7 @@ private[spark] case class PythonUDF(
   def nullable: Boolean = true
 
   override def eval(input: Row): Any = {
-    sys.error("PythonUDFs can not be directly evaluated.")
+    throw new UnsupportedOperationException("PythonUDFs can not be directly evaluated.")
   }
 }
 
@@ -71,43 +71,49 @@ private[spark] case class PythonUDF(
 private[spark] object ExtractPythonUdfs extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     // Skip EvaluatePython nodes.
-    case p: EvaluatePython => p
+    case plan: EvaluatePython => plan
 
-    case l: LogicalPlan =>
+    case plan: LogicalPlan =>
       // Extract any PythonUDFs from the current operator.
-      val udfs = l.expressions.flatMap(_.collect { case udf: PythonUDF => udf})
+      val udfs = plan.expressions.flatMap(_.collect { case udf: PythonUDF => udf })
       if (udfs.isEmpty) {
         // If there aren't any, we are done.
-        l
+        plan
       } else {
         // Pick the UDF we are going to evaluate (TODO: Support evaluating multiple UDFs at a time)
         // If there is more than one, we will add another evaluation operator in a subsequent pass.
-        val udf = udfs.head
-
-        var evaluation: EvaluatePython = null
-
-        // Rewrite the child that has the input required for the UDF
-        val newChildren = l.children.map { child =>
-          // Check to make sure that the UDF can be evaluated with only the input of this child.
-          // Other cases are disallowed as they are ambiguous or would require a cartisian product.
-          if (udf.references.subsetOf(child.outputSet)) {
-            evaluation = EvaluatePython(udf, child)
-            evaluation
-          } else if (udf.references.intersect(child.outputSet).nonEmpty) {
-            sys.error(s"Invalid PythonUDF $udf, requires attributes from more than one child.")
-          } else {
-            child
-          }
+        udfs.find(_.resolved) match {
+          case Some(udf) =>
+            var evaluation: EvaluatePython = null
+
+            // Rewrite the child that has the input required for the UDF
+            val newChildren = plan.children.map { child =>
+              // Check to make sure that the UDF can be evaluated with only the input of this child.
+              // Other cases are disallowed as they are ambiguous or would require a cartesian
+              // product.
+              if (udf.references.subsetOf(child.outputSet)) {
+                evaluation = EvaluatePython(udf, child)
+                evaluation
+              } else if (udf.references.intersect(child.outputSet).nonEmpty) {
+                sys.error(s"Invalid PythonUDF $udf, requires attributes from more than one child.")
+              } else {
+                child
+              }
+            }
+
+            assert(evaluation != null, "Unable to evaluate PythonUDF.  Missing input attributes.")
+
+            // Trim away the new UDF value if it was only used for filtering or something.
+            logical.Project(
+              plan.output,
+              plan.transformExpressions {
+                case p: PythonUDF if p.fastEquals(udf) => evaluation.resultAttribute
+              }.withNewChildren(newChildren))
+
+          case None =>
+            // If there is no Python UDF that is resolved, skip this round.
+            plan
         }
-
-        assert(evaluation != null, "Unable to evaluate PythonUDF.  Missing input attributes.")
-
-        // Trim away the new UDF value if it was only used for filtering or something.
-        logical.Project(
-          l.output,
-          l.transformExpressions {
-            case p: PythonUDF if p.fastEquals(udf) => evaluation.resultAttribute
-          }.withNewChildren(newChildren))
       }
   }
 }
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 048f78b4daa8d..0693c7ea5b332 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -817,19 +817,19 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf2",
     "udf5",
     "udf6",
-    "udf7",
+    // "udf7",  turn this on after we figure out null vs nan vs infinity
     "udf8",
     "udf9",
     "udf_10_trims",
     "udf_E",
     "udf_PI",
     "udf_abs",
-    "udf_acos",
+    // "udf_acos",  turn this on after we figure out null vs nan vs infinity
     "udf_add",
     "udf_array",
     "udf_array_contains",
     "udf_ascii",
-    "udf_asin",
+    // "udf_asin",  turn this on after we figure out null vs nan vs infinity
     "udf_atan",
     "udf_avg",
     "udf_bigint",
@@ -917,7 +917,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_repeat",
     "udf_rlike",
     "udf_round",
-    "udf_round_3",
+    //  "udf_round_3",  TODO: FIX THIS failed due to cast exception
     "udf_rpad",
     "udf_rtrim",
     "udf_second",
@@ -931,7 +931,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_stddev_pop",
     "udf_stddev_samp",
     "udf_string",
-    "udf_struct",
+    // "udf_struct",  TODO: FIX THIS and enable it.
     "udf_substring",
     "udf_subtract",
     "udf_sum",
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 3b8cafb4a6c37..3b75b0b04102d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -374,7 +374,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
   // Note that HiveUDFs will be overridden by functions registered in this context.
   @transient
   override protected[sql] lazy val functionRegistry: FunctionRegistry =
-    new HiveFunctionRegistry with OverrideFunctionRegistry
+    new OverrideFunctionRegistry(new HiveFunctionRegistry(FunctionRegistry.builtin))
 
   /* An analyzer that uses the Hive metastore. */
   @transient
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 9544d12c9053c..041483ebfb8d9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -1307,16 +1307,9 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
     HiveParser.DecimalLiteral)
 
   /* Case insensitive matches */
-  val ARRAY = "(?i)ARRAY".r
   val COALESCE = "(?i)COALESCE".r
   val COUNT = "(?i)COUNT".r
-  val AVG = "(?i)AVG".r
   val SUM = "(?i)SUM".r
-  val MAX = "(?i)MAX".r
-  val MIN = "(?i)MIN".r
-  val UPPER = "(?i)UPPER".r
-  val LOWER = "(?i)LOWER".r
-  val RAND = "(?i)RAND".r
   val AND = "(?i)AND".r
   val OR = "(?i)OR".r
   val NOT = "(?i)NOT".r
@@ -1330,8 +1323,6 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
   val BETWEEN = "(?i)BETWEEN".r
   val WHEN = "(?i)WHEN".r
   val CASE = "(?i)CASE".r
-  val SUBSTR = "(?i)SUBSTR(?:ING)?".r
-  val SQRT = "(?i)SQRT".r
 
   protected def nodeToExpr(node: Node): Expression = node match {
     /* Attribute References */
@@ -1353,18 +1344,9 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
       UnresolvedStar(Some(name))
 
     /* Aggregate Functions */
-    case Token("TOK_FUNCTION", Token(AVG(), Nil) :: arg :: Nil) => Average(nodeToExpr(arg))
-    case Token("TOK_FUNCTION", Token(COUNT(), Nil) :: arg :: Nil) => Count(nodeToExpr(arg))
     case Token("TOK_FUNCTIONSTAR", Token(COUNT(), Nil) :: Nil) => Count(Literal(1))
     case Token("TOK_FUNCTIONDI", Token(COUNT(), Nil) :: args) => CountDistinct(args.map(nodeToExpr))
-    case Token("TOK_FUNCTION", Token(SUM(), Nil) :: arg :: Nil) => Sum(nodeToExpr(arg))
     case Token("TOK_FUNCTIONDI", Token(SUM(), Nil) :: arg :: Nil) => SumDistinct(nodeToExpr(arg))
-    case Token("TOK_FUNCTION", Token(MAX(), Nil) :: arg :: Nil) => Max(nodeToExpr(arg))
-    case Token("TOK_FUNCTION", Token(MIN(), Nil) :: arg :: Nil) => Min(nodeToExpr(arg))
-
-    /* System functions about string operations */
-    case Token("TOK_FUNCTION", Token(UPPER(), Nil) :: arg :: Nil) => Upper(nodeToExpr(arg))
-    case Token("TOK_FUNCTION", Token(LOWER(), Nil) :: arg :: Nil) => Lower(nodeToExpr(arg))
 
     /* Casts */
     case Token("TOK_FUNCTION", Token("TOK_STRING", Nil) :: arg :: Nil) =>
@@ -1414,7 +1396,6 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
     case Token("&", left :: right:: Nil) => BitwiseAnd(nodeToExpr(left), nodeToExpr(right))
     case Token("|", left :: right:: Nil) => BitwiseOr(nodeToExpr(left), nodeToExpr(right))
     case Token("^", left :: right:: Nil) => BitwiseXor(nodeToExpr(left), nodeToExpr(right))
-    case Token("TOK_FUNCTION", Token(SQRT(), Nil) :: arg :: Nil) => Sqrt(nodeToExpr(arg))
 
     /* Comparisons */
     case Token("=", left :: right:: Nil) => EqualTo(nodeToExpr(left), nodeToExpr(right))
@@ -1469,17 +1450,6 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
     case Token("[", child :: ordinal :: Nil) =>
       UnresolvedExtractValue(nodeToExpr(child), nodeToExpr(ordinal))
 
-    /* Other functions */
-    case Token("TOK_FUNCTION", Token(ARRAY(), Nil) :: children) =>
-      CreateArray(children.map(nodeToExpr))
-    case Token("TOK_FUNCTION", Token(RAND(), Nil) :: Nil) => Rand()
-    case Token("TOK_FUNCTION", Token(RAND(), Nil) :: seed :: Nil) => Rand(seed.toString.toLong)
-    case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: Nil) =>
-      Substring(nodeToExpr(string), nodeToExpr(pos), Literal.create(Integer.MAX_VALUE, IntegerType))
-    case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) =>
-      Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length))
-    case Token("TOK_FUNCTION", Token(COALESCE(), Nil) :: list) => Coalesce(list.map(nodeToExpr))
-
     /* Window Functions */
     case Token("TOK_FUNCTION", Token(name, Nil) +: args :+ Token("TOK_WINDOWSPEC", spec)) =>
       val function = UnresolvedWindowFunction(name, args.map(nodeToExpr))
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index 6e6ac987b668a..a46ee9da9039c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.hive
 
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.JavaConversions._
+import scala.util.Try
 
 import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ConstantObjectInspector}
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions
@@ -33,6 +34,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils.ConversionHelper
 import org.apache.spark.Logging
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.analysis
+import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
@@ -41,35 +43,40 @@ import org.apache.spark.sql.hive.HiveShim._
 import org.apache.spark.sql.types._
 
 
-private[hive] abstract class HiveFunctionRegistry
+private[hive] class HiveFunctionRegistry(underlying: analysis.FunctionRegistry)
   extends analysis.FunctionRegistry with HiveInspectors {
 
   def getFunctionInfo(name: String): FunctionInfo = FunctionRegistry.getFunctionInfo(name)
 
   override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
-    // We only look it up to see if it exists, but do not include it in the HiveUDF since it is
-    // not always serializable.
-    val functionInfo: FunctionInfo =
-      Option(FunctionRegistry.getFunctionInfo(name.toLowerCase)).getOrElse(
-        throw new AnalysisException(s"undefined function $name"))
-
-    val functionClassName = functionInfo.getFunctionClass.getName
-
-    if (classOf[UDF].isAssignableFrom(functionInfo.getFunctionClass)) {
-      HiveSimpleUdf(new HiveFunctionWrapper(functionClassName), children)
-    } else if (classOf[GenericUDF].isAssignableFrom(functionInfo.getFunctionClass)) {
-      HiveGenericUdf(new HiveFunctionWrapper(functionClassName), children)
-    } else if (
-         classOf[AbstractGenericUDAFResolver].isAssignableFrom(functionInfo.getFunctionClass)) {
-      HiveGenericUdaf(new HiveFunctionWrapper(functionClassName), children)
-    } else if (classOf[UDAF].isAssignableFrom(functionInfo.getFunctionClass)) {
-      HiveUdaf(new HiveFunctionWrapper(functionClassName), children)
-    } else if (classOf[GenericUDTF].isAssignableFrom(functionInfo.getFunctionClass)) {
-      HiveGenericUdtf(new HiveFunctionWrapper(functionClassName), children)
-    } else {
-      sys.error(s"No handler for udf ${functionInfo.getFunctionClass}")
+    Try(underlying.lookupFunction(name, children)).getOrElse {
+      // We only look it up to see if it exists, but do not include it in the HiveUDF since it is
+      // not always serializable.
+      val functionInfo: FunctionInfo =
+        Option(FunctionRegistry.getFunctionInfo(name.toLowerCase)).getOrElse(
+          throw new AnalysisException(s"undefined function $name"))
+
+      val functionClassName = functionInfo.getFunctionClass.getName
+
+      if (classOf[UDF].isAssignableFrom(functionInfo.getFunctionClass)) {
+        HiveSimpleUdf(new HiveFunctionWrapper(functionClassName), children)
+      } else if (classOf[GenericUDF].isAssignableFrom(functionInfo.getFunctionClass)) {
+        HiveGenericUdf(new HiveFunctionWrapper(functionClassName), children)
+      } else if (
+        classOf[AbstractGenericUDAFResolver].isAssignableFrom(functionInfo.getFunctionClass)) {
+        HiveGenericUdaf(new HiveFunctionWrapper(functionClassName), children)
+      } else if (classOf[UDAF].isAssignableFrom(functionInfo.getFunctionClass)) {
+        HiveUdaf(new HiveFunctionWrapper(functionClassName), children)
+      } else if (classOf[GenericUDTF].isAssignableFrom(functionInfo.getFunctionClass)) {
+        HiveGenericUdtf(new HiveFunctionWrapper(functionClassName), children)
+      } else {
+        sys.error(s"No handler for udf ${functionInfo.getFunctionClass}")
+      }
     }
   }
+
+  override def registerFunction(name: String, builder: FunctionBuilder): Unit =
+    throw new UnsupportedOperationException
 }
 
 private[hive] case class HiveSimpleUdf(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])

From e90035e676e492de840f44b61b330db526313019 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 10 Jun 2015 18:58:01 +0800
Subject: [PATCH 431/525] [SPARK-7886] Added unit test for HAVING aggregate
 pushdown.

This is a followup to #6712.

Author: Reynold Xin <rxin@databricks.com>

Closes #6739 from rxin/6712-followup and squashes the following commits:

fd9acfb [Reynold Xin] [SPARK-7886] Added unit test for HAVING aggregate pushdown.
---
 .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala    | 7 +++++++
 .../src/main/scala/org/apache/spark/sql/hive/HiveQl.scala  | 1 -
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 5babc4332cc77..3ca5ff347dd0c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -38,6 +38,13 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   import sqlContext.implicits._
   import sqlContext.sql
 
+  test("having clause") {
+    Seq(("one", 1), ("two", 2), ("three", 3), ("one", 5)).toDF("k", "v").registerTempTable("hav")
+    checkAnswer(
+      sql("SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2"),
+      Row("one", 6) :: Row("three", 3) :: Nil)
+  }
+
   test("SPARK-6743: no columns from cache") {
     Seq(
       (83, 0, 38),
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 041483ebfb8d9..ca4b80b51b23f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -1307,7 +1307,6 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
     HiveParser.DecimalLiteral)
 
   /* Case insensitive matches */
-  val COALESCE = "(?i)COALESCE".r
   val COUNT = "(?i)COUNT".r
   val SUM = "(?i)SUM".r
   val AND = "(?i)AND".r

From c6ba7cca3338e3f4f719d86dbcff4406d949edc7 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Wed, 10 Jun 2015 09:45:45 -0700
Subject: [PATCH 432/525] [SPARK-8215] [SPARK-8212] [SQL] add leaf math
 expression for e and pi

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #6716 from adrian-wang/epi and squashes the following commits:

e2e8dbd [Daoyuan Wang] move tests
11b351c [Daoyuan Wang] add tests and remove pu
db331c9 [Daoyuan Wang] py style
599ddd8 [Daoyuan Wang] add py
e6783ef [Daoyuan Wang] register function
82d426e [Daoyuan Wang] add function entry
dbf3ab5 [Daoyuan Wang] add PI and E
---
 .../catalyst/analysis/FunctionRegistry.scala  |  2 ++
 .../spark/sql/catalyst/expressions/math.scala | 35 +++++++++++++++++++
 .../expressions/MathFunctionsSuite.scala      | 22 ++++++++++++
 .../org/apache/spark/sql/functions.scala      | 18 ++++++++++
 .../spark/sql/DataFrameFunctionsSuite.scala   | 19 ++++++++++
 5 files changed, 96 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 936ffc7d5ff55..ba89a5c8d1372 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -106,6 +106,7 @@ object FunctionRegistry {
     expression[Cbrt]("cbrt"),
     expression[Ceil]("ceil"),
     expression[Cos]("cos"),
+    expression[EulerNumber]("e"),
     expression[Exp]("exp"),
     expression[Expm1]("expm1"),
     expression[Floor]("floor"),
@@ -113,6 +114,7 @@ object FunctionRegistry {
     expression[Log]("log"),
     expression[Log10]("log10"),
     expression[Log1p]("log1p"),
+    expression[Pi]("pi"),
     expression[Pow]("pow"),
     expression[Rint]("rint"),
     expression[Signum]("signum"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 7dacb6a9b47b6..e1d8c9a0cdb5a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -20,9 +20,34 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types.{DataType, DoubleType}
 
+/**
+ * A leaf expression specifically for math constants. Math constants expect no input.
+ * @param c The math constant.
+ * @param name The short name of the function
+ */
+abstract class LeafMathExpression(c: Double, name: String)
+  extends LeafExpression with Serializable {
+  self: Product =>
+
+  override def dataType: DataType = DoubleType
+  override def foldable: Boolean = true
+  override def nullable: Boolean = false
+  override def toString: String = s"$name()"
+
+  override def eval(input: Row): Any = c
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    s"""
+      boolean ${ev.isNull} = false;
+      ${ctx.javaType(dataType)} ${ev.primitive} = java.lang.Math.$name;
+    """
+  }
+}
+
 /**
  * A unary expression specifically for math functions. Math Functions expect a specific type of
  * input format, therefore these functions extend `ExpectsInputTypes`.
+ * @param f The math function.
  * @param name The short name of the function
  */
 abstract class UnaryMathExpression(f: Double => Double, name: String)
@@ -98,6 +123,16 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Leaf math functions
+////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+case class EulerNumber() extends LeafMathExpression(math.E, "E")
+
+case class Pi() extends LeafMathExpression(math.Pi, "PI")
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Unary math functions
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 25ebc70d095d8..1fe69059d39da 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -22,6 +22,20 @@ import org.apache.spark.sql.types.DoubleType
 
 class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
+  /**
+   * Used for testing leaf math expressions.
+   *
+   * @param e expression
+   * @param c The constants in scala.math
+   * @tparam T Generic type for primitives
+   */
+  private def testLeaf[T](
+      e: () => Expression,
+      c: T): Unit = {
+    checkEvaluation(e(), c, EmptyRow)
+    checkEvaluation(e(), c, create_row(null))
+  }
+
   /**
    * Used for testing unary math expressions.
    *
@@ -74,6 +88,14 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(c(Literal(1.0), Literal.create(null, DoubleType)), null, create_row(null))
   }
 
+  test("e") {
+    testLeaf(EulerNumber, math.E)
+  }
+
+  test("pi") {
+    testLeaf(Pi, math.Pi)
+  }
+
   test("sin") {
     testUnary(Sin, math.sin)
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 454af47913bf1..b3fc1e6cd987e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -944,6 +944,15 @@ object functions {
    */
   def cosh(columnName: String): Column = cosh(Column(columnName))
 
+  /**
+   * Returns the double value that is closer than any other to e, the base of the natural
+   * logarithms.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def e(): Column = EulerNumber()
+
   /**
    * Computes the exponential of the given value.
    *
@@ -1105,6 +1114,15 @@ object functions {
    */
   def log1p(columnName: String): Column = log1p(Column(columnName))
 
+  /**
+   * Returns the double value that is closer than any other to pi, the ratio of the circumference
+   * of a circle to its diameter.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def pi(): Column = Pi()
+
   /**
    * Returns the value of the first argument raised to the power of the second argument.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 53c2befb73702..b93ad39f5da45 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -85,6 +85,25 @@ class DataFrameFunctionsSuite extends QueryTest {
     }
   }
 
+  test("constant functions") {
+    checkAnswer(
+      testData2.select(e()).limit(1),
+      Row(scala.math.E)
+    )
+    checkAnswer(
+      testData2.select(pi()).limit(1),
+      Row(scala.math.Pi)
+    )
+    checkAnswer(
+      ctx.sql("SELECT E()"),
+      Row(scala.math.E)
+    )
+    checkAnswer(
+      ctx.sql("SELECT PI()"),
+      Row(scala.math.Pi)
+    )
+  }
+
   test("bitwiseNOT") {
     checkAnswer(
       testData2.select(bitwiseNOT($"a")),

From 2b550a521e45e1dbca2cca40ddd94e20c013831c Mon Sep 17 00:00:00 2001
From: Ilya Ganelin <ilya.ganelin@capitalone.com>
Date: Wed, 10 Jun 2015 11:21:12 -0700
Subject: [PATCH 433/525] [SPARK-7996] Deprecate the developer api
 SparkEnv.actorSystem

Changed ```SparkEnv.actorSystem``` to be a function such that we can use the deprecated flag with it and added a deprecated message.

Author: Ilya Ganelin <ilya.ganelin@capitalone.com>

Closes #6731 from ilganeli/SPARK-7996 and squashes the following commits:

be43817 [Ilya Ganelin] Restored to val
9ed89e7 [Ilya Ganelin] Added a version info for deprecation
9610b08 [Ilya Ganelin] Converted actorSystem to function and added deprecated flag
---
 core/src/main/scala/org/apache/spark/SparkEnv.scala | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala
index a185954089528..b0665570e2681 100644
--- a/core/src/main/scala/org/apache/spark/SparkEnv.scala
+++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -20,6 +20,8 @@ package org.apache.spark
 import java.io.File
 import java.net.Socket
 
+import akka.actor.ActorSystem
+
 import scala.collection.JavaConversions._
 import scala.collection.mutable
 import scala.util.Properties
@@ -75,7 +77,8 @@ class SparkEnv (
     val conf: SparkConf) extends Logging {
 
   // TODO Remove actorSystem
-  val actorSystem = rpcEnv.asInstanceOf[AkkaRpcEnv].actorSystem
+  @deprecated("Actor system is no longer supported as of 1.4")
+  val actorSystem: ActorSystem = rpcEnv.asInstanceOf[AkkaRpcEnv].actorSystem
 
   private[spark] var isStopped = false
   private val pythonWorkers = mutable.HashMap[(String, Map[String, String]), PythonWorkerFactory]()

From 8f7308f9c49805b9486aaae5f60e4481e8ba24e8 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Wed, 10 Jun 2015 11:48:14 -0700
Subject: [PATCH 434/525] [SQL] [MINOR] Fixes a minor Java example error in SQL
 programming guide

Author: Cheng Lian <lian@databricks.com>

Closes #6749 from liancheng/java-sample-fix and squashes the following commits:

5b44585 [Cheng Lian] Fixes a minor Java example error in SQL programming guide
---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 40e33f757d693..c5ab074e4439f 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1479,7 +1479,7 @@ expressed in HiveQL.
 
 {% highlight java %}
 // sc is an existing JavaSparkContext.
-HiveContext sqlContext = new org.apache.spark.sql.hive.HiveContext(sc);
+HiveContext sqlContext = new org.apache.spark.sql.hive.HiveContext(sc.sc);
 
 sqlContext.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
 sqlContext.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src");

From 38112905bc3b33f2ae75274afba1c30e116f6e46 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Wed, 10 Jun 2015 13:17:29 -0700
Subject: [PATCH 435/525] [SPARK-5479] [YARN] Handle --py-files correctly in
 YARN.

The bug description is a little misleading: the actual issue is that
.py files are not handled correctly when distributed by YARN. They're
added to "spark.submit.pyFiles", which, when processed by context.py,
explicitly whitelists certain extensions (see PACKAGE_EXTENSIONS),
and that does not include .py files.

On top of that, archives were not handled at all! They made it to the
driver's python path, but never made it to executors, since the mechanism
used to propagate their location (spark.submit.pyFiles) only works on
the driver side.

So, instead, ignore "spark.submit.pyFiles" and just build PYTHONPATH
correctly for both driver and executors. Individual .py files are
placed in a subdirectory of the container's local dir in the cluster,
which is then added to the python path. Archives are added directly.

The change, as a side effect, ends up solving the symptom described
in the bug. The issue was not that the files were not being distributed,
but that they were never made visible to the python application
running under Spark.

Also included is a proper unit test for running python on YARN, which
broke in several different ways with the previous code.

A short walk around of the changes:
- SparkSubmit does not try to be smart about how YARN handles python
  files anymore. It just passes down the configs to the YARN client
  code.
- The YARN client distributes python files and archives differently,
  placing the files in a subdirectory.
- The YARN client now sets PYTHONPATH for the processes it launches;
  to properly handle different locations, it uses YARN's support for
  embedding env variables, so to avoid YARN expanding those at the
  wrong time, SparkConf is now propagated to the AM using a conf file
  instead of command line options.
- Because the Client initialization code is a maze of implicit
  dependencies, some code needed to be moved around to make sure
  all needed state was available when the code ran.
- The pyspark tests in YarnClusterSuite now actually distribute and try
  to use both a python file and an archive containing a different python
  module. Also added a yarn-client tests for completeness.
- I cleaned up some of the code around distributing files to YARN, to
  avoid adding more copied & pasted code to handle the new files being
  distributed.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6360 from vanzin/SPARK-5479 and squashes the following commits:

bcaf7e6 [Marcelo Vanzin] Feedback.
c47501f [Marcelo Vanzin] Fix yarn-client mode.
46b1d0c [Marcelo Vanzin] Merge branch 'master' into SPARK-5479
c743778 [Marcelo Vanzin] Only pyspark cares about python archives.
c8e5a82 [Marcelo Vanzin] Actually run pyspark in client mode.
705571d [Marcelo Vanzin] Move some code to the YARN module.
1dd4d0c [Marcelo Vanzin] Review feedback.
71ee736 [Marcelo Vanzin] Merge branch 'master' into SPARK-5479
220358b [Marcelo Vanzin] Scalastyle.
cdbb990 [Marcelo Vanzin] Merge branch 'master' into SPARK-5479
7fe3cd4 [Marcelo Vanzin] No need to distribute primary file to executors.
09045f1 [Marcelo Vanzin] Style.
943cbf4 [Marcelo Vanzin] [SPARK-5479] [yarn] Handle --py-files correctly in YARN.
---
 .../org/apache/spark/deploy/SparkSubmit.scala |  77 +----
 .../spark/deploy/yarn/ApplicationMaster.scala |  20 +-
 .../yarn/ApplicationMasterArguments.scala     |  12 +-
 .../org/apache/spark/deploy/yarn/Client.scala | 295 +++++++++++-------
 .../spark/deploy/yarn/ClientArguments.scala   |   4 +-
 .../cluster/YarnClientSchedulerBackend.scala  |   5 +-
 .../spark/deploy/yarn/ClientSuite.scala       |   4 +-
 .../spark/deploy/yarn/YarnClusterSuite.scala  |  61 ++--
 8 files changed, 270 insertions(+), 208 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index a0eae774268ed..b8978e25a02d2 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -324,55 +324,20 @@ object SparkSubmit {
         // Usage: PythonAppRunner <main python file> <extra python files> [app arguments]
         args.mainClass = "org.apache.spark.deploy.PythonRunner"
         args.childArgs = ArrayBuffer(args.primaryResource, args.pyFiles) ++ args.childArgs
-        args.files = mergeFileLists(args.files, args.primaryResource)
+        if (clusterManager != YARN) {
+          // The YARN backend distributes the primary file differently, so don't merge it.
+          args.files = mergeFileLists(args.files, args.primaryResource)
+        }
+      }
+      if (clusterManager != YARN) {
+        // The YARN backend handles python files differently, so don't merge the lists.
+        args.files = mergeFileLists(args.files, args.pyFiles)
       }
-      args.files = mergeFileLists(args.files, args.pyFiles)
       if (args.pyFiles != null) {
         sysProps("spark.submit.pyFiles") = args.pyFiles
       }
     }
 
-    // In yarn mode for a python app, add pyspark archives to files
-    // that can be distributed with the job
-    if (args.isPython && clusterManager == YARN) {
-      var pyArchives: String = null
-      val pyArchivesEnvOpt = sys.env.get("PYSPARK_ARCHIVES_PATH")
-      if (pyArchivesEnvOpt.isDefined) {
-        pyArchives = pyArchivesEnvOpt.get
-      } else {
-        if (!sys.env.contains("SPARK_HOME")) {
-          printErrorAndExit("SPARK_HOME does not exist for python application in yarn mode.")
-        }
-        val pythonPath = new ArrayBuffer[String]
-        for (sparkHome <- sys.env.get("SPARK_HOME")) {
-          val pyLibPath = Seq(sparkHome, "python", "lib").mkString(File.separator)
-          val pyArchivesFile = new File(pyLibPath, "pyspark.zip")
-          if (!pyArchivesFile.exists()) {
-            printErrorAndExit("pyspark.zip does not exist for python application in yarn mode.")
-          }
-          val py4jFile = new File(pyLibPath, "py4j-0.8.2.1-src.zip")
-          if (!py4jFile.exists()) {
-            printErrorAndExit("py4j-0.8.2.1-src.zip does not exist for python application " +
-              "in yarn mode.")
-          }
-          pythonPath += pyArchivesFile.getAbsolutePath()
-          pythonPath += py4jFile.getAbsolutePath()
-        }
-        pyArchives = pythonPath.mkString(",")
-      }
-
-      pyArchives = pyArchives.split(",").map { localPath =>
-        val localURI = Utils.resolveURI(localPath)
-        if (localURI.getScheme != "local") {
-          args.files = mergeFileLists(args.files, localURI.toString)
-          new Path(localPath).getName
-        } else {
-          localURI.getPath
-        }
-      }.mkString(File.pathSeparator)
-      sysProps("spark.submit.pyArchives") = pyArchives
-    }
-
     // If we're running a R app, set the main class to our specific R runner
     if (args.isR && deployMode == CLIENT) {
       if (args.primaryResource == SPARKR_SHELL) {
@@ -386,19 +351,10 @@ object SparkSubmit {
       }
     }
 
-    if (isYarnCluster) {
-      // In yarn-cluster mode for a python app, add primary resource and pyFiles to files
-      // that can be distributed with the job
-      if (args.isPython) {
-        args.files = mergeFileLists(args.files, args.primaryResource)
-        args.files = mergeFileLists(args.files, args.pyFiles)
-      }
-
+    if (isYarnCluster && args.isR) {
       // In yarn-cluster mode for a R app, add primary resource to files
       // that can be distributed with the job
-      if (args.isR) {
-        args.files = mergeFileLists(args.files, args.primaryResource)
-      }
+      args.files = mergeFileLists(args.files, args.primaryResource)
     }
 
     // Special flag to avoid deprecation warnings at the client
@@ -515,17 +471,18 @@ object SparkSubmit {
       }
     }
 
+    // Let YARN know it's a pyspark app, so it distributes needed libraries.
+    if (clusterManager == YARN && args.isPython) {
+      sysProps.put("spark.yarn.isPython", "true")
+    }
+
     // In yarn-cluster mode, use yarn.Client as a wrapper around the user class
     if (isYarnCluster) {
       childMainClass = "org.apache.spark.deploy.yarn.Client"
       if (args.isPython) {
-        val mainPyFile = new Path(args.primaryResource).getName
-        childArgs += ("--primary-py-file", mainPyFile)
+        childArgs += ("--primary-py-file", args.primaryResource)
         if (args.pyFiles != null) {
-          // These files will be distributed to each machine's working directory, so strip the
-          // path prefix
-          val pyFilesNames = args.pyFiles.split(",").map(p => (new Path(p)).getName).mkString(",")
-          childArgs += ("--py-files", pyFilesNames)
+          childArgs += ("--py-files", args.pyFiles)
         }
         childArgs += ("--class", "org.apache.spark.deploy.PythonRunner")
       } else if (args.isR) {
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 002d7b6eaf498..83dafa4a125d2 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -32,7 +32,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.spark.rpc._
 import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext, SparkEnv}
 import org.apache.spark.SparkException
-import org.apache.spark.deploy.{PythonRunner, SparkHadoopUtil}
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.deploy.history.HistoryServer
 import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, YarnSchedulerBackend}
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
@@ -46,6 +46,14 @@ private[spark] class ApplicationMaster(
     client: YarnRMClient)
   extends Logging {
 
+  // Load the properties file with the Spark configuration and set entries as system properties,
+  // so that user code run inside the AM also has access to them.
+  if (args.propertiesFile != null) {
+    Utils.getPropertiesFromFile(args.propertiesFile).foreach { case (k, v) =>
+      sys.props(k) = v
+    }
+  }
+
   // TODO: Currently, task to container is computed once (TaskSetManager) - which need not be
   // optimal as more containers are available. Might need to handle this better.
 
@@ -490,9 +498,11 @@ private[spark] class ApplicationMaster(
         new MutableURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
       }
 
+    var userArgs = args.userArgs
     if (args.primaryPyFile != null && args.primaryPyFile.endsWith(".py")) {
-      System.setProperty("spark.submit.pyFiles",
-        PythonRunner.formatPaths(args.pyFiles).mkString(","))
+      // When running pyspark, the app is run using PythonRunner. The second argument is the list
+      // of files to add to PYTHONPATH, which Client.scala already handles, so it's empty.
+      userArgs = Seq(args.primaryPyFile, "") ++ userArgs
     }
     if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
       // TODO(davies): add R dependencies here
@@ -503,9 +513,7 @@ private[spark] class ApplicationMaster(
     val userThread = new Thread {
       override def run() {
         try {
-          val mainArgs = new Array[String](args.userArgs.size)
-          args.userArgs.copyToArray(mainArgs, 0, args.userArgs.size)
-          mainMethod.invoke(null, mainArgs)
+          mainMethod.invoke(null, userArgs.toArray)
           finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
           logDebug("Done running users class")
         } catch {
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
index ae6dc1094d724..68e9f6b4db7f4 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
@@ -26,11 +26,11 @@ class ApplicationMasterArguments(val args: Array[String]) {
   var userClass: String = null
   var primaryPyFile: String = null
   var primaryRFile: String = null
-  var pyFiles: String = null
-  var userArgs: Seq[String] = Seq[String]()
+  var userArgs: Seq[String] = Nil
   var executorMemory = 1024
   var executorCores = 1
   var numExecutors = DEFAULT_NUMBER_EXECUTORS
+  var propertiesFile: String = null
 
   parseArgs(args.toList)
 
@@ -59,10 +59,6 @@ class ApplicationMasterArguments(val args: Array[String]) {
           primaryRFile = value
           args = tail
 
-        case ("--py-files") :: value :: tail =>
-          pyFiles = value
-          args = tail
-
         case ("--args" | "--arg") :: value :: tail =>
           userArgsBuffer += value
           args = tail
@@ -79,6 +75,10 @@ class ApplicationMasterArguments(val args: Array[String]) {
           executorCores = value
           args = tail
 
+        case ("--properties-file") :: value :: tail =>
+          propertiesFile = value
+          args = tail
+
         case _ =>
           printUsageAndExit(1, args)
       }
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index f4d43214b08ca..ec9402afff329 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -17,11 +17,12 @@
 
 package org.apache.spark.deploy.yarn
 
-import java.io.{ByteArrayInputStream, DataInputStream, File, FileOutputStream, IOException}
+import java.io.{ByteArrayInputStream, DataInputStream, File, FileOutputStream, IOException,
+  OutputStreamWriter}
 import java.net.{InetAddress, UnknownHostException, URI, URISyntaxException}
 import java.nio.ByteBuffer
 import java.security.PrivilegedExceptionAction
-import java.util.UUID
+import java.util.{Properties, UUID}
 import java.util.zip.{ZipEntry, ZipOutputStream}
 
 import scala.collection.JavaConversions._
@@ -29,6 +30,7 @@ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, ListBuffer, Map}
 import scala.reflect.runtime.universe
 import scala.util.{Try, Success, Failure}
 
+import com.google.common.base.Charsets.UTF_8
 import com.google.common.base.Objects
 import com.google.common.io.Files
 
@@ -247,7 +249,9 @@ private[spark] class Client(
    * This is used for setting up a container launch context for our ApplicationMaster.
    * Exposed for testing.
    */
-  def prepareLocalResources(appStagingDir: String): HashMap[String, LocalResource] = {
+  def prepareLocalResources(
+      appStagingDir: String,
+      pySparkArchives: Seq[String]): HashMap[String, LocalResource] = {
     logInfo("Preparing resources for our AM container")
     // Upload Spark and the application JAR to the remote file system if necessary,
     // and add them as local resources to the application master.
@@ -277,20 +281,6 @@ private[spark] class Client(
           "for alternatives.")
     }
 
-    // If we passed in a keytab, make sure we copy the keytab to the staging directory on
-    // HDFS, and setup the relevant environment vars, so the AM can login again.
-    if (loginFromKeytab) {
-      logInfo("To enable the AM to login from keytab, credentials are being copied over to the AM" +
-        " via the YARN Secure Distributed Cache.")
-      val localUri = new URI(args.keytab)
-      val localPath = getQualifiedLocalPath(localUri, hadoopConf)
-      val destinationPath = copyFileToRemote(dst, localPath, replication)
-      val destFs = FileSystem.get(destinationPath.toUri(), hadoopConf)
-      distCacheMgr.addResource(
-        destFs, hadoopConf, destinationPath, localResources, LocalResourceType.FILE,
-        sparkConf.get("spark.yarn.keytab"), statCache, appMasterOnly = true)
-    }
-
     def addDistributedUri(uri: URI): Boolean = {
       val uriStr = uri.toString()
       if (distributedUris.contains(uriStr)) {
@@ -302,6 +292,57 @@ private[spark] class Client(
       }
     }
 
+    /**
+     * Distribute a file to the cluster.
+     *
+     * If the file's path is a "local:" URI, it's actually not distributed. Other files are copied
+     * to HDFS (if not already there) and added to the application's distributed cache.
+     *
+     * @param path URI of the file to distribute.
+     * @param resType Type of resource being distributed.
+     * @param destName Name of the file in the distributed cache.
+     * @param targetDir Subdirectory where to place the file.
+     * @param appMasterOnly Whether to distribute only to the AM.
+     * @return A 2-tuple. First item is whether the file is a "local:" URI. Second item is the
+     *         localized path for non-local paths, or the input `path` for local paths.
+     *         The localized path will be null if the URI has already been added to the cache.
+     */
+    def distribute(
+        path: String,
+        resType: LocalResourceType = LocalResourceType.FILE,
+        destName: Option[String] = None,
+        targetDir: Option[String] = None,
+        appMasterOnly: Boolean = false): (Boolean, String) = {
+      val localURI = new URI(path.trim())
+      if (localURI.getScheme != LOCAL_SCHEME) {
+        if (addDistributedUri(localURI)) {
+          val localPath = getQualifiedLocalPath(localURI, hadoopConf)
+          val linkname = targetDir.map(_ + "/").getOrElse("") +
+            destName.orElse(Option(localURI.getFragment())).getOrElse(localPath.getName())
+          val destPath = copyFileToRemote(dst, localPath, replication)
+          distCacheMgr.addResource(
+            fs, hadoopConf, destPath, localResources, resType, linkname, statCache,
+            appMasterOnly = appMasterOnly)
+          (false, linkname)
+        } else {
+          (false, null)
+        }
+      } else {
+        (true, path.trim())
+      }
+    }
+
+    // If we passed in a keytab, make sure we copy the keytab to the staging directory on
+    // HDFS, and setup the relevant environment vars, so the AM can login again.
+    if (loginFromKeytab) {
+      logInfo("To enable the AM to login from keytab, credentials are being copied over to the AM" +
+        " via the YARN Secure Distributed Cache.")
+      val (_, localizedPath) = distribute(args.keytab,
+        destName = Some(sparkConf.get("spark.yarn.keytab")),
+        appMasterOnly = true)
+      require(localizedPath != null, "Keytab file already distributed.")
+    }
+
     /**
      * Copy the given main resource to the distributed cache if the scheme is not "local".
      * Otherwise, set the corresponding key in our SparkConf to handle it downstream.
@@ -314,33 +355,18 @@ private[spark] class Client(
       (SPARK_JAR, sparkJar(sparkConf), CONF_SPARK_JAR),
       (APP_JAR, args.userJar, CONF_SPARK_USER_JAR),
       ("log4j.properties", oldLog4jConf.orNull, null)
-    ).foreach { case (destName, _localPath, confKey) =>
-      val localPath: String = if (_localPath != null) _localPath.trim() else ""
-      if (!localPath.isEmpty()) {
-        val localURI = new URI(localPath)
-        if (localURI.getScheme != LOCAL_SCHEME) {
-          if (addDistributedUri(localURI)) {
-            val src = getQualifiedLocalPath(localURI, hadoopConf)
-            val destPath = copyFileToRemote(dst, src, replication)
-            val destFs = FileSystem.get(destPath.toUri(), hadoopConf)
-            distCacheMgr.addResource(destFs, hadoopConf, destPath,
-              localResources, LocalResourceType.FILE, destName, statCache)
-          }
-        } else if (confKey != null) {
+    ).foreach { case (destName, path, confKey) =>
+      if (path != null && !path.trim().isEmpty()) {
+        val (isLocal, localizedPath) = distribute(path, destName = Some(destName))
+        if (isLocal && confKey != null) {
+          require(localizedPath != null, s"Path $path already distributed.")
           // If the resource is intended for local use only, handle this downstream
           // by setting the appropriate property
-          sparkConf.set(confKey, localPath)
+          sparkConf.set(confKey, localizedPath)
         }
       }
     }
 
-    createConfArchive().foreach { file =>
-      require(addDistributedUri(file.toURI()))
-      val destPath = copyFileToRemote(dst, new Path(file.toURI()), replication)
-      distCacheMgr.addResource(fs, hadoopConf, destPath, localResources, LocalResourceType.ARCHIVE,
-        LOCALIZED_HADOOP_CONF_DIR, statCache, appMasterOnly = true)
-    }
-
     /**
      * Do the same for any additional resources passed in through ClientArguments.
      * Each resource category is represented by a 3-tuple of:
@@ -356,21 +382,10 @@ private[spark] class Client(
     ).foreach { case (flist, resType, addToClasspath) =>
       if (flist != null && !flist.isEmpty()) {
         flist.split(',').foreach { file =>
-          val localURI = new URI(file.trim())
-          if (localURI.getScheme != LOCAL_SCHEME) {
-            if (addDistributedUri(localURI)) {
-              val localPath = new Path(localURI)
-              val linkname = Option(localURI.getFragment()).getOrElse(localPath.getName())
-              val destPath = copyFileToRemote(dst, localPath, replication)
-              distCacheMgr.addResource(
-                fs, hadoopConf, destPath, localResources, resType, linkname, statCache)
-              if (addToClasspath) {
-                cachedSecondaryJarLinks += linkname
-              }
-            }
-          } else if (addToClasspath) {
-            // Resource is intended for local use only and should be added to the class path
-            cachedSecondaryJarLinks += file.trim()
+          val (_, localizedPath) = distribute(file, resType = resType)
+          require(localizedPath != null)
+          if (addToClasspath) {
+            cachedSecondaryJarLinks += localizedPath
           }
         }
       }
@@ -379,11 +394,31 @@ private[spark] class Client(
       sparkConf.set(CONF_SPARK_YARN_SECONDARY_JARS, cachedSecondaryJarLinks.mkString(","))
     }
 
+    if (isClusterMode && args.primaryPyFile != null) {
+      distribute(args.primaryPyFile, appMasterOnly = true)
+    }
+
+    pySparkArchives.foreach { f => distribute(f) }
+
+    // The python files list needs to be treated especially. All files that are not an
+    // archive need to be placed in a subdirectory that will be added to PYTHONPATH.
+    args.pyFiles.foreach { f =>
+      val targetDir = if (f.endsWith(".py")) Some(LOCALIZED_PYTHON_DIR) else None
+      distribute(f, targetDir = targetDir)
+    }
+
+    // Distribute an archive with Hadoop and Spark configuration for the AM.
+    val (_, confLocalizedPath) = distribute(createConfArchive().getAbsolutePath(),
+      resType = LocalResourceType.ARCHIVE,
+      destName = Some(LOCALIZED_CONF_DIR),
+      appMasterOnly = true)
+    require(confLocalizedPath != null)
+
     localResources
   }
 
   /**
-   * Create an archive with the Hadoop config files for distribution.
+   * Create an archive with the config files for distribution.
    *
    * These are only used by the AM, since executors will use the configuration object broadcast by
    * the driver. The files are zipped and added to the job as an archive, so that YARN will explode
@@ -395,8 +430,11 @@ private[spark] class Client(
    *
    * Currently this makes a shallow copy of the conf directory. If there are cases where a
    * Hadoop config directory contains subdirectories, this code will have to be fixed.
+   *
+   * The archive also contains some Spark configuration. Namely, it saves the contents of
+   * SparkConf in a file to be loaded by the AM process.
    */
-  private def createConfArchive(): Option[File] = {
+  private def createConfArchive(): File = {
     val hadoopConfFiles = new HashMap[String, File]()
     Seq("HADOOP_CONF_DIR", "YARN_CONF_DIR").foreach { envKey =>
       sys.env.get(envKey).foreach { path =>
@@ -411,28 +449,32 @@ private[spark] class Client(
       }
     }
 
-    if (!hadoopConfFiles.isEmpty) {
-      val hadoopConfArchive = File.createTempFile(LOCALIZED_HADOOP_CONF_DIR, ".zip",
-        new File(Utils.getLocalDir(sparkConf)))
+    val confArchive = File.createTempFile(LOCALIZED_CONF_DIR, ".zip",
+      new File(Utils.getLocalDir(sparkConf)))
+    val confStream = new ZipOutputStream(new FileOutputStream(confArchive))
 
-      val hadoopConfStream = new ZipOutputStream(new FileOutputStream(hadoopConfArchive))
-      try {
-        hadoopConfStream.setLevel(0)
-        hadoopConfFiles.foreach { case (name, file) =>
-          if (file.canRead()) {
-            hadoopConfStream.putNextEntry(new ZipEntry(name))
-            Files.copy(file, hadoopConfStream)
-            hadoopConfStream.closeEntry()
-          }
+    try {
+      confStream.setLevel(0)
+      hadoopConfFiles.foreach { case (name, file) =>
+        if (file.canRead()) {
+          confStream.putNextEntry(new ZipEntry(name))
+          Files.copy(file, confStream)
+          confStream.closeEntry()
         }
-      } finally {
-        hadoopConfStream.close()
       }
 
-      Some(hadoopConfArchive)
-    } else {
-      None
+      // Save Spark configuration to a file in the archive.
+      val props = new Properties()
+      sparkConf.getAll.foreach { case (k, v) => props.setProperty(k, v) }
+      confStream.putNextEntry(new ZipEntry(SPARK_CONF_FILE))
+      val writer = new OutputStreamWriter(confStream, UTF_8)
+      props.store(writer, "Spark configuration.")
+      writer.flush()
+      confStream.closeEntry()
+    } finally {
+      confStream.close()
     }
+    confArchive
   }
 
   /**
@@ -460,7 +502,9 @@ private[spark] class Client(
   /**
    * Set up the environment for launching our ApplicationMaster container.
    */
-  private def setupLaunchEnv(stagingDir: String): HashMap[String, String] = {
+  private def setupLaunchEnv(
+      stagingDir: String,
+      pySparkArchives: Seq[String]): HashMap[String, String] = {
     logInfo("Setting up the launch environment for our AM container")
     val env = new HashMap[String, String]()
     val extraCp = sparkConf.getOption("spark.driver.extraClassPath")
@@ -478,9 +522,6 @@ private[spark] class Client(
       val renewalInterval = getTokenRenewalInterval(stagingDirPath)
       sparkConf.set("spark.yarn.token.renewal.interval", renewalInterval.toString)
     }
-    // Set the environment variables to be passed on to the executors.
-    distCacheMgr.setDistFilesEnv(env)
-    distCacheMgr.setDistArchivesEnv(env)
 
     // Pick up any environment variables for the AM provided through spark.yarn.appMasterEnv.*
     val amEnvPrefix = "spark.yarn.appMasterEnv."
@@ -497,15 +538,32 @@ private[spark] class Client(
       env("SPARK_YARN_USER_ENV") = userEnvs
     }
 
-    // if spark.submit.pyArchives is in sparkConf, append pyArchives to PYTHONPATH
-    // that can be passed on to the ApplicationMaster and the executors.
-    if (sparkConf.contains("spark.submit.pyArchives")) {
-      var pythonPath = sparkConf.get("spark.submit.pyArchives")
-      if (env.contains("PYTHONPATH")) {
-        pythonPath = Seq(env.get("PYTHONPATH"), pythonPath).mkString(File.pathSeparator)
+    // If pyFiles contains any .py files, we need to add LOCALIZED_PYTHON_DIR to the PYTHONPATH
+    // of the container processes too. Add all non-.py files directly to PYTHONPATH.
+    //
+    // NOTE: the code currently does not handle .py files defined with a "local:" scheme.
+    val pythonPath = new ListBuffer[String]()
+    val (pyFiles, pyArchives) = args.pyFiles.partition(_.endsWith(".py"))
+    if (pyFiles.nonEmpty) {
+      pythonPath += buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+        LOCALIZED_PYTHON_DIR)
+    }
+    (pySparkArchives ++ pyArchives).foreach { path =>
+      val uri = new URI(path)
+      if (uri.getScheme != LOCAL_SCHEME) {
+        pythonPath += buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+          new Path(path).getName())
+      } else {
+        pythonPath += uri.getPath()
       }
-      env("PYTHONPATH") = pythonPath
-      sparkConf.setExecutorEnv("PYTHONPATH", pythonPath)
+    }
+
+    // Finally, update the Spark config to propagate PYTHONPATH to the AM and executors.
+    if (pythonPath.nonEmpty) {
+      val pythonPathStr = (sys.env.get("PYTHONPATH") ++ pythonPath)
+        .mkString(YarnSparkHadoopUtil.getClassPathSeparator)
+      env("PYTHONPATH") = pythonPathStr
+      sparkConf.setExecutorEnv("PYTHONPATH", pythonPathStr)
     }
 
     // In cluster mode, if the deprecated SPARK_JAVA_OPTS is set, we need to propagate it to
@@ -555,8 +613,19 @@ private[spark] class Client(
     logInfo("Setting up container launch context for our AM")
     val appId = newAppResponse.getApplicationId
     val appStagingDir = getAppStagingDir(appId)
-    val localResources = prepareLocalResources(appStagingDir)
-    val launchEnv = setupLaunchEnv(appStagingDir)
+    val pySparkArchives =
+      if (sys.props.getOrElse("spark.yarn.isPython", "false").toBoolean) {
+        findPySparkArchives()
+      } else {
+        Nil
+      }
+    val launchEnv = setupLaunchEnv(appStagingDir, pySparkArchives)
+    val localResources = prepareLocalResources(appStagingDir, pySparkArchives)
+
+    // Set the environment variables to be passed on to the executors.
+    distCacheMgr.setDistFilesEnv(launchEnv)
+    distCacheMgr.setDistArchivesEnv(launchEnv)
+
     val amContainer = Records.newRecord(classOf[ContainerLaunchContext])
     amContainer.setLocalResources(localResources)
     amContainer.setEnvironment(launchEnv)
@@ -596,13 +665,6 @@ private[spark] class Client(
       javaOpts += "-XX:CMSIncrementalDutyCycle=10"
     }
 
-    // Forward the Spark configuration to the application master / executors.
-    // TODO: it might be nicer to pass these as an internal environment variable rather than
-    // as Java options, due to complications with string parsing of nested quotes.
-    for ((k, v) <- sparkConf.getAll) {
-      javaOpts += YarnSparkHadoopUtil.escapeForShell(s"-D$k=$v")
-    }
-
     // Include driver-specific java options if we are launching a driver
     if (isClusterMode) {
       val driverOpts = sparkConf.getOption("spark.driver.extraJavaOptions")
@@ -655,14 +717,8 @@ private[spark] class Client(
         Nil
       }
     val primaryPyFile =
-      if (args.primaryPyFile != null) {
-        Seq("--primary-py-file", args.primaryPyFile)
-      } else {
-        Nil
-      }
-    val pyFiles =
-      if (args.pyFiles != null) {
-        Seq("--py-files", args.pyFiles)
+      if (isClusterMode && args.primaryPyFile != null) {
+        Seq("--primary-py-file", new Path(args.primaryPyFile).getName())
       } else {
         Nil
       }
@@ -678,9 +734,6 @@ private[spark] class Client(
       } else {
         Class.forName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
       }
-    if (args.primaryPyFile != null && args.primaryPyFile.endsWith(".py")) {
-      args.userArgs = ArrayBuffer(args.primaryPyFile, args.pyFiles) ++ args.userArgs
-    }
     if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
       args.userArgs = ArrayBuffer(args.primaryRFile) ++ args.userArgs
     }
@@ -688,11 +741,13 @@ private[spark] class Client(
       Seq("--arg", YarnSparkHadoopUtil.escapeForShell(arg))
     }
     val amArgs =
-      Seq(amClass) ++ userClass ++ userJar ++ primaryPyFile ++ pyFiles ++ primaryRFile ++
+      Seq(amClass) ++ userClass ++ userJar ++ primaryPyFile ++ primaryRFile ++
         userArgs ++ Seq(
           "--executor-memory", args.executorMemory.toString + "m",
           "--executor-cores", args.executorCores.toString,
-          "--num-executors ", args.numExecutors.toString)
+          "--num-executors ", args.numExecutors.toString,
+          "--properties-file", buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+            LOCALIZED_CONF_DIR, SPARK_CONF_FILE))
 
     // Command for the ApplicationMaster
     val commands = prefixEnv ++ Seq(
@@ -857,6 +912,22 @@ private[spark] class Client(
       }
     }
   }
+
+  private def findPySparkArchives(): Seq[String] = {
+    sys.env.get("PYSPARK_ARCHIVES_PATH")
+      .map(_.split(",").toSeq)
+      .getOrElse {
+        val pyLibPath = Seq(sys.env("SPARK_HOME"), "python", "lib").mkString(File.separator)
+        val pyArchivesFile = new File(pyLibPath, "pyspark.zip")
+        require(pyArchivesFile.exists(),
+          "pyspark.zip not found; cannot run pyspark application in YARN mode.")
+        val py4jFile = new File(pyLibPath, "py4j-0.8.2.1-src.zip")
+        require(py4jFile.exists(),
+          "py4j-0.8.2.1-src.zip not found; cannot run pyspark application in YARN mode.")
+        Seq(pyArchivesFile.getAbsolutePath(), py4jFile.getAbsolutePath())
+      }
+  }
+
 }
 
 object Client extends Logging {
@@ -907,8 +978,14 @@ object Client extends Logging {
   // Distribution-defined classpath to add to processes
   val ENV_DIST_CLASSPATH = "SPARK_DIST_CLASSPATH"
 
-  // Subdirectory where the user's hadoop config files will be placed.
-  val LOCALIZED_HADOOP_CONF_DIR = "__hadoop_conf__"
+  // Subdirectory where the user's Spark and Hadoop config files will be placed.
+  val LOCALIZED_CONF_DIR = "__spark_conf__"
+
+  // Name of the file in the conf archive containing Spark configuration.
+  val SPARK_CONF_FILE = "__spark_conf__.properties"
+
+  // Subdirectory where the user's python files (not archives) will be placed.
+  val LOCALIZED_PYTHON_DIR = "__pyfiles__"
 
   /**
    * Find the user-defined Spark jar if configured, or return the jar containing this
@@ -1033,7 +1110,7 @@ object Client extends Logging {
     if (isAM) {
       addClasspathEntry(
         YarnSparkHadoopUtil.expandEnvironment(Environment.PWD) + Path.SEPARATOR +
-          LOCALIZED_HADOOP_CONF_DIR, env)
+          LOCALIZED_CONF_DIR, env)
     }
 
     if (sparkConf.getBoolean("spark.yarn.user.classpath.first", false)) {
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
index 9c7b1b3988082..35e990602a6cf 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
@@ -30,7 +30,7 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
   var archives: String = null
   var userJar: String = null
   var userClass: String = null
-  var pyFiles: String = null
+  var pyFiles: Seq[String] = Nil
   var primaryPyFile: String = null
   var primaryRFile: String = null
   var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]()
@@ -228,7 +228,7 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
           args = tail
 
         case ("--py-files") :: value :: tail =>
-          pyFiles = value
+          pyFiles = value.split(",")
           args = tail
 
         case ("--files") :: value :: tail =>
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
index 99c05329b4d73..1c8d7ec57635f 100644
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
+++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
@@ -76,7 +76,8 @@ private[spark] class YarnClientSchedulerBackend(
         ("--executor-memory", "SPARK_EXECUTOR_MEMORY", "spark.executor.memory"),
         ("--executor-cores", "SPARK_WORKER_CORES", "spark.executor.cores"),
         ("--executor-cores", "SPARK_EXECUTOR_CORES", "spark.executor.cores"),
-        ("--queue", "SPARK_YARN_QUEUE", "spark.yarn.queue")
+        ("--queue", "SPARK_YARN_QUEUE", "spark.yarn.queue"),
+        ("--py-files", null, "spark.submit.pyFiles")
       )
     // Warn against the following deprecated environment variables: env var -> suggestion
     val deprecatedEnvVars = Map(
@@ -86,7 +87,7 @@ private[spark] class YarnClientSchedulerBackend(
     optionTuples.foreach { case (optionName, envVar, sparkProp) =>
       if (sc.getConf.contains(sparkProp)) {
         extraArgs += (optionName, sc.getConf.get(sparkProp))
-      } else if (System.getenv(envVar) != null) {
+      } else if (envVar != null && System.getenv(envVar) != null) {
         extraArgs += (optionName, System.getenv(envVar))
         if (deprecatedEnvVars.contains(envVar)) {
           logWarning(s"NOTE: $envVar is deprecated. Use ${deprecatedEnvVars(envVar)} instead.")
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
index 01d33c9ce9297..4ec976aa31387 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
@@ -113,7 +113,7 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll {
         Environment.PWD.$()
       }
     cp should contain(pwdVar)
-    cp should contain (s"$pwdVar${Path.SEPARATOR}${Client.LOCALIZED_HADOOP_CONF_DIR}")
+    cp should contain (s"$pwdVar${Path.SEPARATOR}${Client.LOCALIZED_CONF_DIR}")
     cp should not contain (Client.SPARK_JAR)
     cp should not contain (Client.APP_JAR)
   }
@@ -129,7 +129,7 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll {
 
     val tempDir = Utils.createTempDir()
     try {
-      client.prepareLocalResources(tempDir.getAbsolutePath())
+      client.prepareLocalResources(tempDir.getAbsolutePath(), Nil)
       sparkConf.getOption(Client.CONF_SPARK_USER_JAR) should be (Some(USER))
 
       // The non-local path should be propagated by name only, since it will end up in the app's
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
index 93d587d0cb36a..a0f25ba450068 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
@@ -56,6 +56,7 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher
     """.stripMargin
 
   private val TEST_PYFILE = """
+    |import mod1, mod2
     |import sys
     |from operator import add
     |
@@ -67,7 +68,7 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher
     |    sc = SparkContext(conf=SparkConf())
     |    status = open(sys.argv[1],'w')
     |    result = "failure"
-    |    rdd = sc.parallelize(range(10))
+    |    rdd = sc.parallelize(range(10)).map(lambda x: x * mod1.func() * mod2.func())
     |    cnt = rdd.count()
     |    if cnt == 10:
     |        result = "success"
@@ -76,6 +77,11 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher
     |    sc.stop()
     """.stripMargin
 
+  private val TEST_PYMODULE = """
+    |def func():
+    |    return 42
+    """.stripMargin
+
   private var yarnCluster: MiniYARNCluster = _
   private var tempDir: File = _
   private var fakeSparkJar: File = _
@@ -124,7 +130,7 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher
     logInfo(s"RM address in configuration is ${config.get(YarnConfiguration.RM_ADDRESS)}")
 
     fakeSparkJar = File.createTempFile("sparkJar", null, tempDir)
-    hadoopConfDir = new File(tempDir, Client.LOCALIZED_HADOOP_CONF_DIR)
+    hadoopConfDir = new File(tempDir, Client.LOCALIZED_CONF_DIR)
     assert(hadoopConfDir.mkdir())
     File.createTempFile("token", ".txt", hadoopConfDir)
   }
@@ -151,26 +157,12 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher
     }
   }
 
-  // Enable this once fix SPARK-6700
-  test("run Python application in yarn-cluster mode") {
-    val primaryPyFile = new File(tempDir, "test.py")
-    Files.write(TEST_PYFILE, primaryPyFile, UTF_8)
-    val pyFile = new File(tempDir, "test2.py")
-    Files.write(TEST_PYFILE, pyFile, UTF_8)
-    var result = File.createTempFile("result", null, tempDir)
+  test("run Python application in yarn-client mode") {
+    testPySpark(true)
+  }
 
-    // The sbt assembly does not include pyspark / py4j python dependencies, so we need to
-    // propagate SPARK_HOME so that those are added to PYTHONPATH. See PythonUtils.scala.
-    val sparkHome = sys.props("spark.test.home")
-    val extraConf = Map(
-      "spark.executorEnv.SPARK_HOME" -> sparkHome,
-      "spark.yarn.appMasterEnv.SPARK_HOME" -> sparkHome)
-
-    runSpark(false, primaryPyFile.getAbsolutePath(),
-      sparkArgs = Seq("--py-files", pyFile.getAbsolutePath()),
-      appArgs = Seq(result.getAbsolutePath()),
-      extraConf = extraConf)
-    checkResult(result)
+  test("run Python application in yarn-cluster mode") {
+    testPySpark(false)
   }
 
   test("user class path first in client mode") {
@@ -188,6 +180,33 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher
     checkResult(result)
   }
 
+  private def testPySpark(clientMode: Boolean): Unit = {
+    val primaryPyFile = new File(tempDir, "test.py")
+    Files.write(TEST_PYFILE, primaryPyFile, UTF_8)
+
+    val moduleDir =
+      if (clientMode) {
+        // In client-mode, .py files added with --py-files are not visible in the driver.
+        // This is something that the launcher library would have to handle.
+        tempDir
+      } else {
+        val subdir = new File(tempDir, "pyModules")
+        subdir.mkdir()
+        subdir
+      }
+    val pyModule = new File(moduleDir, "mod1.py")
+    Files.write(TEST_PYMODULE, pyModule, UTF_8)
+
+    val mod2Archive = TestUtils.createJarWithFiles(Map("mod2.py" -> TEST_PYMODULE), moduleDir)
+    val pyFiles = Seq(pyModule.getAbsolutePath(), mod2Archive.getPath()).mkString(",")
+    val result = File.createTempFile("result", null, tempDir)
+
+    runSpark(clientMode, primaryPyFile.getAbsolutePath(),
+      sparkArgs = Seq("--py-files", pyFiles),
+      appArgs = Seq(result.getAbsolutePath()))
+    checkResult(result)
+  }
+
   private def testUseClassPathFirst(clientMode: Boolean): Unit = {
     // Create a jar file that contains a different version of "test.resource".
     val originalJar = TestUtils.createJarWithFiles(Map("test.resource" -> "ORIGINAL"), tempDir)

From 30ebf1a233295539c2455bd838bae7315711e1e2 Mon Sep 17 00:00:00 2001
From: Hossein <hossein@databricks.com>
Date: Wed, 10 Jun 2015 13:18:48 -0700
Subject: [PATCH 436/525] [SPARK-8282] [SPARKR] Make number of threads used in
 RBackend configurable

Read number of threads for RBackend from configuration.

[SPARK-8282] #comment Linking with JIRA

Author: Hossein <hossein@databricks.com>

Closes #6730 from falaki/SPARK-8282 and squashes the following commits:

33b3d98 [Hossein] Documented new config parameter
70f2a9c [Hossein] Fixing import
ec44225 [Hossein] Read number of threads for RBackend from configuration
---
 .../main/scala/org/apache/spark/api/r/RBackend.scala |  5 +++--
 docs/configuration.md                                | 12 ++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
index d24c650d37bb0..1a5f2bca26c2b 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
@@ -29,7 +29,7 @@ import io.netty.channel.socket.nio.NioServerSocketChannel
 import io.netty.handler.codec.LengthFieldBasedFrameDecoder
 import io.netty.handler.codec.bytes.{ByteArrayDecoder, ByteArrayEncoder}
 
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SparkConf}
 
 /**
  * Netty-based backend server that is used to communicate between R and Java.
@@ -41,7 +41,8 @@ private[spark] class RBackend {
   private[this] var bossGroup: EventLoopGroup = null
 
   def init(): Int = {
-    bossGroup = new NioEventLoopGroup(2)
+    val conf = new SparkConf()
+    bossGroup = new NioEventLoopGroup(conf.getInt("spark.r.numRBackendThreads", 2))
     val workerGroup = bossGroup
     val handler = new RBackendHandler(this)
 
diff --git a/docs/configuration.md b/docs/configuration.md
index 3960e7e78bde1..95a322f79b40b 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1495,6 +1495,18 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 </table>
 
+#### SparkR
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+  <td><code>spark.r.numRBackendThreads</code></td>
+  <td>2</td>
+  <td>
+    Number of threads used by RBackend to handle RPC calls from SparkR package.
+  </td>
+</tr>
+</table>
+
 #### Cluster Managers
 Each cluster manager in Spark has additional configuration options. Configurations
 can be found on the pages for each mode:

From 19e30b48f3c6d0b72871d3e15b9564c1b2822700 Mon Sep 17 00:00:00 2001
From: Adam Roberts <aroberts@uk.ibm.com>
Date: Wed, 10 Jun 2015 13:21:01 -0700
Subject: [PATCH 437/525] [SPARK-7756] CORE RDDOperationScope fix for IBM Java

IBM Java has an extra method when we do getStackTrace(): this is "getStackTraceImpl", a native method. This causes two tests to fail within "DStreamScopeSuite" when running with IBM Java. Instead of "map" or "filter" being the method names found, "getStackTrace" is returned. This commit addresses such an issue by using dropWhile. Given that our current method is withScope, we look for the next method that isn't ours: we don't care about methods that come before us in the stack trace: e.g. getStackTrace (regardless of how many levels this might go).

IBM:
java.lang.Thread.getStackTraceImpl(Native Method)
java.lang.Thread.getStackTrace(Thread.java:1117)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:104)

Oracle:
PRINTING STACKTRACE!!!
java.lang.Thread.getStackTrace(Thread.java:1552)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:106)

I've tested this with Oracle and IBM Java, no side effects for other tests introduced.

Author: Adam Roberts <aroberts@uk.ibm.com>
Author: a-roberts <aroberts@uk.ibm.com>

Closes #6740 from a-roberts/RDDScopeStackCrawlFix and squashes the following commits:

13ce390 [Adam Roberts] Ensure consistency with String equality checking
a4fc0e0 [a-roberts] Update RDDOperationScope.scala
---
 .../scala/org/apache/spark/rdd/RDDOperationScope.scala     | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala
index 6b09dfafc889c..44667281c1063 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala
@@ -95,10 +95,9 @@ private[spark] object RDDOperationScope extends Logging {
   private[spark] def withScope[T](
       sc: SparkContext,
       allowNesting: Boolean = false)(body: => T): T = {
-    val stackTrace = Thread.currentThread.getStackTrace().tail // ignore "Thread#getStackTrace"
-    val ourMethodName = stackTrace(1).getMethodName // i.e. withScope
-    // Climb upwards to find the first method that's called something different
-    val callerMethodName = stackTrace
+    val ourMethodName = "withScope"
+    val callerMethodName = Thread.currentThread.getStackTrace()
+      .dropWhile(_.getMethodName != ourMethodName)
       .find(_.getMethodName != ourMethodName)
       .map(_.getMethodName)
       .getOrElse {

From e90c9d92d9a86e9960c10a5c043f3c02f6c636f9 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 10 Jun 2015 13:22:52 -0700
Subject: [PATCH 438/525] [SPARK-7527] [CORE] Fix createNullValue to return the
 correct null values and REPL mode detection

The root cause of SPARK-7527 is `createNullValue` returns an incompatible value `Byte(0)` for `char` and `boolean`.

This PR fixes it and corrects the class name of the main class, and also adds an unit test to demonstrate it.

Author: zsxwing <zsxwing@gmail.com>

Closes #6735 from zsxwing/SPARK-7527 and squashes the following commits:

bbdb271 [zsxwing] Use pattern match in createNullValue
b0a0e7e [zsxwing] Remove the noisy in the test output
903e269 [zsxwing] Remove the code for Utils.isInInterpreter == false
5f92dc1 [zsxwing] Fix createNullValue to return the correct null values and REPL mode detection
---
 .../apache/spark/util/ClosureCleaner.scala    | 40 ++++++++---------
 .../scala/org/apache/spark/util/Utils.scala   |  9 +---
 .../spark/util/ClosureCleanerSuite.scala      | 44 +++++++++++++++++++
 3 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
index 6f2966bd4fd31..305de4c75539d 100644
--- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
+++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
@@ -109,7 +109,14 @@ private[spark] object ClosureCleaner extends Logging {
 
   private def createNullValue(cls: Class[_]): AnyRef = {
     if (cls.isPrimitive) {
-      new java.lang.Byte(0: Byte) // Should be convertible to any primitive type
+      cls match {
+        case java.lang.Boolean.TYPE => new java.lang.Boolean(false)
+        case java.lang.Character.TYPE => new java.lang.Character('\0')
+        case java.lang.Void.TYPE =>
+          // This should not happen because `Foo(void x) {}` does not compile.
+          throw new IllegalStateException("Unexpected void parameter in constructor")
+        case _ => new java.lang.Byte(0: Byte)
+      }
     } else {
       null
     }
@@ -319,28 +326,17 @@ private[spark] object ClosureCleaner extends Logging {
   private def instantiateClass(
       cls: Class[_],
       enclosingObject: AnyRef): AnyRef = {
-    if (!Utils.isInInterpreter) {
-      // This is a bona fide closure class, whose constructor has no effects
-      // other than to set its fields, so use its constructor
-      val cons = cls.getConstructors()(0)
-      val params = cons.getParameterTypes.map(createNullValue).toArray
-      if (enclosingObject != null) {
-        params(0) = enclosingObject // First param is always enclosing object
-      }
-      return cons.newInstance(params: _*).asInstanceOf[AnyRef]
-    } else {
-      // Use reflection to instantiate object without calling constructor
-      val rf = sun.reflect.ReflectionFactory.getReflectionFactory()
-      val parentCtor = classOf[java.lang.Object].getDeclaredConstructor()
-      val newCtor = rf.newConstructorForSerialization(cls, parentCtor)
-      val obj = newCtor.newInstance().asInstanceOf[AnyRef]
-      if (enclosingObject != null) {
-        val field = cls.getDeclaredField("$outer")
-        field.setAccessible(true)
-        field.set(obj, enclosingObject)
-      }
-      obj
+    // Use reflection to instantiate object without calling constructor
+    val rf = sun.reflect.ReflectionFactory.getReflectionFactory()
+    val parentCtor = classOf[java.lang.Object].getDeclaredConstructor()
+    val newCtor = rf.newConstructorForSerialization(cls, parentCtor)
+    val obj = newCtor.newInstance().asInstanceOf[AnyRef]
+    if (enclosingObject != null) {
+      val field = cls.getDeclaredField("$outer")
+      field.setAccessible(true)
+      field.set(obj, enclosingObject)
     }
+    obj
   }
 }
 
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 153ece6224a6d..19157af5b6f4d 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -1804,15 +1804,10 @@ private[spark] object Utils extends Logging {
 
   lazy val isInInterpreter: Boolean = {
     try {
-      val interpClass = classForName("spark.repl.Main")
+      val interpClass = classForName("org.apache.spark.repl.Main")
       interpClass.getMethod("interp").invoke(null) != null
     } catch {
-      // Returning true seems to be a mistake.
-      // Currently changing it to false causes tests failures in Streaming.
-      // For a more detailed discussion, please, refer to
-      // https://github.com/apache/spark/pull/5835#issuecomment-101042271 and subsequent comments.
-      // Addressing this changed is tracked as https://issues.apache.org/jira/browse/SPARK-7527
-      case _: ClassNotFoundException => true
+      case _: ClassNotFoundException => false
     }
   }
 
diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
index 70cd27b04347d..1053c6caf7718 100644
--- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
@@ -121,6 +121,10 @@ class ClosureCleanerSuite extends SparkFunSuite {
       expectCorrectException { TestUserClosuresActuallyCleaned.testSubmitJob(sc) }
     }
   }
+
+  test("createNullValue") {
+    new TestCreateNullValue().run()
+  }
 }
 
 // A non-serializable class we create in closures to make sure that we aren't
@@ -350,3 +354,43 @@ private object TestUserClosuresActuallyCleaned {
     )
   }
 }
+
+class TestCreateNullValue {
+
+  var x = 5
+
+  def getX: Int = x
+
+  def run(): Unit = {
+    val bo: Boolean = true
+    val c: Char = '1'
+    val b: Byte = 1
+    val s: Short = 1
+    val i: Int = 1
+    val l: Long = 1
+    val f: Float = 1
+    val d: Double = 1
+
+    // Bring in all primitive types into the closure such that they become
+    // parameters of the closure constructor. This allows us to test whether
+    // null values are created correctly for each type.
+    val nestedClosure = () => {
+      if (s.toString == "123") { // Don't really output them to avoid noisy
+        println(bo)
+        println(c)
+        println(b)
+        println(s)
+        println(i)
+        println(l)
+        println(f)
+        println(d)
+      }
+
+      val closure = () => {
+        println(getX)
+      }
+      ClosureCleaner.clean(closure)
+    }
+    nestedClosure()
+  }
+}

From 80043e9e761c44ce2c3a432dcd1989be573f8bb4 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 10 Jun 2015 13:25:59 -0700
Subject: [PATCH 439/525] [SPARK-7261] [CORE] Change default log level to WARN
 in the REPL

1. Add `log4j-defaults-repl.properties` that has log level WARN.
2. When logging is initialized, check whether inside the REPL. If so, use `log4j-defaults-repl.properties`.
3. Print the following information if using `log4j-defaults-repl.properties`:
```
Using Spark's repl log4j profile: org/apache/spark/log4j-defaults-repl.properties
To adjust logging level use sc.setLogLevel("INFO")
```

Author: zsxwing <zsxwing@gmail.com>

Closes #6734 from zsxwing/log4j-repl and squashes the following commits:

3835eff [zsxwing] Change default log level to WARN in the REPL
---
 .rat-excludes                                 |  1 +
 .../spark/log4j-defaults-repl.properties      | 12 +++++++++
 .../main/scala/org/apache/spark/Logging.scala | 26 ++++++++++++++-----
 3 files changed, 32 insertions(+), 7 deletions(-)
 create mode 100644 core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties

diff --git a/.rat-excludes b/.rat-excludes
index 994c7e86f8a91..aa008e6e920f5 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -28,6 +28,7 @@ spark-env.sh
 spark-env.cmd
 spark-env.sh.template
 log4j-defaults.properties
+log4j-defaults-repl.properties
 bootstrap-tooltip.js
 jquery-1.11.1.min.js
 d3.min.js
diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties b/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties
new file mode 100644
index 0000000000000..b146f8a784127
--- /dev/null
+++ b/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties
@@ -0,0 +1,12 @@
+# Set everything to be logged to the console
+log4j.rootCategory=WARN, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark-project.jetty=WARN
+log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
diff --git a/core/src/main/scala/org/apache/spark/Logging.scala b/core/src/main/scala/org/apache/spark/Logging.scala
index 419d093d55643..7fcb7830e7b0b 100644
--- a/core/src/main/scala/org/apache/spark/Logging.scala
+++ b/core/src/main/scala/org/apache/spark/Logging.scala
@@ -121,13 +121,25 @@ trait Logging {
     if (usingLog4j12) {
       val log4j12Initialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements
       if (!log4j12Initialized) {
-        val defaultLogProps = "org/apache/spark/log4j-defaults.properties"
-        Option(Utils.getSparkClassLoader.getResource(defaultLogProps)) match {
-          case Some(url) =>
-            PropertyConfigurator.configure(url)
-            System.err.println(s"Using Spark's default log4j profile: $defaultLogProps")
-          case None =>
-            System.err.println(s"Spark was unable to load $defaultLogProps")
+        if (Utils.isInInterpreter) {
+          val replDefaultLogProps = "org/apache/spark/log4j-defaults-repl.properties"
+          Option(Utils.getSparkClassLoader.getResource(replDefaultLogProps)) match {
+            case Some(url) =>
+              PropertyConfigurator.configure(url)
+              System.err.println(s"Using Spark's repl log4j profile: $replDefaultLogProps")
+              System.err.println("To adjust logging level use sc.setLogLevel(\"INFO\")")
+            case None =>
+              System.err.println(s"Spark was unable to load $replDefaultLogProps")
+          }
+        } else {
+          val defaultLogProps = "org/apache/spark/log4j-defaults.properties"
+          Option(Utils.getSparkClassLoader.getResource(defaultLogProps)) match {
+            case Some(url) =>
+              PropertyConfigurator.configure(url)
+              System.err.println(s"Using Spark's default log4j profile: $defaultLogProps")
+            case None =>
+              System.err.println(s"Spark was unable to load $defaultLogProps")
+          }
         }
       }
     }

From cb871c44c38a4c1575ed076389f14641afafad7d Mon Sep 17 00:00:00 2001
From: WangTaoTheTonic <wangtao111@huawei.com>
Date: Wed, 10 Jun 2015 13:30:16 -0700
Subject: [PATCH 440/525] [SPARK-8290] spark class command builder need read
 SPARK_JAVA_OPTS and SPARK_DRIVER_MEMORY properly

SPARK_JAVA_OPTS was missed in reconstructing the launcher part, we should add it back so process launched by spark-class could read it properly. And so does `SPARK_DRIVER_MEMORY`.

The missing part is [here](https://github.com/apache/spark/blob/1c30afdf94b27e1ad65df0735575306e65d148a1/bin/spark-class#L97).

Author: WangTaoTheTonic <wangtao111@huawei.com>
Author: Tao Wang <wangtao111@huawei.com>

Closes #6741 from WangTaoTheTonic/SPARK-8290 and squashes the following commits:

bd89f0f [Tao Wang] make sure the memory setting is right too
e313520 [WangTaoTheTonic] spark class command builder need read SPARK_JAVA_OPTS
---
 .../org/apache/spark/launcher/SparkClassCommandBuilder.java    | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkClassCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/SparkClassCommandBuilder.java
index d80abf2a8676e..de85720febf23 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/SparkClassCommandBuilder.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/SparkClassCommandBuilder.java
@@ -93,6 +93,9 @@ public List<String> buildCommand(Map<String, String> env) throws IOException {
         toolsDir.getAbsolutePath(), className);
 
       javaOptsKeys.add("SPARK_JAVA_OPTS");
+    } else {
+      javaOptsKeys.add("SPARK_JAVA_OPTS");
+      memKey = "SPARK_DRIVER_MEMORY";
     }
 
     List<String> cmd = buildJavaCommand(extraClassPath);

From 5014d0ed7e2f69810654003f8dd38078b945cf05 Mon Sep 17 00:00:00 2001
From: WangTaoTheTonic <wangtao111@huawei.com>
Date: Wed, 10 Jun 2015 13:34:19 -0700
Subject: [PATCH 441/525] [SPARK-8273] Driver hangs up when yarn shutdown in
 client mode

In client mode, if yarn was shut down with spark application running, the application will hang up after several retries(default: 30) because the exception throwed by YarnClientImpl could not be caught by upper level, we should exit in case that user can not be aware that.

The exception we wanna catch is [here](https://github.com/apache/hadoop/blob/branch-2.7.0/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java#L122), and I try to fix it refer to [MR](https://github.com/apache/hadoop/blob/branch-2.7.0/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/ClientServiceDelegate.java#L320).

Author: WangTaoTheTonic <wangtao111@huawei.com>

Closes #6717 from WangTaoTheTonic/SPARK-8273 and squashes the following commits:

28752d6 [WangTaoTheTonic] catch the throwed exception
---
 yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index ec9402afff329..da1ec2a0fe2e9 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -29,6 +29,7 @@ import scala.collection.JavaConversions._
 import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, ListBuffer, Map}
 import scala.reflect.runtime.universe
 import scala.util.{Try, Success, Failure}
+import scala.util.control.NonFatal
 
 import com.google.common.base.Charsets.UTF_8
 import com.google.common.base.Objects
@@ -826,6 +827,9 @@ private[spark] class Client(
           case e: ApplicationNotFoundException =>
             logError(s"Application $appId not found.")
             return (YarnApplicationState.KILLED, FinalApplicationStatus.KILLED)
+          case NonFatal(e) =>
+            logError(s"Failed to contact YARN for application $appId.", e)
+            return (YarnApplicationState.FAILED, FinalApplicationStatus.FAILED)
         }
       val state = report.getYarnApplicationState
 

From 96a7c888d806adfdb2c722025a1079ed7eaa2052 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Wed, 10 Jun 2015 15:03:40 -0700
Subject: [PATCH 442/525] [SPARK-2774] Set preferred locations for reduce tasks

Set preferred locations for reduce tasks.
The basic design is that we maintain a map from reducerId to a list of (sizes, locations) for each
shuffle. We then set the preferred locations to be any machines that have 20% of more of the output
that needs to be read by the reduce task.  This will result in at most 5 preferred locations for
each reduce task.

Selecting the preferred locations involves O(# map tasks * # reduce tasks) computation, so we
restrict this feature to cases where we have fewer than 1000 map tasks and 1000 reduce tasks.

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6652 from shivaram/reduce-locations and squashes the following commits:

492e25e [Shivaram Venkataraman] Remove unused import
2ef2d39 [Shivaram Venkataraman] Address code review comments
897a914 [Shivaram Venkataraman] Remove unused hash map
f5be578 [Shivaram Venkataraman] Use fraction of map outputs to determine locations Also removes caching of preferred locations to make the API cleaner
68bc29e [Shivaram Venkataraman] Fix line length
1090b58 [Shivaram Venkataraman] Change flag name
77ce7d8 [Shivaram Venkataraman] Merge branch 'master' of https://github.com/apache/spark into reduce-locations
e5d56bd [Shivaram Venkataraman] Add flag to turn off locality for shuffle deps
6cfae98 [Shivaram Venkataraman] Filter out zero blocks, rename variables
9d5831a [Shivaram Venkataraman] Address some more comments
8e31266 [Shivaram Venkataraman] Fix style
0df3180 [Shivaram Venkataraman] Address code review comments
e7d5449 [Shivaram Venkataraman] Fix merge issues
ad7cb53 [Shivaram Venkataraman] Merge branch 'master' of https://github.com/apache/spark into reduce-locations
df14cee [Shivaram Venkataraman] Merge branch 'master' of https://github.com/apache/spark into reduce-locations
5093aea [Shivaram Venkataraman] Merge branch 'master' of https://github.com/apache/spark into reduce-locations
0171d3c [Shivaram Venkataraman] Merge branch 'master' of https://github.com/apache/spark into reduce-locations
bc4dfd6 [Shivaram Venkataraman] Merge branch 'master' of https://github.com/apache/spark into reduce-locations
774751b [Shivaram Venkataraman] Fix bug introduced by line length adjustment
34d0283 [Shivaram Venkataraman] Fix style issues
3b464b7 [Shivaram Venkataraman] Set preferred locations for reduce tasks This is another attempt at #1697 addressing some of the earlier concerns. This adds a couple of thresholds based on number map and reduce tasks beyond which we don't use preferred locations for reduce tasks.
---
 .../org/apache/spark/MapOutputTracker.scala   | 49 +++++++++++-
 .../apache/spark/scheduler/DAGScheduler.scala | 37 ++++++++-
 .../apache/spark/MapOutputTrackerSuite.scala  | 35 +++++++++
 .../spark/scheduler/DAGSchedulerSuite.scala   | 76 +++++++++++++++----
 4 files changed, 177 insertions(+), 20 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
index 018422827e1c8..862ffe868f58f 100644
--- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
+++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -21,7 +21,7 @@ import java.io._
 import java.util.concurrent.ConcurrentHashMap
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}
 
-import scala.collection.mutable.{HashSet, Map}
+import scala.collection.mutable.{HashMap, HashSet, Map}
 import scala.collection.JavaConversions._
 import scala.reflect.ClassTag
 
@@ -284,6 +284,53 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
     cachedSerializedStatuses.contains(shuffleId) || mapStatuses.contains(shuffleId)
   }
 
+  /**
+   * Return a list of locations that each have fraction of map output greater than the specified
+   * threshold.
+   *
+   * @param shuffleId id of the shuffle
+   * @param reducerId id of the reduce task
+   * @param numReducers total number of reducers in the shuffle
+   * @param fractionThreshold fraction of total map output size that a location must have
+   *                          for it to be considered large.
+   *
+   * This method is not thread-safe.
+   */
+  def getLocationsWithLargestOutputs(
+      shuffleId: Int,
+      reducerId: Int,
+      numReducers: Int,
+      fractionThreshold: Double)
+    : Option[Array[BlockManagerId]] = {
+
+    if (mapStatuses.contains(shuffleId)) {
+      val statuses = mapStatuses(shuffleId)
+      if (statuses.nonEmpty) {
+        // HashMap to add up sizes of all blocks at the same location
+        val locs = new HashMap[BlockManagerId, Long]
+        var totalOutputSize = 0L
+        var mapIdx = 0
+        while (mapIdx < statuses.length) {
+          val status = statuses(mapIdx)
+          val blockSize = status.getSizeForBlock(reducerId)
+          if (blockSize > 0) {
+            locs(status.location) = locs.getOrElse(status.location, 0L) + blockSize
+            totalOutputSize += blockSize
+          }
+          mapIdx = mapIdx + 1
+        }
+        val topLocs = locs.filter { case (loc, size) =>
+          size.toDouble / totalOutputSize >= fractionThreshold
+        }
+        // Return if we have any locations which satisfy the required threshold
+        if (topLocs.nonEmpty) {
+          return Some(topLocs.map(_._1).toArray)
+        }
+      }
+    }
+    None
+  }
+
   def incrementEpoch() {
     epochLock.synchronized {
       epoch += 1
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 75a567fb31520..aea6674ed20be 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -137,6 +137,22 @@ class DAGScheduler(
   private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this)
   taskScheduler.setDAGScheduler(this)
 
+  // Flag to control if reduce tasks are assigned preferred locations
+  private val shuffleLocalityEnabled =
+    sc.getConf.getBoolean("spark.shuffle.reduceLocality.enabled", true)
+  // Number of map, reduce tasks above which we do not assign preferred locations
+  // based on map output sizes. We limit the size of jobs for which assign preferred locations
+  // as computing the top locations by size becomes expensive.
+  private[this] val SHUFFLE_PREF_MAP_THRESHOLD = 1000
+  // NOTE: This should be less than 2000 as we use HighlyCompressedMapStatus beyond that
+  private[this] val SHUFFLE_PREF_REDUCE_THRESHOLD = 1000
+
+  // Fraction of total map output that must be at a location for it to considered as a preferred
+  // location for a reduce task.
+  // Making this larger will focus on fewer locations where most data can be read locally, but
+  // may lead to more delay in scheduling if those locations are busy.
+  private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2
+
   // Called by TaskScheduler to report task's starting.
   def taskStarted(task: Task[_], taskInfo: TaskInfo) {
     eventProcessLoop.post(BeginEvent(task, taskInfo))
@@ -1384,17 +1400,32 @@ class DAGScheduler(
     if (rddPrefs.nonEmpty) {
       return rddPrefs.map(TaskLocation(_))
     }
-    // If the RDD has narrow dependencies, pick the first partition of the first narrow dep
-    // that has any placement preferences. Ideally we would choose based on transfer sizes,
-    // but this will do for now.
+
     rdd.dependencies.foreach {
       case n: NarrowDependency[_] =>
+        // If the RDD has narrow dependencies, pick the first partition of the first narrow dep
+        // that has any placement preferences. Ideally we would choose based on transfer sizes,
+        // but this will do for now.
         for (inPart <- n.getParents(partition)) {
           val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
           if (locs != Nil) {
             return locs
           }
         }
+      case s: ShuffleDependency[_, _, _] =>
+        // For shuffle dependencies, pick locations which have at least REDUCER_PREF_LOCS_FRACTION
+        // of data as preferred locations
+        if (shuffleLocalityEnabled &&
+            rdd.partitions.size < SHUFFLE_PREF_REDUCE_THRESHOLD &&
+            s.rdd.partitions.size < SHUFFLE_PREF_MAP_THRESHOLD) {
+          // Get the preferred map output locations for this reducer
+          val topLocsForReducer = mapOutputTracker.getLocationsWithLargestOutputs(s.shuffleId,
+            partition, rdd.partitions.size, REDUCER_PREF_LOCS_FRACTION)
+          if (topLocsForReducer.nonEmpty) {
+            return topLocsForReducer.get.map(loc => TaskLocation(loc.host, loc.executorId))
+          }
+        }
+
       case _ =>
     }
     Nil
diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
index 1fab69678d040..7a1961137cce5 100644
--- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
@@ -205,4 +205,39 @@ class MapOutputTrackerSuite extends SparkFunSuite {
 //    masterTracker.stop() // this throws an exception
     rpcEnv.shutdown()
   }
+
+  test("getLocationsWithLargestOutputs with multiple outputs in same machine") {
+    val rpcEnv = createRpcEnv("test")
+    val tracker = new MapOutputTrackerMaster(conf)
+    tracker.trackerEndpoint = rpcEnv.setupEndpoint(MapOutputTracker.ENDPOINT_NAME,
+      new MapOutputTrackerMasterEndpoint(rpcEnv, tracker, conf))
+    // Setup 3 map tasks
+    // on hostA with output size 2
+    // on hostA with output size 2
+    // on hostB with output size 3
+    tracker.registerShuffle(10, 3)
+    tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000),
+        Array(2L)))
+    tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("a", "hostA", 1000),
+        Array(2L)))
+    tracker.registerMapOutput(10, 2, MapStatus(BlockManagerId("b", "hostB", 1000),
+        Array(3L)))
+
+    // When the threshold is 50%, only host A should be returned as a preferred location
+    // as it has 4 out of 7 bytes of output.
+    val topLocs50 = tracker.getLocationsWithLargestOutputs(10, 0, 1, 0.5)
+    assert(topLocs50.nonEmpty)
+    assert(topLocs50.get.size === 1)
+    assert(topLocs50.get.head === BlockManagerId("a", "hostA", 1000))
+
+    // When the threshold is 20%, both hosts should be returned as preferred locations.
+    val topLocs20 = tracker.getLocationsWithLargestOutputs(10, 0, 1, 0.2)
+    assert(topLocs20.nonEmpty)
+    assert(topLocs20.get.size === 2)
+    assert(topLocs20.get.toSet ===
+           Seq(BlockManagerId("a", "hostA", 1000), BlockManagerId("b", "hostB", 1000)).toSet)
+
+    tracker.stop()
+    rpcEnv.shutdown()
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 47b2868753c0e..833b600746e90 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -490,8 +490,8 @@ class DAGSchedulerSuite
     val reduceRdd = new MyRDD(sc, 2, List(shuffleDep))
     submit(reduceRdd, Array(0, 1))
     complete(taskSets(0), Seq(
-        (Success, makeMapStatus("hostA", 1)),
-        (Success, makeMapStatus("hostB", 1))))
+        (Success, makeMapStatus("hostA", reduceRdd.partitions.size)),
+        (Success, makeMapStatus("hostB", reduceRdd.partitions.size))))
     // the 2nd ResultTask failed
     complete(taskSets(1), Seq(
         (Success, 42),
@@ -501,7 +501,7 @@ class DAGSchedulerSuite
     // ask the scheduler to try it again
     scheduler.resubmitFailedStages()
     // have the 2nd attempt pass
-    complete(taskSets(2), Seq((Success, makeMapStatus("hostA", 1))))
+    complete(taskSets(2), Seq((Success, makeMapStatus("hostA", reduceRdd.partitions.size))))
     // we can see both result blocks now
     assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1.host) ===
       Array("hostA", "hostB"))
@@ -517,8 +517,8 @@ class DAGSchedulerSuite
     val reduceRdd = new MyRDD(sc, 2, List(shuffleDep))
     submit(reduceRdd, Array(0, 1))
     complete(taskSets(0), Seq(
-      (Success, makeMapStatus("hostA", 1)),
-      (Success, makeMapStatus("hostB", 1))))
+      (Success, makeMapStatus("hostA", reduceRdd.partitions.size)),
+      (Success, makeMapStatus("hostB", reduceRdd.partitions.size))))
     // The MapOutputTracker should know about both map output locations.
     assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1.host) ===
       Array("hostA", "hostB"))
@@ -560,18 +560,18 @@ class DAGSchedulerSuite
     assert(newEpoch > oldEpoch)
     val taskSet = taskSets(0)
     // should be ignored for being too old
-    runEvent(CompletionEvent(
-      taskSet.tasks(0), Success, makeMapStatus("hostA", 1), null, createFakeTaskInfo(), null))
+    runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostA",
+      reduceRdd.partitions.size), null, createFakeTaskInfo(), null))
     // should work because it's a non-failed host
-    runEvent(CompletionEvent(
-      taskSet.tasks(0), Success, makeMapStatus("hostB", 1), null, createFakeTaskInfo(), null))
+    runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostB",
+      reduceRdd.partitions.size), null, createFakeTaskInfo(), null))
     // should be ignored for being too old
-    runEvent(CompletionEvent(
-      taskSet.tasks(0), Success, makeMapStatus("hostA", 1), null, createFakeTaskInfo(), null))
+    runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostA",
+      reduceRdd.partitions.size), null, createFakeTaskInfo(), null))
     // should work because it's a new epoch
     taskSet.tasks(1).epoch = newEpoch
-    runEvent(CompletionEvent(
-      taskSet.tasks(1), Success, makeMapStatus("hostA", 1), null, createFakeTaskInfo(), null))
+    runEvent(CompletionEvent(taskSet.tasks(1), Success, makeMapStatus("hostA",
+      reduceRdd.partitions.size), null, createFakeTaskInfo(), null))
     assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
            Array(makeBlockManagerId("hostB"), makeBlockManagerId("hostA")))
     complete(taskSets(1), Seq((Success, 42), (Success, 43)))
@@ -800,6 +800,50 @@ class DAGSchedulerSuite
     assertDataStructuresEmpty()
   }
 
+  test("reduce tasks should be placed locally with map output") {
+    // Create an shuffleMapRdd with 1 partition
+    val shuffleMapRdd = new MyRDD(sc, 1, Nil)
+    val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
+    val shuffleId = shuffleDep.shuffleId
+    val reduceRdd = new MyRDD(sc, 1, List(shuffleDep))
+    submit(reduceRdd, Array(0))
+    complete(taskSets(0), Seq(
+        (Success, makeMapStatus("hostA", 1))))
+    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
+           Array(makeBlockManagerId("hostA")))
+
+    // Reducer should run on the same host that map task ran
+    val reduceTaskSet = taskSets(1)
+    assertLocations(reduceTaskSet, Seq(Seq("hostA")))
+    complete(reduceTaskSet, Seq((Success, 42)))
+    assert(results === Map(0 -> 42))
+    assertDataStructuresEmpty
+  }
+
+  test("reduce task locality preferences should only include machines with largest map outputs") {
+    val numMapTasks = 4
+    // Create an shuffleMapRdd with more partitions
+    val shuffleMapRdd = new MyRDD(sc, numMapTasks, Nil)
+    val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
+    val shuffleId = shuffleDep.shuffleId
+    val reduceRdd = new MyRDD(sc, 1, List(shuffleDep))
+    submit(reduceRdd, Array(0))
+
+    val statuses = (1 to numMapTasks).map { i =>
+      (Success, makeMapStatus("host" + i, 1, (10*i).toByte))
+    }
+    complete(taskSets(0), statuses)
+
+    // Reducer should prefer the last 3 hosts as they have 20%, 30% and 40% of data
+    val hosts = (1 to numMapTasks).map(i => "host" + i).reverse.take(numMapTasks - 1)
+
+    val reduceTaskSet = taskSets(1)
+    assertLocations(reduceTaskSet, Seq(hosts))
+    complete(reduceTaskSet, Seq((Success, 42)))
+    assert(results === Map(0 -> 42))
+    assertDataStructuresEmpty
+  }
+
   /**
    * Assert that the supplied TaskSet has exactly the given hosts as its preferred locations.
    * Note that this checks only the host and not the executor ID.
@@ -807,12 +851,12 @@ class DAGSchedulerSuite
   private def assertLocations(taskSet: TaskSet, hosts: Seq[Seq[String]]) {
     assert(hosts.size === taskSet.tasks.size)
     for ((taskLocs, expectedLocs) <- taskSet.tasks.map(_.preferredLocations).zip(hosts)) {
-      assert(taskLocs.map(_.host) === expectedLocs)
+      assert(taskLocs.map(_.host).toSet === expectedLocs.toSet)
     }
   }
 
-  private def makeMapStatus(host: String, reduces: Int): MapStatus =
-    MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(2))
+  private def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus =
+    MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes))
 
   private def makeBlockManagerId(host: String): BlockManagerId =
     BlockManagerId("exec-" + host, host, 12345)

From b928f543845ddd39e914a0e8f0b0205fd86100c5 Mon Sep 17 00:00:00 2001
From: Paavo <pparkkin@gmail.com>
Date: Wed, 10 Jun 2015 23:17:42 +0100
Subject: [PATCH 443/525] [SPARK-8200] [MLLIB] Check for empty RDDs in
 StreamingLinearAlgorithm

Test cases for both StreamingLinearRegression and StreamingLogisticRegression, and code fix.

Edit:
This contribution is my original work and I license the work to the project under the project's open source license.

Author: Paavo <pparkkin@gmail.com>

Closes #6713 from pparkkin/streamingmodel-empty-rdd and squashes the following commits:

ff5cd78 [Paavo] Update strings to use interpolation.
db234cf [Paavo] Use !rdd.isEmpty.
54ad89e [Paavo] Test case for empty stream.
393e36f [Paavo] Ignore empty RDDs.
0bfc365 [Paavo] Test case for empty stream.
---
 .../regression/StreamingLinearAlgorithm.scala  | 14 ++++++++------
 .../StreamingLogisticRegressionSuite.scala     | 17 +++++++++++++++++
 .../StreamingLinearRegressionSuite.scala       | 18 ++++++++++++++++++
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
index aee51bf22d8d0..141052ba813ee 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
@@ -83,13 +83,15 @@ abstract class StreamingLinearAlgorithm[
       throw new IllegalArgumentException("Model must be initialized before starting training.")
     }
     data.foreachRDD { (rdd, time) =>
-      model = Some(algorithm.run(rdd, model.get.weights))
-      logInfo("Model updated at time %s".format(time.toString))
-      val display = model.get.weights.size match {
-        case x if x > 100 => model.get.weights.toArray.take(100).mkString("[", ",", "...")
-        case _ => model.get.weights.toArray.mkString("[", ",", "]")
+      if (!rdd.isEmpty) {
+        model = Some(algorithm.run(rdd, model.get.weights))
+        logInfo(s"Model updated at time ${time.toString}")
+        val display = model.get.weights.size match {
+          case x if x > 100 => model.get.weights.toArray.take(100).mkString("[", ",", "...")
+          case _ => model.get.weights.toArray.mkString("[", ",", "]")
+        }
+        logInfo(s"Current model: weights, ${display}")
       }
-      logInfo("Current model: weights, %s".format (display))
     }
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
index e98b61e13e21f..fd653296c9d97 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
@@ -158,4 +158,21 @@ class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase
     val error = output.map(batch => batch.map(p => math.abs(p._1 - p._2)).sum / nPoints).toList
     assert(error.head > 0.8 & error.last < 0.2)
   }
+
+  // Test empty RDDs in a stream
+  test("handling empty RDDs in a stream") {
+    val model = new StreamingLogisticRegressionWithSGD()
+      .setInitialWeights(Vectors.dense(-0.1))
+      .setStepSize(0.01)
+      .setNumIterations(10)
+    val numBatches = 10
+    val emptyInput = Seq.empty[Seq[LabeledPoint]]
+    val ssc = setupStreams(emptyInput,
+      (inputDStream: DStream[LabeledPoint]) => {
+        model.trainOn(inputDStream)
+        model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
+      }
+    )
+    val output: Seq[Seq[(Double, Double)]] = runStreams(ssc, numBatches, numBatches)
+  }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
index 9a379406d5061..f5e2d31056cbd 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
@@ -166,4 +166,22 @@ class StreamingLinearRegressionSuite extends SparkFunSuite with TestSuiteBase {
     val error = output.map(batch => batch.map(p => math.abs(p._1 - p._2)).sum / nPoints).toList
     assert((error.head - error.last) > 2)
   }
+
+  // Test empty RDDs in a stream
+  test("handling empty RDDs in a stream") {
+    val model = new StreamingLinearRegressionWithSGD()
+      .setInitialWeights(Vectors.dense(0.0, 0.0))
+      .setStepSize(0.2)
+      .setNumIterations(25)
+    val numBatches = 10
+    val nPoints = 100
+    val emptyInput = Seq.empty[Seq[LabeledPoint]]
+    val ssc = setupStreams(emptyInput,
+      (inputDStream: DStream[LabeledPoint]) => {
+        model.trainOn(inputDStream)
+        model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
+      }
+    )
+    val output: Seq[Seq[(Double, Double)]] = runStreams(ssc, numBatches, numBatches)
+  }
 }

From 37719e0cd0b00cc5ffee0ebe1652d465a574db0f Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Wed, 10 Jun 2015 16:55:39 -0700
Subject: [PATCH 444/525] [SPARK-8189] [SQL] use Long for TimestampType in SQL

This PR change to use Long as internal type for TimestampType for efficiency, which means it will the precision below 100ns.

Author: Davies Liu <davies@databricks.com>

Closes #6733 from davies/timestamp and squashes the following commits:

d9565fa [Davies Liu] remove print
65cf2f1 [Davies Liu] fix Timestamp in SparkR
86fecfb [Davies Liu] disable two timestamp tests
8f77ee0 [Davies Liu] fix scala style
246ee74 [Davies Liu] address comments
309d2e1 [Davies Liu] use Long for TimestampType in SQL
---
 .../scala/org/apache/spark/api/r/SerDe.scala  | 17 +++--
 python/pyspark/sql/types.py                   | 11 ++++
 .../scala/org/apache/spark/sql/BaseRow.java   |  6 ++
 .../main/scala/org/apache/spark/sql/Row.scala |  8 ++-
 .../sql/catalyst/CatalystTypeConverters.scala | 13 +++-
 .../spark/sql/catalyst/expressions/Cast.scala | 62 +++++++++----------
 .../expressions/SpecificMutableRow.scala      |  1 +
 .../expressions/codegen/CodeGenerator.scala   |  4 +-
 .../codegen/GenerateProjection.scala          | 10 ++-
 .../sql/catalyst/expressions/literals.scala   | 15 +++--
 .../sql/catalyst/expressions/predicates.scala |  6 +-
 .../spark/sql/catalyst/util/DateUtils.scala   | 44 ++++++++++---
 .../spark/sql/types/TimestampType.scala       | 10 +--
 .../sql/catalyst/expressions/CastSuite.scala  | 11 ++--
 .../sql/catalyst/util/DateUtilsSuite.scala    | 40 ++++++++++++
 .../spark/sql/types/DataTypeSuite.scala       |  2 +-
 .../spark/sql/columnar/ColumnStats.scala      | 21 +------
 .../spark/sql/columnar/ColumnType.scala       | 19 +++---
 .../sql/execution/SparkSqlSerializer2.scala   | 17 ++---
 .../spark/sql/execution/debug/package.scala   |  2 +
 .../spark/sql/execution/pythonUdfs.scala      |  7 ++-
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   | 10 ++-
 .../apache/spark/sql/json/JacksonParser.scala |  5 +-
 .../org/apache/spark/sql/json/JsonRDD.scala   | 10 ++-
 .../spark/sql/parquet/ParquetConverter.scala  |  9 +--
 .../sql/parquet/ParquetTableSupport.scala     | 10 +--
 .../apache/spark/sql/CachedTableSuite.scala   |  2 +-
 .../spark/sql/columnar/ColumnStatsSuite.scala |  2 +-
 .../spark/sql/columnar/ColumnTypeSuite.scala  | 11 ++--
 .../sql/columnar/ColumnarTestUtils.scala      |  9 +--
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala |  2 +-
 .../org/apache/spark/sql/json/JsonSuite.scala | 14 +++--
 .../execution/HiveCompatibilitySuite.scala    |  8 ++-
 .../spark/sql/hive/HiveInspectors.scala       | 20 +++---
 .../apache/spark/sql/hive/TableReader.scala   |  4 +-
 ...cast #5-0-dbd7bcd167d322d6617b884c02c7f247 |  2 +-
 36 files changed, 272 insertions(+), 172 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateUtilsSuite.scala

diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
index f8e3f1a79082e..56adc857d4ce0 100644
--- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.api.r
 
 import java.io.{DataInputStream, DataOutputStream}
-import java.sql.{Date, Time}
+import java.sql.{Timestamp, Date, Time}
 
 import scala.collection.JavaConversions._
 
@@ -107,9 +107,12 @@ private[spark] object SerDe {
     Date.valueOf(readString(in))
   }
 
-  def readTime(in: DataInputStream): Time = {
-    val t = in.readDouble()
-    new Time((t * 1000L).toLong)
+  def readTime(in: DataInputStream): Timestamp = {
+    val seconds = in.readDouble()
+    val sec = Math.floor(seconds).toLong
+    val t = new Timestamp(sec * 1000L)
+    t.setNanos(((seconds - sec) * 1e9).toInt)
+    t
   }
 
   def readBytesArr(in: DataInputStream): Array[Array[Byte]] = {
@@ -227,6 +230,9 @@ private[spark] object SerDe {
         case "java.sql.Time" =>
           writeType(dos, "time")
           writeTime(dos, value.asInstanceOf[Time])
+        case "java.sql.Timestamp" =>
+          writeType(dos, "time")
+          writeTime(dos, value.asInstanceOf[Timestamp])
         case "[B" =>
           writeType(dos, "raw")
           writeBytes(dos, value.asInstanceOf[Array[Byte]])
@@ -289,6 +295,9 @@ private[spark] object SerDe {
     out.writeDouble(value.getTime.toDouble / 1000.0)
   }
 
+  def writeTime(out: DataOutputStream, value: Timestamp): Unit = {
+    out.writeDouble((value.getTime / 1000).toDouble + value.getNanos.toDouble / 1e9)
+  }
 
   // NOTE: Only works for ASCII right now
   def writeString(out: DataOutputStream, value: String): Unit = {
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index b6ec6137c9180..8f286b631f4f0 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -19,6 +19,7 @@
 import decimal
 import time
 import datetime
+import calendar
 import keyword
 import warnings
 import json
@@ -654,6 +655,8 @@ def _need_python_to_sql_conversion(dataType):
             _need_python_to_sql_conversion(dataType.valueType)
     elif isinstance(dataType, UserDefinedType):
         return True
+    elif isinstance(dataType, TimestampType):
+        return True
     else:
         return False
 
@@ -707,6 +710,14 @@ def converter(obj):
         return lambda m: dict([(key_converter(k), value_converter(v)) for k, v in m.items()])
     elif isinstance(dataType, UserDefinedType):
         return lambda obj: dataType.serialize(obj)
+    elif isinstance(dataType, TimestampType):
+
+        def to_posix_timstamp(dt):
+            if dt.tzinfo is None:
+                return int(time.mktime(dt.timetuple()) * 1e7 + dt.microsecond * 10)
+            else:
+                return int(calendar.timegm(dt.utctimetuple()) * 1e7 + dt.microsecond * 10)
+        return to_posix_timstamp
     else:
         raise ValueError("Unexpected type %r" % dataType)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java b/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
index d138b43a3482b..6584882a62fd1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
@@ -19,6 +19,7 @@
 
 import java.math.BigDecimal;
 import java.sql.Date;
+import java.sql.Timestamp;
 import java.util.List;
 
 import scala.collection.Seq;
@@ -103,6 +104,11 @@ public Date getDate(int i) {
     throw new UnsupportedOperationException();
   }
 
+  @Override
+  public Timestamp getTimestamp(int i) {
+    throw new UnsupportedOperationException();
+  }
+
   @Override
   public <T> Seq<T> getSeq(int i) {
     throw new UnsupportedOperationException();
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 0d460b634d9b0..8aaf5d7d89154 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -260,9 +260,15 @@ trait Row extends Serializable {
    *
    * @throws ClassCastException when data type does not match.
    */
-  // TODO(davies): This is not the right default implementation, we use Int as Date internally
   def getDate(i: Int): java.sql.Date = apply(i).asInstanceOf[java.sql.Date]
 
+  /**
+   * Returns the value at position i of date type as java.sql.Timestamp.
+   *
+   * @throws ClassCastException when data type does not match.
+   */
+  def getTimestamp(i: Int): java.sql.Timestamp = apply(i).asInstanceOf[java.sql.Timestamp]
+
   /**
    * Returns the value at position i of array type as a Scala Seq.
    *
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 2e7b4c236d8f8..beb82dbc08642 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst
 
 import java.lang.{Iterable => JavaIterable}
 import java.math.{BigDecimal => JavaBigDecimal}
-import java.sql.Date
+import java.sql.{Timestamp, Date}
 import java.util.{Map => JavaMap}
 import javax.annotation.Nullable
 
@@ -58,6 +58,7 @@ object CatalystTypeConverters {
       case structType: StructType => StructConverter(structType)
       case StringType => StringConverter
       case DateType => DateConverter
+      case TimestampType => TimestampConverter
       case dt: DecimalType => BigDecimalConverter
       case BooleanType => BooleanConverter
       case ByteType => ByteConverter
@@ -274,6 +275,15 @@ object CatalystTypeConverters {
     override def toScalaImpl(row: Row, column: Int): Date = toScala(row.getInt(column))
   }
 
+  private object TimestampConverter extends CatalystTypeConverter[Timestamp, Timestamp, Any] {
+    override def toCatalystImpl(scalaValue: Timestamp): Long =
+      DateUtils.fromJavaTimestamp(scalaValue)
+    override def toScala(catalystValue: Any): Timestamp =
+      if (catalystValue == null) null
+      else DateUtils.toJavaTimestamp(catalystValue.asInstanceOf[Long])
+    override def toScalaImpl(row: Row, column: Int): Timestamp = toScala(row.getLong(column))
+  }
+
   private object BigDecimalConverter extends CatalystTypeConverter[Any, JavaBigDecimal, Decimal] {
     override def toCatalystImpl(scalaValue: Any): Decimal = scalaValue match {
       case d: BigDecimal => Decimal(d)
@@ -367,6 +377,7 @@ object CatalystTypeConverters {
   def convertToCatalyst(a: Any): Any = a match {
     case s: String => StringConverter.toCatalyst(s)
     case d: Date => DateConverter.toCatalyst(d)
+    case t: Timestamp => TimestampConverter.toCatalyst(t)
     case d: BigDecimal => BigDecimalConverter.toCatalyst(d)
     case d: JavaBigDecimal => BigDecimalConverter.toCatalyst(d)
     case seq: Seq[Any] => seq.map(convertToCatalyst)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 18102d1acb5b3..8d93957fea2fc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -113,7 +113,8 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
   private[this] def castToString(from: DataType): Any => Any = from match {
     case BinaryType => buildCast[Array[Byte]](_, UTF8String(_))
     case DateType => buildCast[Int](_, d => UTF8String(DateUtils.toString(d)))
-    case TimestampType => buildCast[Timestamp](_, t => UTF8String(timestampToString(t)))
+    case TimestampType => buildCast[Long](_,
+      t => UTF8String(timestampToString(DateUtils.toJavaTimestamp(t))))
     case _ => buildCast[Any](_, o => UTF8String(o.toString))
   }
 
@@ -127,7 +128,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case StringType =>
       buildCast[UTF8String](_, _.length() != 0)
     case TimestampType =>
-      buildCast[Timestamp](_, t => t.getTime() != 0 || t.getNanos() != 0)
+      buildCast[Long](_, t => t != 0)
     case DateType =>
       // Hive would return null when cast from date to boolean
       buildCast[Int](_, d => null)
@@ -158,20 +159,21 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
         if (periodIdx != -1 && n.length() - periodIdx > 9) {
           n = n.substring(0, periodIdx + 10)
         }
-        try Timestamp.valueOf(n) catch { case _: java.lang.IllegalArgumentException => null }
+        try DateUtils.fromJavaTimestamp(Timestamp.valueOf(n))
+        catch { case _: java.lang.IllegalArgumentException => null }
       })
     case BooleanType =>
-      buildCast[Boolean](_, b => new Timestamp(if (b) 1 else 0))
+      buildCast[Boolean](_, b => (if (b) 1L else 0))
     case LongType =>
-      buildCast[Long](_, l => new Timestamp(l))
+      buildCast[Long](_, l => longToTimestamp(l))
     case IntegerType =>
-      buildCast[Int](_, i => new Timestamp(i))
+      buildCast[Int](_, i => longToTimestamp(i.toLong))
     case ShortType =>
-      buildCast[Short](_, s => new Timestamp(s))
+      buildCast[Short](_, s => longToTimestamp(s.toLong))
     case ByteType =>
-      buildCast[Byte](_, b => new Timestamp(b))
+      buildCast[Byte](_, b => longToTimestamp(b.toLong))
     case DateType =>
-      buildCast[Int](_, d => new Timestamp(DateUtils.toJavaDate(d).getTime))
+      buildCast[Int](_, d => DateUtils.toMillisSinceEpoch(d) * 10000)
     // TimestampWritable.decimalToTimestamp
     case DecimalType() =>
       buildCast[Decimal](_, d => decimalToTimestamp(d))
@@ -191,25 +193,17 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
       })
   }
 
-  private[this] def decimalToTimestamp(d: Decimal) = {
-    val seconds = Math.floor(d.toDouble).toLong
-    val bd = (d.toBigDecimal - seconds) * 1000000000
-    val nanos = bd.intValue()
-
-    val millis = seconds * 1000
-    val t = new Timestamp(millis)
-
-    // remaining fractional portion as nanos
-    t.setNanos(nanos)
-    t
+  private[this] def decimalToTimestamp(d: Decimal): Long = {
+    (d.toBigDecimal * 10000000L).longValue()
   }
 
-  // Timestamp to long, converting milliseconds to seconds
-  private[this] def timestampToLong(ts: Timestamp) = Math.floor(ts.getTime / 1000.0).toLong
-
-  private[this] def timestampToDouble(ts: Timestamp) = {
-    // First part is the seconds since the beginning of time, followed by nanosecs.
-    Math.floor(ts.getTime / 1000.0).toLong + ts.getNanos.toDouble / 1000000000
+  // converting milliseconds to 100ns
+  private[this] def longToTimestamp(t: Long): Long = t * 10000L
+  // converting 100ns to seconds
+  private[this] def timestampToLong(ts: Long): Long = math.floor(ts.toDouble / 10000000L).toLong
+  // converting 100ns to seconds in double
+  private[this] def timestampToDouble(ts: Long): Double = {
+    ts / 10000000.0
   }
 
   // Converts Timestamp to string according to Hive TimestampWritable convention
@@ -234,7 +228,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case TimestampType =>
       // throw valid precision more than seconds, according to Hive.
       // Timestamp.nanos is in 0 to 999,999,999, no more than a second.
-      buildCast[Timestamp](_, t => DateUtils.millisToDays(t.getTime))
+      buildCast[Long](_, t => DateUtils.millisToDays(t / 10000L))
     // Hive throws this exception as a Semantic Exception
     // It is never possible to compare result when hive return with exception,
     // so we can return null
@@ -253,7 +247,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case DateType =>
       buildCast[Int](_, d => null)
     case TimestampType =>
-      buildCast[Timestamp](_, t => timestampToLong(t))
+      buildCast[Long](_, t => timestampToLong(t))
     case x: NumericType =>
       b => x.numeric.asInstanceOf[Numeric[Any]].toLong(b)
   }
@@ -269,7 +263,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case DateType =>
       buildCast[Int](_, d => null)
     case TimestampType =>
-      buildCast[Timestamp](_, t => timestampToLong(t).toInt)
+      buildCast[Long](_, t => timestampToLong(t).toInt)
     case x: NumericType =>
       b => x.numeric.asInstanceOf[Numeric[Any]].toInt(b)
   }
@@ -285,7 +279,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case DateType =>
       buildCast[Int](_, d => null)
     case TimestampType =>
-      buildCast[Timestamp](_, t => timestampToLong(t).toShort)
+      buildCast[Long](_, t => timestampToLong(t).toShort)
     case x: NumericType =>
       b => x.numeric.asInstanceOf[Numeric[Any]].toInt(b).toShort
   }
@@ -301,7 +295,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case DateType =>
       buildCast[Int](_, d => null)
     case TimestampType =>
-      buildCast[Timestamp](_, t => timestampToLong(t).toByte)
+      buildCast[Long](_, t => timestampToLong(t).toByte)
     case x: NumericType =>
       b => x.numeric.asInstanceOf[Numeric[Any]].toInt(b).toByte
   }
@@ -334,7 +328,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
       buildCast[Int](_, d => null) // date can't cast to decimal in Hive
     case TimestampType =>
       // Note that we lose precision here.
-      buildCast[Timestamp](_, t => changePrecision(Decimal(timestampToDouble(t)), target))
+      buildCast[Long](_, t => changePrecision(Decimal(timestampToDouble(t)), target))
     case DecimalType() =>
       b => changePrecision(b.asInstanceOf[Decimal].clone(), target)
     case LongType =>
@@ -358,7 +352,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case DateType =>
       buildCast[Int](_, d => null)
     case TimestampType =>
-      buildCast[Timestamp](_, t => timestampToDouble(t))
+      buildCast[Long](_, t => timestampToDouble(t))
     case x: NumericType =>
       b => x.numeric.asInstanceOf[Numeric[Any]].toDouble(b)
   }
@@ -374,7 +368,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case DateType =>
       buildCast[Int](_, d => null)
     case TimestampType =>
-      buildCast[Timestamp](_, t => timestampToDouble(t).toFloat)
+      buildCast[Long](_, t => timestampToDouble(t).toFloat)
     case x: NumericType =>
       b => x.numeric.asInstanceOf[Numeric[Any]].toFloat(b)
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index aa4099e4d7bf9..2c884517d62a7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -203,6 +203,7 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
         case BooleanType => new MutableBoolean
         case LongType => new MutableLong
         case DateType => new MutableInt // We use INT for DATE internally
+        case TimestampType => new MutableLong  // We use Long for Timestamp internally
         case _ => new MutableAny
       }.toArray)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index e95682f952a7b..80aa8fa056146 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -122,7 +122,7 @@ class CodeGenContext {
     case BinaryType => "byte[]"
     case StringType => stringType
     case DateType => "int"
-    case TimestampType => "java.sql.Timestamp"
+    case TimestampType => "long"
     case dt: OpenHashSetUDT if dt.elementType == IntegerType => classOf[IntegerHashSet].getName
     case dt: OpenHashSetUDT if dt.elementType == LongType => classOf[LongHashSet].getName
     case _ => "Object"
@@ -140,6 +140,7 @@ class CodeGenContext {
     case FloatType => "Float"
     case BooleanType => "Boolean"
     case DateType => "Integer"
+    case TimestampType => "Long"
     case _ => javaType(dt)
   }
 
@@ -155,6 +156,7 @@ class CodeGenContext {
     case DoubleType => "-1.0"
     case IntegerType => "-1"
     case DateType => "-1"
+    case TimestampType => "-1L"
     case _ => "null"
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 7caf4aaab88bb..274429cd1c55f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -73,7 +73,9 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
     val specificAccessorFunctions = ctx.nativeTypes.map { dataType =>
       val cases = expressions.zipWithIndex.map {
-        case (e, i) if e.dataType == dataType =>
+        case (e, i) if e.dataType == dataType
+          || dataType == IntegerType && e.dataType == DateType
+          || dataType == LongType && e.dataType == TimestampType =>
           s"case $i: return c$i;"
         case _ => ""
       }.mkString("\n        ")
@@ -96,7 +98,9 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
     val specificMutatorFunctions = ctx.nativeTypes.map { dataType =>
       val cases = expressions.zipWithIndex.map {
-        case (e, i) if e.dataType == dataType =>
+        case (e, i) if e.dataType == dataType
+          || dataType == IntegerType && e.dataType == DateType
+          || dataType == LongType && e.dataType == TimestampType =>
           s"case $i: { c$i = value; return; }"
         case _ => ""
       }.mkString("\n")
@@ -119,7 +123,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
       val nonNull = e.dataType match {
         case BooleanType => s"$col ? 0 : 1"
         case ByteType | ShortType | IntegerType | DateType => s"$col"
-        case LongType => s"$col ^ ($col >>> 32)"
+        case LongType | TimestampType => s"$col ^ ($col >>> 32)"
         case FloatType => s"Float.floatToIntBits($col)"
         case DoubleType =>
             s"(int)(Double.doubleToLongBits($col) ^ (Double.doubleToLongBits($col) >>> 32))"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 297b35b4da94c..833c08a293dcb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -37,7 +37,7 @@ object Literal {
     case d: BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
     case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
     case d: Decimal => Literal(d, DecimalType.Unlimited)
-    case t: Timestamp => Literal(t, TimestampType)
+    case t: Timestamp => Literal(DateUtils.fromJavaTimestamp(t), TimestampType)
     case d: Date => Literal(DateUtils.fromJavaDate(d), DateType)
     case a: Array[Byte] => Literal(a, BinaryType)
     case null => Literal(null, NullType)
@@ -100,7 +100,7 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
           ev.isNull = "false"
           ev.primitive = value.toString
           ""
-        case FloatType =>  // This must go before NumericType
+        case FloatType =>
           val v = value.asInstanceOf[Float]
           if (v.isNaN || v.isInfinite) {
             super.genCode(ctx, ev)
@@ -109,7 +109,7 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
             ev.primitive = s"${value}f"
             ""
           }
-        case DoubleType =>  // This must go before NumericType
+        case DoubleType =>
           val v = value.asInstanceOf[Double]
           if (v.isNaN || v.isInfinite) {
             super.genCode(ctx, ev)
@@ -118,15 +118,18 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
             ev.primitive = s"${value}"
             ""
           }
-
-        case ByteType | ShortType =>  // This must go before NumericType
+        case ByteType | ShortType =>
           ev.isNull = "false"
           ev.primitive = s"(${ctx.javaType(dataType)})$value"
           ""
-        case dt: NumericType if !dt.isInstanceOf[DecimalType] =>
+        case IntegerType | DateType =>
           ev.isNull = "false"
           ev.primitive = value.toString
           ""
+        case TimestampType | LongType =>
+          ev.isNull = "false"
+          ev.primitive = s"${value}L"
+          ""
         // eval() version may be faster for non-primitive types
         case other =>
           super.genCode(ctx, ev)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 3cbdfdfb13847..2c49352874fc3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -254,9 +254,9 @@ abstract class BinaryComparison extends BinaryExpression with Predicate {
       case dt: NumericType if ctx.isNativeType(dt) => defineCodeGen (ctx, ev, {
         (c1, c3) => s"$c1 $symbol $c3"
       })
-      case TimestampType =>
-        // java.sql.Timestamp does not have compare()
-        super.genCode(ctx, ev)
+      case DateType | TimestampType => defineCodeGen (ctx, ev, {
+        (c1, c3) => s"$c1 $symbol $c3"
+      })
       case other => defineCodeGen (ctx, ev, {
         (c1, c2) => s"$c1.compare($c2) $symbol 0"
       })
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateUtils.scala
index ad649acf536f9..5cadc141af1df 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateUtils.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.util
 
-import java.sql.Date
+import java.sql.{Timestamp, Date}
 import java.text.SimpleDateFormat
 import java.util.{Calendar, TimeZone}
 
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.Cast
  */
 object DateUtils {
   private val MILLIS_PER_DAY = 86400000
+  private val HUNDRED_NANOS_PER_SECOND = 10000000L
 
   // Java TimeZone has no mention of thread safety. Use thread local instance to be safe.
   private val LOCAL_TIMEZONE = new ThreadLocal[TimeZone] {
@@ -45,17 +46,17 @@ object DateUtils {
     ((millisLocal + LOCAL_TIMEZONE.get().getOffset(millisLocal)) / MILLIS_PER_DAY).toInt
   }
 
-  private def toMillisSinceEpoch(days: Int): Long = {
+  def toMillisSinceEpoch(days: Int): Long = {
     val millisUtc = days.toLong * MILLIS_PER_DAY
     millisUtc - LOCAL_TIMEZONE.get().getOffset(millisUtc)
   }
 
-  def fromJavaDate(date: java.sql.Date): Int = {
+  def fromJavaDate(date: Date): Int = {
     javaDateToDays(date)
   }
 
-  def toJavaDate(daysSinceEpoch: Int): java.sql.Date = {
-    new java.sql.Date(toMillisSinceEpoch(daysSinceEpoch))
+  def toJavaDate(daysSinceEpoch: Int): Date = {
+    new Date(toMillisSinceEpoch(daysSinceEpoch))
   }
 
   def toString(days: Int): String = Cast.threadLocalDateFormat.get.format(toJavaDate(days))
@@ -64,9 +65,9 @@ object DateUtils {
     if (!s.contains('T')) {
       // JDBC escape string
       if (s.contains(' ')) {
-        java.sql.Timestamp.valueOf(s)
+        Timestamp.valueOf(s)
       } else {
-        java.sql.Date.valueOf(s)
+        Date.valueOf(s)
       }
     } else if (s.endsWith("Z")) {
       // this is zero timezone of ISO8601
@@ -87,4 +88,33 @@ object DateUtils {
       ISO8601GMT.parse(s)
     }
   }
+
+  /**
+   * Return a java.sql.Timestamp from number of 100ns since epoch
+   */
+  def toJavaTimestamp(num100ns: Long): Timestamp = {
+    // setNanos() will overwrite the millisecond part, so the milliseconds should be
+    // cut off at seconds
+    var seconds = num100ns / HUNDRED_NANOS_PER_SECOND
+    var nanos = num100ns % HUNDRED_NANOS_PER_SECOND
+    // setNanos() can not accept negative value
+    if (nanos < 0) {
+      nanos += HUNDRED_NANOS_PER_SECOND
+      seconds -= 1
+    }
+    val t = new Timestamp(seconds * 1000)
+    t.setNanos(nanos.toInt * 100)
+    t
+  }
+
+  /**
+   * Return the number of 100ns since epoch from java.sql.Timestamp.
+   */
+  def fromJavaTimestamp(t: Timestamp): Long = {
+    if (t != null) {
+      t.getTime() * 10000L + (t.getNanos().toLong / 100) % 10000L
+    } else {
+      0L
+    }
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala
index aebabfc475925..a558641fcfed7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.types
 
-import java.sql.Timestamp
-
 import scala.math.Ordering
 import scala.reflect.runtime.universe.typeTag
 
@@ -38,18 +36,16 @@ class TimestampType private() extends AtomicType {
   // The companion object and this class is separated so the companion object also subclasses
   // this type. Otherwise, the companion object would be of type "TimestampType$" in byte code.
   // Defined with a private constructor so the companion object is the only possible instantiation.
-  private[sql] type InternalType = Timestamp
+  private[sql] type InternalType = Long
 
   @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[InternalType] }
 
-  private[sql] val ordering = new Ordering[InternalType] {
-    def compare(x: Timestamp, y: Timestamp): Int = x.compareTo(y)
-  }
+  private[sql] val ordering = implicitly[Ordering[InternalType]]
 
   /**
    * The default size of a value of the TimestampType is 12 bytes.
    */
-  override def defaultSize: Int = 12
+  override def defaultSize: Int = 8
 
   private[spark] override def asNullable: TimestampType = this
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 5bc7c30eee1b6..3aca94db3bd8f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import java.sql.{Timestamp, Date}
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 
 /**
@@ -137,7 +138,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(cast(sd, DateType), StringType), sd)
     checkEvaluation(cast(cast(d, StringType), DateType), 0)
     checkEvaluation(cast(cast(nts, TimestampType), StringType), nts)
-    checkEvaluation(cast(cast(ts, StringType), TimestampType), ts)
+    checkEvaluation(cast(cast(ts, StringType), TimestampType), DateUtils.fromJavaTimestamp(ts))
 
     // all convert to string type to check
     checkEvaluation(cast(cast(cast(nts, TimestampType), DateType), StringType), sd)
@@ -269,9 +270,9 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(ts, LongType), 15.toLong)
     checkEvaluation(cast(ts, FloatType), 15.002f)
     checkEvaluation(cast(ts, DoubleType), 15.002)
-    checkEvaluation(cast(cast(tss, ShortType), TimestampType), ts)
-    checkEvaluation(cast(cast(tss, IntegerType), TimestampType), ts)
-    checkEvaluation(cast(cast(tss, LongType), TimestampType), ts)
+    checkEvaluation(cast(cast(tss, ShortType), TimestampType), DateUtils.fromJavaTimestamp(ts))
+    checkEvaluation(cast(cast(tss, IntegerType), TimestampType), DateUtils.fromJavaTimestamp(ts))
+    checkEvaluation(cast(cast(tss, LongType), TimestampType), DateUtils.fromJavaTimestamp(ts))
     checkEvaluation(
       cast(cast(millis.toFloat / 1000, TimestampType), FloatType),
       millis.toFloat / 1000)
@@ -283,7 +284,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
       Decimal(1))
 
     // A test for higher precision than millis
-    checkEvaluation(cast(cast(0.00000001, TimestampType), DoubleType), 0.00000001)
+    checkEvaluation(cast(cast(0.0000001, TimestampType), DoubleType), 0.0000001)
 
     checkEvaluation(cast(Double.NaN, TimestampType), null)
     checkEvaluation(cast(1.0 / 0.0, TimestampType), null)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateUtilsSuite.scala
new file mode 100644
index 0000000000000..a4245545ffc1d
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateUtilsSuite.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+import java.sql.Timestamp
+
+import org.apache.spark.SparkFunSuite
+
+
+class DateUtilsSuite extends SparkFunSuite {
+
+  test("timestamp") {
+    val now = new Timestamp(System.currentTimeMillis())
+    now.setNanos(100)
+    val ns = DateUtils.fromJavaTimestamp(now)
+    assert(ns % 10000000L == 1)
+    assert(DateUtils.toJavaTimestamp(ns) == now)
+
+    List(-111111111111L, -1L, 0, 1L, 111111111111L).foreach { t =>
+      val ts = DateUtils.toJavaTimestamp(t)
+      assert(DateUtils.fromJavaTimestamp(ts) == t)
+      assert(DateUtils.toJavaTimestamp(DateUtils.fromJavaTimestamp(ts)) == ts)
+    }
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index 261c4fcad24aa..077c0ad70ac4f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -190,7 +190,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkDefaultSize(DecimalType(10, 5), 4096)
   checkDefaultSize(DecimalType.Unlimited, 4096)
   checkDefaultSize(DateType, 4)
-  checkDefaultSize(TimestampType, 12)
+  checkDefaultSize(TimestampType, 8)
   checkDefaultSize(StringType, 4096)
   checkDefaultSize(BinaryType, 4096)
   checkDefaultSize(ArrayType(DoubleType, true), 800)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
index b0f983c180673..83881a3687090 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
@@ -17,10 +17,8 @@
 
 package org.apache.spark.sql.columnar
 
-import java.sql.Timestamp
-
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.expressions.{AttributeMap, Attribute, AttributeReference}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
 import org.apache.spark.sql.types._
 
 private[sql] class ColumnStatisticsSchema(a: Attribute) extends Serializable {
@@ -234,22 +232,7 @@ private[sql] class StringColumnStats extends ColumnStats {
 
 private[sql] class DateColumnStats extends IntColumnStats
 
-private[sql] class TimestampColumnStats extends ColumnStats {
-  protected var upper: Timestamp = null
-  protected var lower: Timestamp = null
-
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
-    super.gatherStats(row, ordinal)
-    if (!row.isNullAt(ordinal)) {
-      val value = row(ordinal).asInstanceOf[Timestamp]
-      if (upper == null || value.compareTo(upper) > 0) upper = value
-      if (lower == null || value.compareTo(lower) < 0) lower = value
-      sizeInBytes += TIMESTAMP.defaultSize
-    }
-  }
-
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
-}
+private[sql] class TimestampColumnStats extends LongColumnStats
 
 private[sql] class BinaryColumnStats extends ColumnStats {
   override def gatherStats(row: Row, ordinal: Int): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
index 20be5ca9d0046..c9c4d630fb5f4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.columnar
 
 import java.nio.ByteBuffer
-import java.sql.Timestamp
 
 import scala.reflect.runtime.universe.TypeTag
 
@@ -355,22 +354,20 @@ private[sql] object DATE extends NativeColumnType(DateType, 8, 4) {
   }
 }
 
-private[sql] object TIMESTAMP extends NativeColumnType(TimestampType, 9, 12) {
-  override def extract(buffer: ByteBuffer): Timestamp = {
-    val timestamp = new Timestamp(buffer.getLong())
-    timestamp.setNanos(buffer.getInt())
-    timestamp
+private[sql] object TIMESTAMP extends NativeColumnType(TimestampType, 9, 8) {
+  override def extract(buffer: ByteBuffer): Long = {
+    buffer.getLong
   }
 
-  override def append(v: Timestamp, buffer: ByteBuffer): Unit = {
-    buffer.putLong(v.getTime).putInt(v.getNanos)
+  override def append(v: Long, buffer: ByteBuffer): Unit = {
+    buffer.putLong(v)
   }
 
-  override def getField(row: Row, ordinal: Int): Timestamp = {
-    row(ordinal).asInstanceOf[Timestamp]
+  override def getField(row: Row, ordinal: Int): Long = {
+    row(ordinal).asInstanceOf[Long]
   }
 
-  override def setField(row: MutableRow, ordinal: Int, value: Timestamp): Unit = {
+  override def setField(row: MutableRow, ordinal: Int, value: Long): Unit = {
     row(ordinal) = value
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index 256d527d7b636..60f3b2d539ffe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -20,14 +20,13 @@ package org.apache.spark.sql.execution
 import java.io._
 import java.math.{BigDecimal, BigInteger}
 import java.nio.ByteBuffer
-import java.sql.Timestamp
 
 import scala.reflect.ClassTag
 
-import org.apache.spark.serializer._
 import org.apache.spark.Logging
+import org.apache.spark.serializer._
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, MutableRow, GenericMutableRow}
+import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, MutableRow, SpecificMutableRow}
 import org.apache.spark.sql.types._
 
 /**
@@ -304,11 +303,7 @@ private[sql] object SparkSqlSerializer2 {
                 out.writeByte(NULL)
               } else {
                 out.writeByte(NOT_NULL)
-                val timestamp = row.getAs[java.sql.Timestamp](i)
-                val time = timestamp.getTime
-                val nanos = timestamp.getNanos
-                out.writeLong(time - (nanos / 1000000)) // Write the milliseconds value.
-                out.writeInt(nanos)                     // Write the nanoseconds part.
+                out.writeLong(row.getAs[Long](i))
               }
 
             case StringType =>
@@ -429,11 +424,7 @@ private[sql] object SparkSqlSerializer2 {
               if (in.readByte() == NULL) {
                 mutableRow.setNullAt(i)
               } else {
-                val time = in.readLong() // Read the milliseconds value.
-                val nanos = in.readInt() // Read the nanoseconds part.
-                val timestamp = new Timestamp(time)
-                timestamp.setNanos(nanos)
-                mutableRow.update(i, timestamp)
+                mutableRow.update(i, in.readLong())
               }
 
             case StringType =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index dffb265601bdb..720b529d5946f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -170,6 +170,8 @@ package object debug {
       case (_: Short, ShortType) =>
       case (_: Boolean, BooleanType) =>
       case (_: Double, DoubleType) =>
+      case (_: Int, DateType) =>
+      case (_: Long, TimestampType) =>
       case (v, udt: UserDefinedType[_]) => typeCheck(v, udt.sqlType)
 
       case (d, t) => sys.error(s"Invalid data found: got $d (${d.getClass}) expected $t")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
index 342587904789a..955b478a4882f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
@@ -148,6 +148,7 @@ object EvaluatePython {
     case (ud, udt: UserDefinedType[_]) => toJava(udt.serialize(ud), udt.sqlType)
 
     case (date: Int, DateType) => DateUtils.toJavaDate(date)
+    case (t: Long, TimestampType) => DateUtils.toJavaTimestamp(t)
     case (s: UTF8String, StringType) => s.toString
 
     // Pyrolite can handle Timestamp and Decimal
@@ -186,10 +187,12 @@ object EvaluatePython {
       }): Row
 
     case (c: java.util.Calendar, DateType) =>
-      DateUtils.fromJavaDate(new java.sql.Date(c.getTime().getTime()))
+      DateUtils.fromJavaDate(new java.sql.Date(c.getTimeInMillis))
 
     case (c: java.util.Calendar, TimestampType) =>
-      new java.sql.Timestamp(c.getTime().getTime())
+      c.getTimeInMillis * 10000L
+    case (t: java.sql.Timestamp, TimestampType) =>
+      DateUtils.fromJavaTimestamp(t)
 
     case (_, udt: UserDefinedType[_]) =>
       fromJava(obj, udt.sqlType)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index db68b9c86db1b..9028d5ed72c92 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -385,7 +385,7 @@ private[sql] class JDBCRDD(
               // DateUtils.fromJavaDate does not handle null value, so we need to check it.
               val dateVal = rs.getDate(pos)
               if (dateVal != null) {
-                mutableRow.update(i, DateUtils.fromJavaDate(dateVal))
+                mutableRow.setInt(i, DateUtils.fromJavaDate(dateVal))
               } else {
                 mutableRow.update(i, null)
               }
@@ -417,7 +417,13 @@ private[sql] class JDBCRDD(
             case LongConversion => mutableRow.setLong(i, rs.getLong(pos))
             // TODO(davies): use getBytes for better performance, if the encoding is UTF-8
             case StringConversion => mutableRow.setString(i, rs.getString(pos))
-            case TimestampConversion => mutableRow.update(i, rs.getTimestamp(pos))
+            case TimestampConversion =>
+              val t = rs.getTimestamp(pos)
+              if (t != null) {
+                mutableRow.setLong(i, DateUtils.fromJavaTimestamp(t))
+              } else {
+                mutableRow.update(i, null)
+              }
             case BinaryConversion => mutableRow.update(i, rs.getBytes(pos))
             case BinaryLongConversion => {
               val bytes = rs.getBytes(pos)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
index 0e223758051a6..4e07cf36ae434 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.json
 
 import java.io.ByteArrayOutputStream
-import java.sql.Timestamp
 
 import scala.collection.Map
 
@@ -65,10 +64,10 @@ private[sql] object JacksonParser {
         DateUtils.millisToDays(DateUtils.stringToTime(parser.getText).getTime)
 
       case (VALUE_STRING, TimestampType) =>
-        new Timestamp(DateUtils.stringToTime(parser.getText).getTime)
+        DateUtils.stringToTime(parser.getText).getTime * 10000L
 
       case (VALUE_NUMBER_INT, TimestampType) =>
-        new Timestamp(parser.getLongValue)
+        parser.getLongValue * 10000L
 
       case (_, StringType) =>
         val writer = new ByteArrayOutputStream()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index 7e1e21f5fbb99..fb0d137bdbfdb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.json
 
-import java.sql.Timestamp
-
 import scala.collection.Map
 import scala.collection.convert.Wrappers.{JListWrapper, JMapWrapper}
 
@@ -398,11 +396,11 @@ private[sql] object JsonRDD extends Logging {
     }
   }
 
-  private def toTimestamp(value: Any): Timestamp = {
+  private def toTimestamp(value: Any): Long = {
     value match {
-      case value: java.lang.Integer => new Timestamp(value.asInstanceOf[Int].toLong)
-      case value: java.lang.Long => new Timestamp(value)
-      case value: java.lang.String => toTimestamp(DateUtils.stringToTime(value).getTime)
+      case value: java.lang.Integer => value.asInstanceOf[Int].toLong * 10000L
+      case value: java.lang.Long => value * 10000L
+      case value: java.lang.String => DateUtils.stringToTime(value).getTime * 10000L
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index 85c2ce740fe52..ddc5097f88fb1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -28,6 +28,7 @@ import org.apache.parquet.io.api.{PrimitiveConverter, GroupConverter, Binary, Co
 import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.parquet.CatalystConverter.FieldType
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.parquet.timestamp.NanoTime
@@ -266,8 +267,8 @@ private[parquet] abstract class CatalystConverter extends GroupConverter {
   /**
    * Read a Timestamp value from a Parquet Int96Value
    */
-  protected[parquet] def readTimestamp(value: Binary): Timestamp = {
-    CatalystTimestampConverter.convertToTimestamp(value)
+  protected[parquet] def readTimestamp(value: Binary): Long = {
+    DateUtils.fromJavaTimestamp(CatalystTimestampConverter.convertToTimestamp(value))
   }
 }
 
@@ -401,7 +402,7 @@ private[parquet] class CatalystPrimitiveRowConverter(
     current.setInt(fieldIndex, value)
 
   override protected[parquet] def updateDate(fieldIndex: Int, value: Int): Unit =
-    current.update(fieldIndex, value)
+    current.setInt(fieldIndex, value)
 
   override protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit =
     current.setLong(fieldIndex, value)
@@ -425,7 +426,7 @@ private[parquet] class CatalystPrimitiveRowConverter(
     current.update(fieldIndex, UTF8String(value))
 
   override protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit =
-    current.update(fieldIndex, readTimestamp(value))
+    current.setLong(fieldIndex, readTimestamp(value))
 
   override protected[parquet] def updateDecimal(
       fieldIndex: Int, value: Binary, ctype: DecimalType): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 89db408b1c382..e03dbdec0491d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -29,6 +29,7 @@ import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
+import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 
 /**
@@ -204,7 +205,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
         case IntegerType => writer.addInteger(value.asInstanceOf[Int])
         case ShortType => writer.addInteger(value.asInstanceOf[Short])
         case LongType => writer.addLong(value.asInstanceOf[Long])
-        case TimestampType => writeTimestamp(value.asInstanceOf[java.sql.Timestamp])
+        case TimestampType => writeTimestamp(value.asInstanceOf[Long])
         case ByteType => writer.addInteger(value.asInstanceOf[Byte])
         case DoubleType => writer.addDouble(value.asInstanceOf[Double])
         case FloatType => writer.addFloat(value.asInstanceOf[Float])
@@ -311,8 +312,9 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
     writer.addBinary(Binary.fromByteArray(scratchBytes, 0, numBytes))
   }
 
-  private[parquet] def writeTimestamp(ts: java.sql.Timestamp): Unit = {
-    val binaryNanoTime = CatalystTimestampConverter.convertFromTimestamp(ts)
+  private[parquet] def writeTimestamp(ts: Long): Unit = {
+    val binaryNanoTime = CatalystTimestampConverter.convertFromTimestamp(
+      DateUtils.toJavaTimestamp(ts))
     writer.addBinary(binaryNanoTime)
   }
 }
@@ -357,7 +359,7 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
       case FloatType => writer.addFloat(record.getFloat(index))
       case BooleanType => writer.addBoolean(record.getBoolean(index))
       case DateType => writer.addInteger(record.getInt(index))
-      case TimestampType => writeTimestamp(record(index).asInstanceOf[java.sql.Timestamp])
+      case TimestampType => writeTimestamp(record(index).asInstanceOf[Long])
       case d: DecimalType =>
         if (d.precisionInfo == None || d.precisionInfo.get.precision > 18) {
           sys.error(s"Unsupported datatype $d, cannot write to consumer")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
index 72e60d9aa75cb..17a3cec48b856 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
@@ -25,7 +25,7 @@ import org.scalatest.concurrent.Eventually._
 import org.apache.spark.Accumulators
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.columnar._
-import org.apache.spark.storage.{RDDBlockId, StorageLevel}
+import org.apache.spark.storage.{StorageLevel, RDDBlockId}
 
 case class BigData(s: String)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
index 339e719f39f16..16836628cb73a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
@@ -31,7 +31,7 @@ class ColumnStatsSuite extends SparkFunSuite {
   testColumnStats(classOf[FixedDecimalColumnStats], FIXED_DECIMAL(15, 10), Row(null, null, 0))
   testColumnStats(classOf[StringColumnStats], STRING, Row(null, null, 0))
   testColumnStats(classOf[DateColumnStats], DATE, Row(Int.MaxValue, Int.MinValue, 0))
-  testColumnStats(classOf[TimestampColumnStats], TIMESTAMP, Row(null, null, 0))
+  testColumnStats(classOf[TimestampColumnStats], TIMESTAMP, Row(Long.MaxValue, Long.MinValue, 0))
 
   def testColumnStats[T <: AtomicType, U <: ColumnStats](
       columnStatsClass: Class[U],
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
index a1e76eaa982cc..8421e670ff05d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
@@ -18,17 +18,16 @@
 package org.apache.spark.sql.columnar
 
 import java.nio.ByteBuffer
-import java.sql.Timestamp
 
-import com.esotericsoftware.kryo.{Serializer, Kryo}
 import com.esotericsoftware.kryo.io.{Input, Output}
-import org.apache.spark.serializer.KryoRegistrator
+import com.esotericsoftware.kryo.{Kryo, Serializer}
 
-import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
+import org.apache.spark.serializer.KryoRegistrator
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.columnar.ColumnarTestUtils._
 import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.sql.types._
+import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
 
 class ColumnTypeSuite extends SparkFunSuite with Logging {
   val DEFAULT_BUFFER_SIZE = 512
@@ -36,7 +35,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
   test("defaultSize") {
     val checks = Map(
       INT -> 4, SHORT -> 2, LONG -> 8, BYTE -> 1, DOUBLE -> 8, FLOAT -> 4,
-      FIXED_DECIMAL(15, 10) -> 8, BOOLEAN -> 1, STRING -> 8, DATE -> 4, TIMESTAMP -> 12,
+      FIXED_DECIMAL(15, 10) -> 8, BOOLEAN -> 1, STRING -> 8, DATE -> 4, TIMESTAMP -> 8,
       BINARY -> 16, GENERIC -> 16)
 
     checks.foreach { case (columnType, expectedSize) =>
@@ -69,7 +68,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
     checkActualSize(BOOLEAN, true, 1)
     checkActualSize(STRING, UTF8String("hello"), 4 + "hello".getBytes("utf-8").length)
     checkActualSize(DATE, 0, 4)
-    checkActualSize(TIMESTAMP, new Timestamp(0L), 12)
+    checkActualSize(TIMESTAMP, 0L, 8)
 
     val binary = Array.fill[Byte](4)(0: Byte)
     checkActualSize(BINARY, binary, 4 + 4)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
index 75d993e563e06..c5d38595c0bec 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
@@ -17,14 +17,12 @@
 
 package org.apache.spark.sql.columnar
 
-import java.sql.Timestamp
-
 import scala.collection.immutable.HashSet
 import scala.util.Random
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
-import org.apache.spark.sql.types.{UTF8String, DataType, Decimal, AtomicType}
+import org.apache.spark.sql.types.{AtomicType, DataType, Decimal, UTF8String}
 
 object ColumnarTestUtils {
   def makeNullRow(length: Int): GenericMutableRow = {
@@ -52,10 +50,7 @@ object ColumnarTestUtils {
       case BOOLEAN => Random.nextBoolean()
       case BINARY => randomBytes(Random.nextInt(32))
       case DATE => Random.nextInt()
-      case TIMESTAMP =>
-        val timestamp = new Timestamp(Random.nextLong())
-        timestamp.setNanos(Random.nextInt(999999999))
-        timestamp
+      case TIMESTAMP => Random.nextLong()
       case _ =>
         // Using a random one-element map instead of an arbitrary object
         Map(Random.nextInt() -> Random.nextString(Random.nextInt(32)))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 49d348c3ed21b..69ab1c292d221 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -326,7 +326,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
     assert(cal.get(Calendar.HOUR) === 11)
     assert(cal.get(Calendar.MINUTE) === 22)
     assert(cal.get(Calendar.SECOND) === 33)
-    assert(rows(0).getAs[java.sql.Timestamp](2).getNanos === 543543543)
+    assert(rows(0).getAs[java.sql.Timestamp](2).getNanos === 543543500)
   }
 
   test("test DATE types") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index d889c7be17ce7..fca24364fe6ec 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -76,21 +76,25 @@ class JsonSuite extends QueryTest with TestJsonData {
     checkTypePromotion(
       Decimal(doubleNumber), enforceCorrectType(doubleNumber, DecimalType.Unlimited))
 
-    checkTypePromotion(new Timestamp(intNumber), enforceCorrectType(intNumber, TimestampType))
-    checkTypePromotion(new Timestamp(intNumber.toLong),
+    checkTypePromotion(DateUtils.fromJavaTimestamp(new Timestamp(intNumber)),
+        enforceCorrectType(intNumber, TimestampType))
+    checkTypePromotion(DateUtils.fromJavaTimestamp(new Timestamp(intNumber.toLong)),
         enforceCorrectType(intNumber.toLong, TimestampType))
     val strTime = "2014-09-30 12:34:56"
-    checkTypePromotion(Timestamp.valueOf(strTime), enforceCorrectType(strTime, TimestampType))
+    checkTypePromotion(DateUtils.fromJavaTimestamp(Timestamp.valueOf(strTime)),
+        enforceCorrectType(strTime, TimestampType))
 
     val strDate = "2014-10-15"
     checkTypePromotion(
       DateUtils.fromJavaDate(Date.valueOf(strDate)), enforceCorrectType(strDate, DateType))
 
     val ISO8601Time1 = "1970-01-01T01:00:01.0Z"
-    checkTypePromotion(new Timestamp(3601000), enforceCorrectType(ISO8601Time1, TimestampType))
+    checkTypePromotion(DateUtils.fromJavaTimestamp(new Timestamp(3601000)),
+        enforceCorrectType(ISO8601Time1, TimestampType))
     checkTypePromotion(DateUtils.millisToDays(3601000), enforceCorrectType(ISO8601Time1, DateType))
     val ISO8601Time2 = "1970-01-01T02:00:01-01:00"
-    checkTypePromotion(new Timestamp(10801000), enforceCorrectType(ISO8601Time2, TimestampType))
+    checkTypePromotion(DateUtils.fromJavaTimestamp(new Timestamp(10801000)),
+        enforceCorrectType(ISO8601Time2, TimestampType))
     checkTypePromotion(DateUtils.millisToDays(10801000), enforceCorrectType(ISO8601Time2, DateType))
   }
 
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 0693c7ea5b332..82c0b494598a8 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -252,7 +252,11 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "load_dyn_part14.*", // These work alone but fail when run with other tests...
 
     // the answer is sensitive for jdk version
-    "udf_java_method"
+    "udf_java_method",
+
+    // Spark SQL use Long for TimestampType, lose the precision under 100ns
+    "timestamp_1",
+    "timestamp_2"
   )
 
   /**
@@ -795,8 +799,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "stats_publisher_error_1",
     "subq2",
     "tablename_with_select",
-    "timestamp_1",
-    "timestamp_2",
     "timestamp_3",
     "timestamp_comparison",
     "timestamp_lazy",
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index c466203cd0220..1f14cba78f479 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -250,7 +250,8 @@ private[hive] trait HiveInspectors {
         PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector,
         poi.getWritableConstantValue.getHiveDecimal)
     case poi: WritableConstantTimestampObjectInspector =>
-      poi.getWritableConstantValue.getTimestamp.clone()
+      val t = poi.getWritableConstantValue
+      t.getSeconds * 10000000L + t.getNanos / 100L
     case poi: WritableConstantIntObjectInspector =>
       poi.getWritableConstantValue.get()
     case poi: WritableConstantDoubleObjectInspector =>
@@ -313,11 +314,11 @@ private[hive] trait HiveInspectors {
       case x: DateObjectInspector if x.preferWritable() =>
         DateUtils.fromJavaDate(x.getPrimitiveWritableObject(data).get())
       case x: DateObjectInspector => DateUtils.fromJavaDate(x.getPrimitiveJavaObject(data))
-      // org.apache.hadoop.hive.serde2.io.TimestampWritable.set will reset current time object
-      // if next timestamp is null, so Timestamp object is cloned
       case x: TimestampObjectInspector if x.preferWritable() =>
-        x.getPrimitiveWritableObject(data).getTimestamp.clone()
-      case ti: TimestampObjectInspector => ti.getPrimitiveJavaObject(data).clone()
+        val t = x.getPrimitiveWritableObject(data)
+        t.getSeconds * 10000000L + t.getNanos / 100
+      case ti: TimestampObjectInspector =>
+        DateUtils.fromJavaTimestamp(ti.getPrimitiveJavaObject(data))
       case _ => pi.getPrimitiveJavaObject(data)
     }
     case li: ListObjectInspector =>
@@ -356,6 +357,9 @@ private[hive] trait HiveInspectors {
     case _: JavaDateObjectInspector =>
       (o: Any) => DateUtils.toJavaDate(o.asInstanceOf[Int])
 
+    case _: JavaTimestampObjectInspector =>
+      (o: Any) => DateUtils.toJavaTimestamp(o.asInstanceOf[Long])
+
     case soi: StandardStructObjectInspector =>
       val wrappers = soi.getAllStructFieldRefs.map(ref => wrapperFor(ref.getFieldObjectInspector))
       (o: Any) => {
@@ -465,7 +469,7 @@ private[hive] trait HiveInspectors {
       case _: DateObjectInspector if x.preferWritable() => getDateWritable(a)
       case _: DateObjectInspector => DateUtils.toJavaDate(a.asInstanceOf[Int])
       case _: TimestampObjectInspector if x.preferWritable() => getTimestampWritable(a)
-      case _: TimestampObjectInspector => a.asInstanceOf[java.sql.Timestamp]
+      case _: TimestampObjectInspector => DateUtils.toJavaTimestamp(a.asInstanceOf[Long])
     }
     case x: SettableStructObjectInspector =>
       val fieldRefs = x.getAllStructFieldRefs
@@ -727,7 +731,7 @@ private[hive] trait HiveInspectors {
       TypeInfoFactory.voidTypeInfo, null)
 
   private def getStringWritable(value: Any): hadoopIo.Text =
-    if (value == null) null else new hadoopIo.Text(value.asInstanceOf[UTF8String].toString)
+    if (value == null) null else new hadoopIo.Text(value.asInstanceOf[UTF8String].getBytes)
 
   private def getIntWritable(value: Any): hadoopIo.IntWritable =
     if (value == null) null else new hadoopIo.IntWritable(value.asInstanceOf[Int])
@@ -776,7 +780,7 @@ private[hive] trait HiveInspectors {
     if (value == null) {
       null
     } else {
-      new hiveIo.TimestampWritable(value.asInstanceOf[java.sql.Timestamp])
+      new hiveIo.TimestampWritable(DateUtils.toJavaTimestamp(value.asInstanceOf[Long]))
     }
 
   private def getDecimalWritable(value: Any): hiveIo.HiveDecimalWritable =
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 334bfccc9d200..d3c82d8c2e326 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -363,10 +363,10 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging {
             row.update(ordinal, HiveShim.toCatalystDecimal(oi, value))
         case oi: TimestampObjectInspector =>
           (value: Any, row: MutableRow, ordinal: Int) =>
-            row.update(ordinal, oi.getPrimitiveJavaObject(value).clone())
+            row.setLong(ordinal, DateUtils.fromJavaTimestamp(oi.getPrimitiveJavaObject(value)))
         case oi: DateObjectInspector =>
           (value: Any, row: MutableRow, ordinal: Int) =>
-            row.update(ordinal, DateUtils.fromJavaDate(oi.getPrimitiveJavaObject(value)))
+            row.setInt(ordinal, DateUtils.fromJavaDate(oi.getPrimitiveJavaObject(value)))
         case oi: BinaryObjectInspector =>
           (value: Any, row: MutableRow, ordinal: Int) =>
             row.update(ordinal, oi.getPrimitiveJavaObject(value))
diff --git a/sql/hive/src/test/resources/golden/timestamp cast #5-0-dbd7bcd167d322d6617b884c02c7f247 b/sql/hive/src/test/resources/golden/timestamp cast #5-0-dbd7bcd167d322d6617b884c02c7f247
index 27de46fdf22ac..84a31a5a6970b 100644
--- a/sql/hive/src/test/resources/golden/timestamp cast #5-0-dbd7bcd167d322d6617b884c02c7f247	
+++ b/sql/hive/src/test/resources/golden/timestamp cast #5-0-dbd7bcd167d322d6617b884c02c7f247	
@@ -1 +1 @@
--0.0010000000000000009
+-0.001

From 6a47114bc297f0bce874e425feb1c24a5c26cef0 Mon Sep 17 00:00:00 2001
From: "navis.ryu" <navis@apache.org>
Date: Wed, 10 Jun 2015 18:19:12 -0700
Subject: [PATCH 445/525] [SPARK-8285] [SQL] CombineSum should be calculated as
 unlimited decimal first

    case cs  CombineSum(expr) =>
        val calcType = expr.dataType
          expr.dataType match {
            case DecimalType.Fixed(_, _) =>
              DecimalType.Unlimited
            case _ =>
              expr.dataType
          }
calcType is always expr.dataType. credits are all belong to IntelliJ

Author: navis.ryu <navis@apache.org>

Closes #6736 from navis/SPARK-8285 and squashes the following commits:

20382c1 [navis.ryu] [SPARK-8285] [SQL] CombineSum should be calculated as unlimited decimal first
---
 .../org/apache/spark/sql/execution/GeneratedAggregate.scala   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 3e27c1bde2dfd..af3791734d0c9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -118,7 +118,7 @@ case class GeneratedAggregate(
         AggregateEvaluation(currentSum :: Nil, initialValue :: Nil, updateFunction :: Nil, result)
 
       case cs @ CombineSum(expr) =>
-        val calcType = expr.dataType
+        val calcType =
           expr.dataType match {
             case DecimalType.Fixed(_, _) =>
               DecimalType.Unlimited
@@ -129,7 +129,7 @@ case class GeneratedAggregate(
         val currentSum = AttributeReference("currentSum", calcType, nullable = true)()
         val initialValue = Literal.create(null, calcType)
 
-        // Coalasce avoids double calculation...
+        // Coalesce avoids double calculation...
         // but really, common sub expression elimination would be better....
         val zero = Cast(Literal(0), calcType)
         // If we're evaluating UnscaledValue(x), we can do Count on x directly, since its

From 4e42842e82e058d54329bd66185d8a7e77ab335a Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Wed, 10 Jun 2015 18:22:47 -0700
Subject: [PATCH 446/525] [SPARK-8164] transformExpressions should support
 nested expression sequence

Currently we only support `Seq[Expression]`, we should handle cases like `Seq[Seq[Expression]]` so that we can remove the unnecessary `GroupExpression`.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6706 from cloud-fan/clean and squashes the following commits:

60a1193 [Wenchen Fan] support nested expression sequence and remove GroupExpression
---
 .../sql/catalyst/analysis/Analyzer.scala      |  6 ++---
 .../sql/catalyst/expressions/Expression.scala | 12 ----------
 .../spark/sql/catalyst/plans/QueryPlan.scala  | 22 +++++++++----------
 .../plans/logical/basicOperators.scala        |  2 +-
 .../sql/catalyst/trees/TreeNodeSuite.scala    | 14 ++++++++++++
 .../apache/spark/sql/execution/Expand.scala   |  4 ++--
 6 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index c4f12cfe87993..cbd8def4f1d3c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -172,8 +172,8 @@ class Analyzer(
      * expressions which equal GroupBy expressions with Literal(null), if those expressions
      * are not set for this grouping set (according to the bit mask).
      */
-    private[this] def expand(g: GroupingSets): Seq[GroupExpression] = {
-      val result = new scala.collection.mutable.ArrayBuffer[GroupExpression]
+    private[this] def expand(g: GroupingSets): Seq[Seq[Expression]] = {
+      val result = new scala.collection.mutable.ArrayBuffer[Seq[Expression]]
 
       g.bitmasks.foreach { bitmask =>
         // get the non selected grouping attributes according to the bit mask
@@ -194,7 +194,7 @@ class Analyzer(
             Literal.create(bitmask, IntegerType)
         })
 
-        result += GroupExpression(substitution)
+        result += substitution
       }
 
       result.toSeq
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index a05794f1dbd86..63dd5f9854aed 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -239,18 +239,6 @@ abstract class UnaryExpression extends Expression with trees.UnaryNode[Expressio
   }
 }
 
-// TODO Semantically we probably not need GroupExpression
-// All we need is holding the Seq[Expression], and ONLY used in doing the
-// expressions transformation correctly. Probably will be removed since it's
-// not like a real expressions.
-case class GroupExpression(children: Seq[Expression]) extends Expression {
-  self: Product =>
-  override def eval(input: Row): Any = throw new UnsupportedOperationException
-  override def nullable: Boolean = false
-  override def foldable: Boolean = false
-  override def dataType: DataType = throw new UnsupportedOperationException
-}
-
 /**
  * Expressions that require a specific `DataType` as input should implement this trait
  * so that the proper type conversions can be performed in the analyzer.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index eff5c61644944..2f545bb432165 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -81,17 +81,16 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy
       }
     }
 
-    val newArgs = productIterator.map {
+    def recursiveTransform(arg: Any): AnyRef = arg match {
       case e: Expression => transformExpressionDown(e)
       case Some(e: Expression) => Some(transformExpressionDown(e))
       case m: Map[_, _] => m
       case d: DataType => d // Avoid unpacking Structs
-      case seq: Traversable[_] => seq.map {
-        case e: Expression => transformExpressionDown(e)
-        case other => other
-      }
+      case seq: Traversable[_] => seq.map(recursiveTransform)
       case other: AnyRef => other
-    }.toArray
+    }
+
+    val newArgs = productIterator.map(recursiveTransform).toArray
 
     if (changed) makeCopy(newArgs) else this
   }
@@ -114,17 +113,16 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy
       }
     }
 
-    val newArgs = productIterator.map {
+    def recursiveTransform(arg: Any): AnyRef = arg match {
       case e: Expression => transformExpressionUp(e)
       case Some(e: Expression) => Some(transformExpressionUp(e))
       case m: Map[_, _] => m
       case d: DataType => d // Avoid unpacking Structs
-      case seq: Traversable[_] => seq.map {
-        case e: Expression => transformExpressionUp(e)
-        case other => other
-      }
+      case seq: Traversable[_] => seq.map(recursiveTransform)
       case other: AnyRef => other
-    }.toArray
+    }
+
+    val newArgs = productIterator.map(recursiveTransform).toArray
 
     if (changed) makeCopy(newArgs) else this
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index e77e5c27b687a..963c7820914f3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -226,7 +226,7 @@ case class Window(
  * @param child       Child operator
  */
 case class Expand(
-    projections: Seq[GroupExpression],
+    projections: Seq[Seq[Expression]],
     output: Seq[Attribute],
     child: LogicalPlan) extends UnaryNode {
   override def statistics: Statistics = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
index 67db3d5e6d751..8ec79c3d4d28d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
@@ -31,6 +31,11 @@ case class Dummy(optKey: Option[Expression]) extends Expression {
   override def eval(input: Row): Any = null.asInstanceOf[Any]
 }
 
+case class ComplexPlan(exprs: Seq[Seq[Expression]])
+  extends org.apache.spark.sql.catalyst.plans.logical.LeafNode {
+  override def output: Seq[Attribute] = Nil
+}
+
 class TreeNodeSuite extends SparkFunSuite {
   test("top node changed") {
     val after = Literal(1) transform { case Literal(1, _) => Literal(2) }
@@ -220,4 +225,13 @@ class TreeNodeSuite extends SparkFunSuite {
       assert(expected === actual)
     }
   }
+
+  test("transformExpressions on nested expression sequence") {
+    val plan = ComplexPlan(Seq(Seq(Literal(1)), Seq(Literal(2))))
+    val actual = plan.transformExpressions {
+      case Literal(value, _) => Literal(value.toString)
+    }
+    val expected = ComplexPlan(Seq(Seq(Literal("1")), Seq(Literal("2"))))
+    assert(expected === actual)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
index f16ca36909fab..4b601c11924b9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
@@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{UnknownPartitioning, Partit
  */
 @DeveloperApi
 case class Expand(
-    projections: Seq[GroupExpression],
+    projections: Seq[Seq[Expression]],
     output: Seq[Attribute],
     child: SparkPlan)
   extends UnaryNode {
@@ -49,7 +49,7 @@ case class Expand(
       // workers via closure. However we can't assume the Projection
       // is serializable because of the code gen, so we have to
       // create the projections within each of the partition processing.
-      val groups = projections.map(ee => newProjection(ee.children, child.output)).toArray
+      val groups = projections.map(ee => newProjection(ee, child.output)).toArray
 
       new Iterator[Row] {
         private[this] var result: Row = _

From 9fe3adccef687c92ff1ac17d946af089c8e28d66 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Wed, 10 Jun 2015 19:55:10 -0700
Subject: [PATCH 447/525] [SPARK-8248][SQL] string function: length

Author: Cheng Hao <hao.cheng@intel.com>

Closes #6724 from chenghao-intel/length and squashes the following commits:

aaa3c31 [Cheng Hao] revert the additional change
97148a9 [Cheng Hao] remove the codegen testing temporally
ae08003 [Cheng Hao] update the comments
1eb1fd1 [Cheng Hao] simplify the code as commented
3e92d32 [Cheng Hao] use the selectExpr in unit test intead of SQLQuery
3c729aa [Cheng Hao] fix bug for constant null value in codegen
3641f06 [Cheng Hao] keep the length() method for registered function
8e30171 [Cheng Hao] update the code as comment
db604ae [Cheng Hao] Add code gen support
548d2ef [Cheng Hao] register the length()
09a0738 [Cheng Hao] add length support
---
 .../catalyst/analysis/FunctionRegistry.scala  | 13 +++++++-----
 .../sql/catalyst/expressions/Expression.scala |  3 +++
 .../expressions/stringOperations.scala        | 21 +++++++++++++++++++
 .../expressions/StringFunctionsSuite.scala    | 12 +++++++++++
 .../org/apache/spark/sql/functions.scala      | 18 ++++++++++++++++
 .../spark/sql/DataFrameFunctionsSuite.scala   | 20 ++++++++++++++++++
 6 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index ba89a5c8d1372..39875d7f216b2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -89,14 +89,10 @@ object FunctionRegistry {
     expression[CreateArray]("array"),
     expression[Coalesce]("coalesce"),
     expression[Explode]("explode"),
-    expression[Lower]("lower"),
-    expression[Substring]("substr"),
-    expression[Substring]("substring"),
     expression[Rand]("rand"),
     expression[Randn]("randn"),
     expression[CreateStruct]("struct"),
     expression[Sqrt]("sqrt"),
-    expression[Upper]("upper"),
 
     // Math functions
     expression[Acos]("acos"),
@@ -132,7 +128,14 @@ object FunctionRegistry {
     expression[Last]("last"),
     expression[Max]("max"),
     expression[Min]("min"),
-    expression[Sum]("sum")
+    expression[Sum]("sum"),
+
+    // string functions
+    expression[Lower]("lower"),
+    expression[StringLength]("length"),
+    expression[Substring]("substr"),
+    expression[Substring]("substring"),
+    expression[Upper]("upper")
   )
 
   val builtin: FunctionRegistry = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 63dd5f9854aed..8c1e4d74f9df1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -212,6 +212,9 @@ abstract class LeafExpression extends Expression with trees.LeafNode[Expression]
 abstract class UnaryExpression extends Expression with trees.UnaryNode[Expression] {
   self: Product =>
 
+  override def foldable: Boolean = child.foldable
+  override def nullable: Boolean = child.nullable
+
   /**
    * Called by unary expressions to generate a code block that returns null if its parent returns
    * null, and if not not null, use `f` to generate the expression.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 856f56488c7a5..345038323ddc5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -294,3 +294,24 @@ object Substring {
     apply(str, pos, Literal(Integer.MAX_VALUE))
   }
 }
+
+/**
+ * A function that return the length of the given string expression.
+ */
+case class StringLength(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+  override def dataType: DataType = IntegerType
+  override def expectedChildTypes: Seq[DataType] = Seq(StringType)
+
+  override def eval(input: Row): Any = {
+    val string = child.eval(input)
+    if (string == null) null else string.asInstanceOf[UTF8String].length
+  }
+
+  override def toString: String = s"length($child)"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, c => s"($c).length()")
+  }
+}
+
+
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index 2e81296c4e623..d363e631540d8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -215,4 +215,16 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       evaluate("abbbbc" rlike regEx, create_row("**"))
     }
   }
+
+  test("length for string") {
+    val regEx = 'a.string.at(0)
+    checkEvaluation(StringLength(Literal("abc")), 3, create_row("abdef"))
+    checkEvaluation(StringLength(regEx), 5, create_row("abdef"))
+    checkEvaluation(StringLength(regEx), 0, create_row(""))
+    checkEvaluation(StringLength(regEx), null, create_row(null))
+    // TODO currently bug in codegen, let's temporally disable this
+    // checkEvaluation(StringLength(Literal.create(null, StringType)), null, create_row("abdef"))
+  }
+
+
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index b3fc1e6cd987e..083f6b6bceee8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -37,6 +37,7 @@ import org.apache.spark.util.Utils
  * @groupname normal_funcs Non-aggregate functions
  * @groupname math_funcs Math functions
  * @groupname window_funcs Window functions
+ * @groupname string_funcs String functions
  * @groupname Ungrouped Support functions for DataFrames.
  * @since 1.3.0
  */
@@ -1317,6 +1318,23 @@ object functions {
    */
   def toRadians(columnName: String): Column = toRadians(Column(columnName))
 
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  // String functions
+  //////////////////////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * Computes the length of a given string value
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def strlen(e: Column): Column = StringLength(e.expr)
+
+  /**
+   * Computes the length of a given string column
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def strlen(columnName: String): Column = strlen(Column(columnName))
 
   //////////////////////////////////////////////////////////////////////////////////////////////
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index b93ad39f5da45..171a2151e67ae 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -109,4 +109,24 @@ class DataFrameFunctionsSuite extends QueryTest {
       testData2.select(bitwiseNOT($"a")),
       testData2.collect().toSeq.map(r => Row(~r.getInt(0))))
   }
+
+  test("length") {
+    checkAnswer(
+      nullStrings.select(strlen($"s"), strlen("s")),
+      nullStrings.collect().toSeq.map { r =>
+        val v = r.getString(1)
+        val l = if (v == null) null else v.length
+        Row(l, l)
+      })
+
+    checkAnswer(
+      nullStrings.selectExpr("length(s)"),
+      nullStrings.collect().toSeq.map { r =>
+        val v = r.getString(1)
+        val l = if (v == null) null else v.length
+        Row(l)
+      })
+  }
+
+
 }

From 2758ff0a96f03a61e10999b2462acf7a13236b7c Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Wed, 10 Jun 2015 20:22:32 -0700
Subject: [PATCH 448/525] [SPARK-8217] [SQL] math function log2

Author: Daoyuan Wang <daoyuan.wang@intel.com>

This patch had conflicts when merged, resolved by
Committer: Reynold Xin <rxin@databricks.com>

Closes #6718 from adrian-wang/udflog2 and squashes the following commits:

3909f48 [Daoyuan Wang] math function: log2
---
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../spark/sql/catalyst/expressions/math.scala | 17 ++++++++++++++++
 .../expressions/MathFunctionsSuite.scala      |  6 ++++++
 .../org/apache/spark/sql/functions.scala      | 20 +++++++++++++++++--
 .../spark/sql/DataFrameFunctionsSuite.scala   | 12 +++++++++++
 5 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 39875d7f216b2..a7816e327526f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -111,6 +111,7 @@ object FunctionRegistry {
     expression[Log10]("log10"),
     expression[Log1p]("log1p"),
     expression[Pi]("pi"),
+    expression[Log2]("log2"),
     expression[Pow]("pow"),
     expression[Rint]("rint"),
     expression[Signum]("signum"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index e1d8c9a0cdb5a..97e960b8d6422 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -161,6 +161,23 @@ case class Floor(child: Expression) extends UnaryMathExpression(math.floor, "FLO
 
 case class Log(child: Expression) extends UnaryMathExpression(math.log, "LOG")
 
+case class Log2(child: Expression)
+  extends UnaryMathExpression((x: Double) => math.log(x) / math.log(2), "LOG2") {
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val eval = child.gen(ctx)
+    eval.code + s"""
+      boolean ${ev.isNull} = ${eval.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${ev.primitive} = java.lang.Math.log(${eval.primitive}) / java.lang.Math.log(2);
+        if (Double.valueOf(${ev.primitive}).isNaN()) {
+          ${ev.isNull} = true;
+        }
+      }
+    """
+  }
+}
+
 case class Log10(child: Expression) extends UnaryMathExpression(math.log10, "LOG10")
 
 case class Log1p(child: Expression) extends UnaryMathExpression(math.log1p, "LOG1P")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 1fe69059d39da..864c954ee82cb 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -185,6 +185,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     testUnary(Log1p, math.log1p, (-10 to -2).map(_ * 1.0), expectNull = true)
   }
 
+  test("log2") {
+    def f: (Double) => Double = (x: Double) => math.log(x) / math.log(2)
+    testUnary(Log2, f, (0 to 20).map(_ * 0.1))
+    testUnary(Log2, f, (-5 to -1).map(_ * 1.0), expectNull = true)
+  }
+
   test("pow") {
     testBinary(Pow, math.pow, (-5 to 5).map(v => (v * 1.0, v * 1.0)))
     testBinary(Pow, math.pow, Seq((-1.0, 0.9), (-2.2, 1.7), (-2.2, -1.7)), expectNull = true)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 083f6b6bceee8..c5b77724aae17 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1084,7 +1084,7 @@ object functions {
   def log(columnName: String): Column = log(Column(columnName))
 
   /**
-   * Computes the logarithm of the given value in Base 10.
+   * Computes the logarithm of the given value in base 10.
    *
    * @group math_funcs
    * @since 1.4.0
@@ -1092,7 +1092,7 @@ object functions {
   def log10(e: Column): Column = Log10(e.expr)
 
   /**
-   * Computes the logarithm of the given value in Base 10.
+   * Computes the logarithm of the given value in base 10.
    *
    * @group math_funcs
    * @since 1.4.0
@@ -1124,6 +1124,22 @@ object functions {
    */
   def pi(): Column = Pi()
 
+  /**
+   * Computes the logarithm of the given column in base 2.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def log2(expr: Column): Column = Log2(expr.expr)
+
+  /**
+   * Computes the logarithm of the given value in base 2.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def log2(columnName: String): Column = log2(Column(columnName))
+
   /**
    * Returns the value of the first argument raised to the power of the second argument.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 171a2151e67ae..659b64c185f43 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -128,5 +128,17 @@ class DataFrameFunctionsSuite extends QueryTest {
       })
   }
 
+  test("log2 functions test") {
+    val df = Seq((1, 2)).toDF("a", "b")
+    checkAnswer(
+      df.select(log2("b") + log2("a")),
+      Row(1))
 
+    checkAnswer(
+      ctx.sql("SELECT LOG2(8)"),
+      Row(3))
+    checkAnswer(
+      ctx.sql("SELECT LOG2(null)"),
+      Row(null))
+  }
 }

From a777eb04bf981312b640326607158f78dd4163cd Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Wed, 10 Jun 2015 21:13:47 -0700
Subject: [PATCH 449/525] [HOTFIX] Adding more contributor name bindings

---
 dev/create-release/known_translations | 42 +++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations
index 0a599b5a65549..bbd4330e1c2e5 100644
--- a/dev/create-release/known_translations
+++ b/dev/create-release/known_translations
@@ -91,3 +91,45 @@ zapletal-martin - Martin Zapletal
 zuxqoj - Shekhar Bansal
 mingyukim - Mingyu Kim
 sigmoidanalytics - Mayur Rustagi
+AiHe - Ai He
+BenFradet - Ben Fradet
+FavioVazquez - Favio Vazquez
+JaysonSunshine - Jayson Sunshine
+Liuchang0812 - Liu Chang
+Sephiroth-Lin - Sephiroth Lin
+baishuo - Cheng Lian
+daisukebe - Shixiong Zhu
+dobashim - Masaru Dobashi
+ehnalis - Zoltan Zvara
+emres - Emre Sevinc
+gchen - Guancheng Chen
+haiyangsea - Haiyang Sea
+hlin09 - Hao Lin
+hqzizania - Qian Huang
+jeanlyn - Jean Lyn
+jerluc - Jeremy A. Lucas
+jrabary - Jaonary Rabarisoa
+judynash - Judy Nash
+kaka1992 - Chen Song
+ksonj - Kalle Jepsen
+kuromatsu-nobuyuki - Nobuyuki Kuromatsu
+lazyman500 - Dong Xu
+leahmcguire - Leah McGuire
+mbittmann - Mark Bittmann
+mbonaci - Marko Bonaci
+meawoppl - Matthew Goodman
+nyaapa - Arsenii Krasikov
+phatak-dev - Madhukara Phatak
+prabeesh - Prabeesh K
+rakeshchalasani - Rakesh Chalasani
+raschild - Marcelo Vanzin
+rekhajoshm - Rekha Joshi
+sisihj - June He
+szheng79 - Shuai Zheng
+ted-yu - Andrew Or
+texasmichelle - Michelle Casbon
+vinodkc - Vinod KC
+yongtang - Yong Tang
+ypcat - Pei-Lun Lee
+zhichao-li - Zhichao Li
+zzcclp - Zhichao Zhang

From e84545fa771dde90de5675a9c551fe287af6f7fb Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Wed, 10 Jun 2015 22:56:36 -0700
Subject: [PATCH 450/525] [HOTFIX] Fixing errors in name mappings

---
 dev/create-release/known_translations | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations
index bbd4330e1c2e5..5f2671a6e5053 100644
--- a/dev/create-release/known_translations
+++ b/dev/create-release/known_translations
@@ -97,8 +97,6 @@ FavioVazquez - Favio Vazquez
 JaysonSunshine - Jayson Sunshine
 Liuchang0812 - Liu Chang
 Sephiroth-Lin - Sephiroth Lin
-baishuo - Cheng Lian
-daisukebe - Shixiong Zhu
 dobashim - Masaru Dobashi
 ehnalis - Zoltan Zvara
 emres - Emre Sevinc
@@ -122,11 +120,9 @@ nyaapa - Arsenii Krasikov
 phatak-dev - Madhukara Phatak
 prabeesh - Prabeesh K
 rakeshchalasani - Rakesh Chalasani
-raschild - Marcelo Vanzin
 rekhajoshm - Rekha Joshi
 sisihj - June He
 szheng79 - Shuai Zheng
-ted-yu - Andrew Or
 texasmichelle - Michelle Casbon
 vinodkc - Vinod KC
 yongtang - Yong Tang

From 6b68366df345d4572cf138f9efe17e23d0d1971e Mon Sep 17 00:00:00 2001
From: Adam Roberts <aroberts@uk.ibm.com>
Date: Thu, 11 Jun 2015 08:40:46 +0100
Subject: [PATCH 451/525] [SPARK-8289] Specify stack size for consistency with
 Java tests - resolves test failures

This change is a simple one and specifies a stack size of 4096k instead of the vendor default for Java tests (the defaults vary between Java vendors). This remedies test failures observed with JavaALSSuite with IBM and Oracle Java owing to a lower default size in comparison to the size with OpenJDK. 4096k is a suitable default where the tests pass with each Java vendor tested. The alternative is to reduce the number of iterations in the test (no observed failures with 5 iterations instead of 15).

-Xss works with Oracle's HotSpot VM, IBM's J9 VM and OpenJDK (IcedTea).

I have ensured this does not have any negative implications for other tests.

Author: Adam Roberts <aroberts@uk.ibm.com>
Author: a-roberts <aroberts@uk.ibm.com>

Closes #6727 from a-roberts/IncJavaStackSize and squashes the following commits:

ab40aea [Adam Roberts] Specify stack size for SBT builds
5032d8d [a-roberts] Update pom.xml
---
 pom.xml                  | 2 +-
 project/SparkBuild.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pom.xml b/pom.xml
index e9700a5d7b149..6d4f717d4931b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1244,7 +1244,7 @@
               <include>**/*Suite.java</include>
             </includes>
             <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
-            <argLine>-Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m</argLine>
+            <argLine>-Xmx3g -Xss4096k -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m</argLine>
             <environmentVariables>
               <!--
                 Setting SPARK_DIST_CLASSPATH is a simple way to make sure any child processes
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index d7e374558c5e2..aa75a64b63caf 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -514,7 +514,7 @@ object TestSettings {
     javaOptions in Test ++= System.getProperties.filter(_._1 startsWith "spark")
       .map { case (k,v) => s"-D$k=$v" }.toSeq,
     javaOptions in Test += "-ea",
-    javaOptions in Test ++= "-Xmx3g -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g"
+    javaOptions in Test ++= "-Xmx3g -Xss4096k -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g"
       .split(" ").toSeq,
     javaOptions += "-Xmx3g",
     // Show full stack trace and duration in test cases.

From 424b0075a1a31c251451c6a75c6ba8e81c39453d Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 11 Jun 2015 01:00:41 -0700
Subject: [PATCH 452/525] [SPARK-6411] [SQL] [PySpark] support date/datetime
 with timezone in Python

Spark SQL does not support timezone, and Pyrolite does not support timezone well. This patch will convert datetime into POSIX timestamp (without confusing of timezone), which is used by SQL. If the datetime object does not have timezone, it's treated as local time.

The timezone in RDD will be lost after one round trip, all the datetime from SQL will be local time.

Because of Pyrolite, datetime from SQL only has precision as 1 millisecond.

This PR also drop the timezone in date, convert it to number of days since epoch (used in SQL).

Author: Davies Liu <davies@databricks.com>

Closes #6250 from davies/tzone and squashes the following commits:

44d8497 [Davies Liu] add timezone support for DateType
99d9d9c [Davies Liu] use int for timestamp
10aa7ca [Davies Liu] Merge branch 'master' of github.com:apache/spark into tzone
6a29aa4 [Davies Liu] support datetime with timezone
---
 python/pyspark/sql/tests.py                   | 32 +++++++++++++++++++
 python/pyspark/sql/types.py                   | 27 ++++++++++------
 .../spark/sql/execution/pythonUdfs.scala      |  3 +-
 3 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index a6fce50c76c2b..b5fbb7d098820 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -26,6 +26,7 @@
 import tempfile
 import pickle
 import functools
+import time
 import datetime
 
 import py4j
@@ -47,6 +48,20 @@
 from pyspark.sql.window import Window
 
 
+class UTC(datetime.tzinfo):
+    """UTC"""
+    ZERO = datetime.timedelta(0)
+
+    def utcoffset(self, dt):
+        return self.ZERO
+
+    def tzname(self, dt):
+        return "UTC"
+
+    def dst(self, dt):
+        return self.ZERO
+
+
 class ExamplePointUDT(UserDefinedType):
     """
     User-defined type (UDT) for ExamplePoint.
@@ -588,6 +603,23 @@ def test_filter_with_datetime(self):
         self.assertEqual(0, df.filter(df.date > date).count())
         self.assertEqual(0, df.filter(df.time > time).count())
 
+    def test_time_with_timezone(self):
+        day = datetime.date.today()
+        now = datetime.datetime.now()
+        ts = time.mktime(now.timetuple()) + now.microsecond / 1e6
+        # class in __main__ is not serializable
+        from pyspark.sql.tests import UTC
+        utc = UTC()
+        utcnow = datetime.datetime.fromtimestamp(ts, utc)
+        df = self.sqlCtx.createDataFrame([(day, now, utcnow)])
+        day1, now1, utcnow1 = df.first()
+        # Pyrolite serialize java.sql.Date as datetime, will be fixed in new version
+        self.assertEqual(day1.date(), day)
+        # Pyrolite does not support microsecond, the error should be
+        # less than 1 millisecond
+        self.assertTrue(now - now1 < datetime.timedelta(0.001))
+        self.assertTrue(now - utcnow1 < datetime.timedelta(0.001))
+
     def test_dropna(self):
         schema = StructType([
             StructField("name", StringType(), True),
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 8f286b631f4f0..23d9adb0daea1 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -655,12 +655,15 @@ def _need_python_to_sql_conversion(dataType):
             _need_python_to_sql_conversion(dataType.valueType)
     elif isinstance(dataType, UserDefinedType):
         return True
-    elif isinstance(dataType, TimestampType):
+    elif isinstance(dataType, (DateType, TimestampType)):
         return True
     else:
         return False
 
 
+EPOCH_ORDINAL = datetime.datetime(1970, 1, 1).toordinal()
+
+
 def _python_to_sql_converter(dataType):
     """
     Returns a converter that converts a Python object into a SQL datum for the given type.
@@ -698,26 +701,32 @@ def converter(obj):
                     return tuple(c(d.get(n)) for n, c in zip(names, converters))
                 else:
                     return tuple(c(v) for c, v in zip(converters, obj))
-            else:
+            elif obj is not None:
                 raise ValueError("Unexpected tuple %r with type %r" % (obj, dataType))
         return converter
     elif isinstance(dataType, ArrayType):
         element_converter = _python_to_sql_converter(dataType.elementType)
-        return lambda a: [element_converter(v) for v in a]
+        return lambda a: a and [element_converter(v) for v in a]
     elif isinstance(dataType, MapType):
         key_converter = _python_to_sql_converter(dataType.keyType)
         value_converter = _python_to_sql_converter(dataType.valueType)
-        return lambda m: dict([(key_converter(k), value_converter(v)) for k, v in m.items()])
+        return lambda m: m and dict([(key_converter(k), value_converter(v)) for k, v in m.items()])
+
     elif isinstance(dataType, UserDefinedType):
-        return lambda obj: dataType.serialize(obj)
+        return lambda obj: obj and dataType.serialize(obj)
+
+    elif isinstance(dataType, DateType):
+        return lambda d: d and d.toordinal() - EPOCH_ORDINAL
+
     elif isinstance(dataType, TimestampType):
 
         def to_posix_timstamp(dt):
-            if dt.tzinfo is None:
-                return int(time.mktime(dt.timetuple()) * 1e7 + dt.microsecond * 10)
-            else:
-                return int(calendar.timegm(dt.utctimetuple()) * 1e7 + dt.microsecond * 10)
+            if dt:
+                seconds = (calendar.timegm(dt.utctimetuple()) if dt.tzinfo
+                           else time.mktime(dt.timetuple()))
+                return int(seconds * 1e7 + dt.microsecond * 10)
         return to_posix_timstamp
+
     else:
         raise ValueError("Unexpected type %r" % dataType)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
index 955b478a4882f..b1333ec09a09a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
@@ -28,8 +28,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.api.python.{PythonBroadcast, PythonRDD}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.Row
-import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.{Row, _}
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule

From 1191c3efc605d9c6d1df4b38ddae8d210a361b5b Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 11 Jun 2015 12:57:33 -0700
Subject: [PATCH 453/525] [SPARK-8305] [SPARK-8190] [SQL] improve codegen

This PR fix a few small issues about codgen:

1. cast decimal to boolean
2. do not inline literal with null
3. improve SpecificRow.equals()
4. test expressions with optimized express
5. fix compare with BinaryType

cc rxin chenghao-intel

Author: Davies Liu <davies@databricks.com>

Closes #6755 from davies/fix_codegen and squashes the following commits:

ef27343 [Davies Liu] address comments
6617ea6 [Davies Liu] fix scala tyle
70b7dda [Davies Liu] improve codegen
---
 .../scala/org/apache/spark/sql/BaseRow.java   | 21 +++++++++
 .../spark/sql/catalyst/expressions/Cast.scala |  4 +-
 .../expressions/codegen/CodeGenerator.scala   | 34 ++++++++++----
 .../codegen/GenerateMutableProjection.scala   |  1 -
 .../codegen/GenerateOrdering.scala            | 39 ++--------------
 .../codegen/GenerateProjection.scala          | 45 +++++++++----------
 .../catalyst/expressions/conditionals.scala   |  2 +-
 .../sql/catalyst/expressions/literals.scala   |  3 +-
 .../sql/catalyst/expressions/predicates.scala | 20 ++++-----
 .../spark/sql/catalyst/util/TypeUtils.scala   |  8 ++++
 .../apache/spark/sql/types/BinaryType.scala   |  7 +--
 .../sql/catalyst/expressions/CastSuite.scala  | 37 +++++++++++++--
 .../expressions/ExpressionEvalHelper.scala    | 12 +++++
 .../ExpressionOptimizationSuite.scala         | 37 ---------------
 14 files changed, 141 insertions(+), 129 deletions(-)
 delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ExpressionOptimizationSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java b/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
index 6584882a62fd1..e91daf17f8085 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
@@ -154,6 +154,27 @@ public int fieldIndex(String name) {
     throw new UnsupportedOperationException();
   }
 
+  /**
+   * A generic version of Row.equals(Row), which is used for tests.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (other instanceof Row) {
+      Row row = (Row) other;
+      int n = size();
+      if (n != row.size()) {
+        return false;
+      }
+      for (int i = 0; i < n; i ++) {
+        if (isNullAt(i) != row.isNullAt(i) || (!isNullAt(i) && !get(i).equals(row.get(i)))) {
+          return false;
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+
   @Override
   public Row copy() {
     final int n = size();
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 8d93957fea2fc..037efd75580d3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -141,7 +141,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case ByteType =>
       buildCast[Byte](_, _ != 0)
     case DecimalType() =>
-      buildCast[Decimal](_, _ != 0)
+      buildCast[Decimal](_, _ != Decimal(0))
     case DoubleType =>
       buildCast[Double](_, _ != 0)
     case FloatType =>
@@ -454,7 +454,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
       case (BooleanType, dt: NumericType) =>
         defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})($c ? 1 : 0)")
       case (dt: DecimalType, BooleanType) =>
-        defineCodeGen(ctx, ev, c => s"$c.isZero()")
+        defineCodeGen(ctx, ev, c => s"!$c.isZero()")
       case (dt: NumericType, BooleanType) =>
         defineCodeGen(ctx, ev, c => s"$c != 0")
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 80aa8fa056146..ecf8e0d1a7a22 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -161,15 +161,23 @@ class CodeGenContext {
   }
 
   /**
-   * Returns a function to generate equal expression in Java
+   * Generate code for equal expression in Java
    */
-  def equalFunc(dataType: DataType): ((String, String) => String) = dataType match {
-    case BinaryType => { case (eval1, eval2) =>
-      s"java.util.Arrays.equals($eval1, $eval2)" }
-    case IntegerType | BooleanType | LongType | DoubleType | FloatType | ShortType | ByteType =>
-      { case (eval1, eval2) => s"$eval1 == $eval2" }
-    case other =>
-      { case (eval1, eval2) => s"$eval1.equals($eval2)" }
+  def genEqual(dataType: DataType, c1: String, c2: String): String = dataType match {
+    case BinaryType => s"java.util.Arrays.equals($c1, $c2)"
+    case dt: DataType if isPrimitiveType(dt) => s"$c1 == $c2"
+    case other => s"$c1.equals($c2)"
+  }
+
+  /**
+   * Generate code for compare expression in Java
+   */
+  def genComp(dataType: DataType, c1: String, c2: String): String = dataType match {
+    // Use signum() to keep any small difference bwteen float/double
+    case FloatType | DoubleType => s"(int)java.lang.Math.signum($c1 - $c2)"
+    case dt: DataType if isPrimitiveType(dt) => s"(int)($c1 - $c2)"
+    case BinaryType => s"org.apache.spark.sql.catalyst.util.TypeUtils.compareBinary($c1, $c2)"
+    case other => s"$c1.compare($c2)"
   }
 
   /**
@@ -182,6 +190,16 @@ class CodeGenContext {
    * Returns true if the data type has a special accessor and setter in [[Row]].
    */
   def isNativeType(dt: DataType): Boolean = nativeTypes.contains(dt)
+
+  /**
+   * List of data types who's Java type is primitive type
+   */
+  val primitiveTypes = nativeTypes ++ Seq(DateType, TimestampType)
+
+  /**
+   * Returns true if the Java type is primitive type
+   */
+  def isPrimitiveType(dt: DataType): Boolean = primitiveTypes.contains(dt)
 }
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index e5ee2accd8a84..ed3df547d1c90 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -82,7 +82,6 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
       }
     """
 
-
     logDebug(s"code for ${expressions.mkString(",")}:\n$code")
 
     val c = compile(code)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index 36e155d164a40..56ecc5fc06cc1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -21,7 +21,6 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.Private
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{BinaryType, NumericType}
 
 /**
  * Inherits some default implementation for Java from `Ordering[Row]`
@@ -55,39 +54,6 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
       val evalA = order.child.gen(ctx)
       val evalB = order.child.gen(ctx)
       val asc = order.direction == Ascending
-      val compare = order.child.dataType match {
-        case BinaryType =>
-          s"""
-            {
-              byte[] x = ${if (asc) evalA.primitive else evalB.primitive};
-              byte[] y = ${if (!asc) evalB.primitive else evalA.primitive};
-              int j = 0;
-              while (j < x.length && j < y.length) {
-                if (x[j] != y[j]) return x[j] - y[j];
-                j = j + 1;
-              }
-              int d = x.length - y.length;
-              if (d != 0) {
-                return d;
-              }
-            }"""
-        case _: NumericType =>
-          s"""
-            if (${evalA.primitive} != ${evalB.primitive}) {
-              if (${evalA.primitive} > ${evalB.primitive}) {
-                return ${if (asc) "1" else "-1"};
-              } else {
-                return ${if (asc) "-1" else "1"};
-              }
-            }"""
-        case _ =>
-          s"""
-            int comp = ${evalA.primitive}.compare(${evalB.primitive});
-            if (comp != 0) {
-              return ${if (asc) "comp" else "-comp"};
-            }"""
-      }
-
       s"""
           i = $a;
           ${evalA.code}
@@ -100,7 +66,10 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
           } else if (${evalB.isNull}) {
             return ${if (order.direction == Ascending) "1" else "-1"};
           } else {
-            $compare
+            int comp = ${ctx.genComp(order.child.dataType, evalA.primitive, evalB.primitive)};
+            if (comp != 0) {
+              return ${if (asc) "comp" else "-comp"};
+            }
           }
       """
     }.mkString("\n")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 274429cd1c55f..9b906c3ff5cde 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -72,14 +72,12 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
     }.mkString("\n        ")
 
     val specificAccessorFunctions = ctx.nativeTypes.map { dataType =>
-      val cases = expressions.zipWithIndex.map {
-        case (e, i) if e.dataType == dataType
-          || dataType == IntegerType && e.dataType == DateType
-          || dataType == LongType && e.dataType == TimestampType =>
-          s"case $i: return c$i;"
-        case _ => ""
+      val cases = expressions.zipWithIndex.flatMap {
+        case (e, i) if ctx.javaType(e.dataType) == ctx.javaType(dataType) =>
+          List(s"case $i: return c$i;")
+        case _ => Nil
       }.mkString("\n        ")
-      if (cases.count(_ != '\n') > 0) {
+      if (cases.length > 0) {
         s"""
       @Override
       public ${ctx.javaType(dataType)} ${ctx.accessorForType(dataType)}(int i) {
@@ -89,7 +87,8 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
         switch (i) {
         $cases
         }
-        return ${ctx.defaultValue(dataType)};
+        throw new IllegalArgumentException("Invalid index: " + i
+          + " in ${ctx.accessorForType(dataType)}");
       }"""
       } else {
         ""
@@ -97,14 +96,12 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
     }.mkString("\n")
 
     val specificMutatorFunctions = ctx.nativeTypes.map { dataType =>
-      val cases = expressions.zipWithIndex.map {
-        case (e, i) if e.dataType == dataType
-          || dataType == IntegerType && e.dataType == DateType
-          || dataType == LongType && e.dataType == TimestampType =>
-          s"case $i: { c$i = value; return; }"
-        case _ => ""
-      }.mkString("\n")
-      if (cases.count(_ != '\n') > 0) {
+      val cases = expressions.zipWithIndex.flatMap {
+        case (e, i) if ctx.javaType(e.dataType) == ctx.javaType(dataType) =>
+          List(s"case $i: { c$i = value; return; }")
+        case _ => Nil
+      }.mkString("\n        ")
+      if (cases.length > 0) {
         s"""
       @Override
       public void ${ctx.mutatorForType(dataType)}(int i, ${ctx.javaType(dataType)} value) {
@@ -112,6 +109,8 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
         switch (i) {
         $cases
         }
+        throw new IllegalArgumentException("Invalid index: " + i +
+          " in ${ctx.mutatorForType(dataType)}");
       }"""
       } else {
         ""
@@ -139,9 +138,10 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
     val columnChecks = expressions.zipWithIndex.map { case (e, i) =>
       s"""
-          if (isNullAt($i) != row.isNullAt($i) || !isNullAt($i) && !get($i).equals(row.get($i))) {
-            return false;
-          }
+        if (nullBits[$i] != row.nullBits[$i] ||
+          (!nullBits[$i] && !(${ctx.genEqual(e.dataType, s"c$i", s"row.c$i")}))) {
+          return false;
+        }
       """
     }.mkString("\n")
 
@@ -174,7 +174,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
       }
 
       public int size() { return ${expressions.length};}
-      private boolean[] nullBits = new boolean[${expressions.length}];
+      protected boolean[] nullBits = new boolean[${expressions.length}];
       public void setNullAt(int i) { nullBits[i] = true; }
       public boolean isNullAt(int i) { return nullBits[i]; }
 
@@ -207,9 +207,8 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
       @Override
       public boolean equals(Object other) {
-        if (other instanceof Row) {
-          Row row = (Row) other;
-          if (row.length() != size()) return false;
+        if (other instanceof SpecificRow) {
+          SpecificRow row = (SpecificRow) other;
           $columnChecks
           return true;
         }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 1a5cde26c9b13..72b9f23456a54 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -261,7 +261,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
           ${cond.code}
           if (${keyEval.isNull} && ${cond.isNull} ||
             !${keyEval.isNull} && !${cond.isNull}
-             && ${ctx.equalFunc(key.dataType)(keyEval.primitive, cond.primitive)}) {
+             && ${ctx.genEqual(key.dataType, keyEval.primitive, cond.primitive)}) {
             $got = true;
             ${res.code}
             ${ev.isNull} = ${res.isNull};
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 833c08a293dcb..ef50c50e13558 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -92,8 +92,7 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
     // change the isNull and primitive to consts, to inline them
     if (value == null) {
       ev.isNull = "true"
-      ev.primitive = ctx.defaultValue(dataType)
-      ""
+      s"final ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};"
     } else {
       dataType match {
         case BooleanType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 2c49352874fc3..7574d1cbda33e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -250,16 +250,11 @@ abstract class BinaryComparison extends BinaryExpression with Predicate {
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    left.dataType match {
-      case dt: NumericType if ctx.isNativeType(dt) => defineCodeGen (ctx, ev, {
-        (c1, c3) => s"$c1 $symbol $c3"
-      })
-      case DateType | TimestampType => defineCodeGen (ctx, ev, {
-        (c1, c3) => s"$c1 $symbol $c3"
-      })
-      case other => defineCodeGen (ctx, ev, {
-        (c1, c2) => s"$c1.compare($c2) $symbol 0"
-      })
+    if (ctx.isPrimitiveType(left.dataType)) {
+      // faster version
+      defineCodeGen(ctx, ev, (c1, c2) => s"$c1 $symbol $c2")
+    } else {
+      defineCodeGen(ctx, ev, (c1, c2) => s"${ctx.genComp(left.dataType, c1, c2)} $symbol 0")
     }
   }
 
@@ -280,8 +275,9 @@ case class EqualTo(left: Expression, right: Expression) extends BinaryComparison
     if (left.dataType != BinaryType) l == r
     else java.util.Arrays.equals(l.asInstanceOf[Array[Byte]], r.asInstanceOf[Array[Byte]])
   }
+
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    defineCodeGen(ctx, ev, ctx.equalFunc(left.dataType))
+    defineCodeGen(ctx, ev, (c1, c2) => ctx.genEqual(left.dataType, c1, c2))
   }
 }
 
@@ -307,7 +303,7 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval1 = left.gen(ctx)
     val eval2 = right.gen(ctx)
-    val equalCode = ctx.equalFunc(left.dataType)(eval1.primitive, eval2.primitive)
+    val equalCode = ctx.genEqual(left.dataType, eval1.primitive, eval2.primitive)
     ev.isNull = "false"
     eval1.code + eval2.code + s"""
         boolean ${ev.primitive} = (${eval1.isNull} && ${eval2.isNull}) ||
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
index 0bb12d2039ffc..04857a23f4c1e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
@@ -53,4 +53,12 @@ object TypeUtils {
 
   def getOrdering(t: DataType): Ordering[Any] =
     t.asInstanceOf[AtomicType].ordering.asInstanceOf[Ordering[Any]]
+
+  def compareBinary(x: Array[Byte], y: Array[Byte]): Int = {
+    for (i <- 0 until x.length; if i < y.length) {
+      val res = x(i).compareTo(y(i))
+      if (res != 0) return res
+    }
+    x.length - y.length
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BinaryType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BinaryType.scala
index a581a9e9468ef..9b58601e5e6ec 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BinaryType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BinaryType.scala
@@ -22,6 +22,7 @@ import scala.reflect.runtime.universe.typeTag
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.ScalaReflectionLock
+import org.apache.spark.sql.catalyst.util.TypeUtils
 
 
 /**
@@ -43,11 +44,7 @@ class BinaryType private() extends AtomicType {
 
   private[sql] val ordering = new Ordering[InternalType] {
     def compare(x: Array[Byte], y: Array[Byte]): Int = {
-      for (i <- 0 until x.length; if i < y.length) {
-        val res = x(i).compareTo(y(i))
-        if (res != 0) return res
-      }
-      x.length - y.length
+      TypeUtils.compareBinary(x, y)
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 3aca94db3bd8f..969c6cc15fdee 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -43,7 +43,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("cast from int") {
     checkCast(0, false)
     checkCast(1, true)
-    checkCast(5, true)
+    checkCast(-5, true)
     checkCast(1, 1.toByte)
     checkCast(1, 1.toShort)
     checkCast(1, 1)
@@ -61,7 +61,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("cast from long") {
     checkCast(0L, false)
     checkCast(1L, true)
-    checkCast(5L, true)
+    checkCast(-5L, true)
     checkCast(1L, 1.toByte)
     checkCast(1L, 1.toShort)
     checkCast(1L, 1)
@@ -99,10 +99,28 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("cast from float") {
-
+    checkCast(0.0f, false)
+    checkCast(0.5f, true)
+    checkCast(-5.0f, true)
+    checkCast(1.5f, 1.toByte)
+    checkCast(1.5f, 1.toShort)
+    checkCast(1.5f, 1)
+    checkCast(1.5f, 1.toLong)
+    checkCast(1.5f, 1.5)
+    checkCast(1.5f, "1.5")
   }
 
   test("cast from double") {
+    checkCast(0.0, false)
+    checkCast(0.5, true)
+    checkCast(-5.0, true)
+    checkCast(1.5, 1.toByte)
+    checkCast(1.5, 1.toShort)
+    checkCast(1.5, 1)
+    checkCast(1.5, 1.toLong)
+    checkCast(1.5, 1.5f)
+    checkCast(1.5, "1.5")
+
     checkEvaluation(cast(cast(1.toDouble, TimestampType), DoubleType), 1.toDouble)
     checkEvaluation(cast(cast(1.toDouble, TimestampType), DoubleType), 1.toDouble)
   }
@@ -183,6 +201,19 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Add(Literal(23.toShort), cast(true, ShortType)), 24.toShort)
   }
 
+  test("from decimal") {
+    checkCast(Decimal(0.0), false)
+    checkCast(Decimal(0.5), true)
+    checkCast(Decimal(-5.0), true)
+    checkCast(Decimal(1.5), 1.toByte)
+    checkCast(Decimal(1.5), 1.toShort)
+    checkCast(Decimal(1.5), 1)
+    checkCast(Decimal(1.5), 1.toLong)
+    checkCast(Decimal(1.5), 1.5f)
+    checkCast(Decimal(1.5), 1.5)
+    checkCast(Decimal(1.5), "1.5")
+  }
+
   test("casting to fixed-precision decimals") {
     // Overflow and rounding for casting to fixed-precision decimals:
     // - Values should round with HALF_UP mode by default when you lower scale
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 87a92b87962f8..4a241d3603570 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -23,6 +23,8 @@ import org.scalatest.Matchers._
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateProjection, GenerateMutableProjection}
+import org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer
+import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
 
 /**
  * A few helper functions for expression evaluation testing. Mixin this trait to use them.
@@ -39,6 +41,7 @@ trait ExpressionEvalHelper {
     checkEvaluationWithoutCodegen(expression, expected, inputRow)
     checkEvaluationWithGeneratedMutableProjection(expression, expected, inputRow)
     checkEvaluationWithGeneratedProjection(expression, expected, inputRow)
+    checkEvaluationWithOptimization(expression, expected, inputRow)
   }
 
   protected def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = {
@@ -122,6 +125,15 @@ trait ExpressionEvalHelper {
     }
   }
 
+  protected def checkEvaluationWithOptimization(
+      expression: Expression,
+      expected: Any,
+      inputRow: Row = EmptyRow): Unit = {
+    val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation)
+    val optimizedPlan = DefaultOptimizer.execute(plan)
+    checkEvaluationWithoutCodegen(optimizedPlan.expressions.head, expected, inputRow)
+  }
+
   protected def checkDoubleEvaluation(
       expression: Expression,
       expected: Spread[Double],
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ExpressionOptimizationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ExpressionOptimizationSuite.scala
deleted file mode 100644
index f33a18d53b1a9..0000000000000
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ExpressionOptimizationSuite.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.optimizer
-
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical._
-
-/**
- * Overrides our expression evaluation tests and reruns them after optimization has occured.  This
- * is to ensure that constant folding and other optimizations do not break anything.
- */
-class ExpressionOptimizationSuite extends SparkFunSuite with ExpressionEvalHelper {
-  override def checkEvaluation(
-      expression: Expression,
-      expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
-    val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation)
-    val optimizedPlan = DefaultOptimizer.execute(plan)
-    super.checkEvaluation(optimizedPlan.expressions.head, expected, inputRow)
-  }
-}

From c8d551d546979e126c91925487e30c353185e3ba Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Thu, 11 Jun 2015 13:18:42 -0700
Subject: [PATCH 454/525] [SPARK-8310] [EC2] Updates the master branch EC2
 versions

Will send another PR for `branch-1.4`

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #6764 from shivaram/SPARK-8310 and squashes the following commits:

d8cd3b3 [Shivaram Venkataraman] This updates the master branch EC2 versions
---
 ec2/spark_ec2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 84629cb9a0ca0..58b24ae9ef500 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -51,7 +51,7 @@
     raw_input = input
     xrange = range
 
-SPARK_EC2_VERSION = "1.3.1"
+SPARK_EC2_VERSION = "1.4.0"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
 
 VALID_SPARK_VERSIONS = set([
@@ -89,7 +89,7 @@
 
 # Default location to get the spark-ec2 scripts (and ami-list) from
 DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/mesos/spark-ec2"
-DEFAULT_SPARK_EC2_BRANCH = "branch-1.3"
+DEFAULT_SPARK_EC2_BRANCH = "branch-1.4"
 
 
 def setup_external_libs(libs):

From 040f223c5b9ca724c9f2b4abb59c21b3a23720ba Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Thu, 11 Jun 2015 14:03:08 -0700
Subject: [PATCH 455/525] [SPARK-7915] [SQL] Support specifying the column list
 for target table in CTAS

```
create table t1 (a int, b string) as select key, value from src;

desc t1;
key	int	NULL
value	string	NULL
```

Thus Hive doesn't support specifying the column list for target table in CTAS, however, we should either throwing exception explicity, or supporting the this feature, we just pick up the later one, which seems useful and straightforward.

Author: Cheng Hao <hao.cheng@intel.com>

Closes #6458 from chenghao-intel/ctas_column and squashes the following commits:

d1fa9b6 [Cheng Hao] bug in unittest
4e701aa [Cheng Hao] update as feedback
f305ec1 [Cheng Hao] support specifying the column list for target table in CTAS
---
 .../hive/execution/CreateTableAsSelect.scala   | 16 ++++++++++++----
 .../sql/hive/execution/SQLQuerySuite.scala     | 18 ++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala
index 7d3ec12c4eb05..87c36a8b618ce 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala
@@ -50,17 +50,25 @@ case class CreateTableAsSelect(
       import org.apache.hadoop.io.Text
       import org.apache.hadoop.mapred.TextInputFormat
 
-      val withSchema =
+      val withFormat =
         tableDesc.copy(
-          schema =
-            query.output.map(c =>
-              HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)),
           inputFormat =
             tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
           outputFormat =
             tableDesc.outputFormat
               .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
           serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName())))
+
+      val withSchema = if (withFormat.schema.isEmpty) {
+        // Hive doesn't support specifying the column list for target table in CTAS
+        // However we don't think SparkSQL should follow that.
+        tableDesc.copy(schema =
+        query.output.map(c =>
+          HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)))
+      } else {
+        withFormat
+      }
+
       hiveContext.catalog.client.createTable(withSchema)
 
       // Get the Metastore Relation
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 40a35674e4cb8..8bd4900497c4f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -360,6 +360,24 @@ class SQLQuerySuite extends QueryTest {
     }
   }
 
+  test("specifying the column list for CTAS") {
+    Seq((1, "111111"), (2, "222222")).toDF("key", "value").registerTempTable("mytable1")
+
+    sql("create table gen__tmp(a int, b string) as select key, value from mytable1")
+    checkAnswer(
+      sql("SELECT a, b from gen__tmp"),
+      sql("select key, value from mytable1").collect())
+    sql("DROP TABLE gen__tmp")
+
+    sql("create table gen__tmp(a double, b double) as select key, value from mytable1")
+    checkAnswer(
+      sql("SELECT a, b from gen__tmp"),
+      sql("select cast(key as double), cast(value as double) from mytable1").collect())
+    sql("DROP TABLE gen__tmp")
+
+    sql("drop table mytable1")
+  }
+
   test("command substitution") {
     sql("set tbl=src")
     checkAnswer(

From 95690a17d328f205c3398b9b477b4072b6fe908f Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Thu, 11 Jun 2015 14:21:49 -0700
Subject: [PATCH 456/525] [SPARK-7444] [TESTS] Eliminate noisy css warn/error
 logs for UISeleniumSuite

Eliminate the following noisy logs for `UISeleniumSuite`:
```
15/05/07 10:09:50.196 pool-1-thread-1-ScalaTest-running-UISeleniumSuite WARN DefaultCssErrorHandler: CSS error: 'http://192.168.0.170:4040/static/bootstrap.min.css' [793:167] Error in style rule. (Invalid token "*". Was expecting one of: <EOF>, <S>, <IDENT>, "}", ";".)
15/05/07 10:09:50.196 pool-1-thread-1-ScalaTest-running-UISeleniumSuite WARN DefaultCssErrorHandler: CSS warning: 'http://192.168.0.170:4040/static/bootstrap.min.css' [793:167] Ignoring the following declarations in this rule.
15/05/07 10:09:50.197 pool-1-thread-1-ScalaTest-running-UISeleniumSuite WARN DefaultCssErrorHandler: CSS error: 'http://192.168.0.170:4040/static/bootstrap.min.css' [799:325] Error in style rule. (Invalid token "*". Was expecting one of: <EOF>, <S>, <IDENT>, "}", ";".)
15/05/07 10:09:50.197 pool-1-thread-1-ScalaTest-running-UISeleniumSuite WARN DefaultCssErrorHandler: CSS warning: 'http://192.168.0.170:4040/static/bootstrap.min.css' [799:325] Ignoring the following declarations in this rule.
15/05/07 10:09:50.198 pool-1-thread-1-ScalaTest-running-UISeleniumSuite WARN DefaultCssErrorHandler: CSS error: 'http://192.168.0.170:4040/static/bootstrap.min.css' [805:18] Error in style rule. (Invalid token "*". Was expecting one of: <EOF>, <S>, <IDENT>, "}", ";".)
15/05/07 10:09:50.198 pool-1-thread-1-ScalaTest-running-UISeleniumSuite WARN DefaultCssErrorHandler: CSS warning: 'http://192.168.0.170:4040/static/bootstrap.min.css' [805:18] Ignoring the following declarations in this rule.
```

Author: zsxwing <zsxwing@gmail.com>

Closes #5983 from zsxwing/SPARK-7444 and squashes the following commits:

4202728 [zsxwing] Add SparkUICssErrorHandler for all tests
d1398ad [zsxwing] Merge remote-tracking branch 'origin/master' into SPARK-7444
7bb7f11 [zsxwing] Merge branch 'master' into SPARK-7444
a59f40e [zsxwing] Eliminate noisy css warn/error logs for UISeleniumSuite
---
 .../org/apache/spark/ui/UISeleniumSuite.scala | 31 ++++++++++++++++++-
 .../hive/thriftserver/UISeleniumSuite.scala   |  7 +++--
 .../streaming/ui/static/streaming-page.css    |  2 +-
 .../spark/streaming/UISeleniumSuite.scala     |  7 +++--
 4 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
index 33712f1bfa782..3aa672f8b713c 100644
--- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
@@ -23,6 +23,7 @@ import javax.servlet.http.{HttpServletResponse, HttpServletRequest}
 import scala.collection.JavaConversions._
 import scala.xml.Node
 
+import com.gargoylesoftware.htmlunit.DefaultCssErrorHandler
 import org.json4s._
 import org.json4s.jackson.JsonMethods
 import org.openqa.selenium.htmlunit.HtmlUnitDriver
@@ -31,6 +32,7 @@ import org.scalatest._
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.selenium.WebBrowser
 import org.scalatest.time.SpanSugar._
+import org.w3c.css.sac.CSSParseException
 
 import org.apache.spark.LocalSparkContext._
 import org.apache.spark._
@@ -39,6 +41,31 @@ import org.apache.spark.deploy.history.HistoryServerSuite
 import org.apache.spark.shuffle.FetchFailedException
 import org.apache.spark.status.api.v1.{JacksonMessageWriter, StageStatus}
 
+private[spark] class SparkUICssErrorHandler extends DefaultCssErrorHandler {
+
+  private val cssWhiteList = List("bootstrap.min.css", "vis.min.css")
+
+  private def isInWhileList(uri: String): Boolean = cssWhiteList.exists(uri.endsWith)
+
+  override def warning(e: CSSParseException): Unit = {
+    if (!isInWhileList(e.getURI)) {
+      super.warning(e)
+    }
+  }
+
+  override def fatalError(e: CSSParseException): Unit = {
+    if (!isInWhileList(e.getURI)) {
+      super.fatalError(e)
+    }
+  }
+
+  override def error(e: CSSParseException): Unit = {
+    if (!isInWhileList(e.getURI)) {
+      super.error(e)
+    }
+  }
+}
+
 /**
  * Selenium tests for the Spark Web UI.
  */
@@ -49,7 +76,9 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B
 
 
   override def beforeAll(): Unit = {
-    webDriver = new HtmlUnitDriver
+    webDriver = new HtmlUnitDriver {
+      getWebClient.setCssErrorHandler(new SparkUICssErrorHandler)
+    }
   }
 
   override def afterAll(): Unit = {
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
index 4c9fab7ef6136..806240e6de458 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala
@@ -22,12 +22,13 @@ import scala.util.Random
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.openqa.selenium.WebDriver
 import org.openqa.selenium.htmlunit.HtmlUnitDriver
+import org.scalatest.{BeforeAndAfterAll, Matchers}
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.selenium.WebBrowser
 import org.scalatest.time.SpanSugar._
-import org.scalatest.{BeforeAndAfterAll, Matchers}
 
 import org.apache.spark.sql.hive.HiveContext
+import org.apache.spark.ui.SparkUICssErrorHandler
 
 class UISeleniumSuite
   extends HiveThriftJdbcTest
@@ -40,7 +41,9 @@ class UISeleniumSuite
   override def mode: ServerMode.Value = ServerMode.binary
 
   override def beforeAll(): Unit = {
-    webDriver = new HtmlUnitDriver
+    webDriver = new HtmlUnitDriver {
+      getWebClient.setCssErrorHandler(new SparkUICssErrorHandler)
+    }
     super.beforeAll()
   }
 
diff --git a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css
index b22c884bfebdb..ec12616b58d87 100644
--- a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css
+++ b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css
@@ -31,7 +31,7 @@
 }
 
 .tooltip-inner {
-  max-width: 500px !important; // Make sure we only have one line tooltip
+  max-width: 500px !important; /* Make sure we only have one line tooltip */
 }
 
 .line {
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
index cbc24aee4fa1e..a08578680cff9 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala
@@ -27,9 +27,10 @@ import org.scalatest.selenium.WebBrowser
 import org.scalatest.time.SpanSugar._
 
 import org.apache.spark._
+import org.apache.spark.ui.SparkUICssErrorHandler
 
 /**
- * Selenium tests for the Spark Web UI.
+ * Selenium tests for the Spark Streaming Web UI.
  */
 class UISeleniumSuite
   extends SparkFunSuite with WebBrowser with Matchers with BeforeAndAfterAll with TestSuiteBase {
@@ -37,7 +38,9 @@ class UISeleniumSuite
   implicit var webDriver: WebDriver = _
 
   override def beforeAll(): Unit = {
-    webDriver = new HtmlUnitDriver
+    webDriver = new HtmlUnitDriver {
+      getWebClient.setCssErrorHandler(new SparkUICssErrorHandler)
+    }
   }
 
   override def afterAll(): Unit = {

From 9cbdf31ec1399d4d43a1863c15688ce78b6dfd92 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Thu, 11 Jun 2015 15:29:03 -0700
Subject: [PATCH 457/525] [SPARK-6511] [docs] Fix example command in
 hadoop-provided docs.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6766 from vanzin/SPARK-6511 and squashes the following commits:

49f0f67 [Marcelo Vanzin] [SPARK-6511] [docs] Fix example command in hadoop-provided docs.
---
 docs/hadoop-provided.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/hadoop-provided.md b/docs/hadoop-provided.md
index 0ba5a58051abc..bbd26b343e2e6 100644
--- a/docs/hadoop-provided.md
+++ b/docs/hadoop-provided.md
@@ -21,6 +21,6 @@ export SPARK_DIST_CLASSPATH=$(hadoop classpath)
 export SPARK_DIST_CLASSPATH=$(/path/to/hadoop/bin/hadoop classpath)
 
 # Passing a Hadoop configuration directory
-export SPARK_DIST_CLASSPATH=$(hadoop classpath --config /path/to/configs)
+export SPARK_DIST_CLASSPATH=$(hadoop --config /path/to/configs classpath)
 
 {% endhighlight %}

From 7d669a56ffc7a4f5827830ef3c27d45cc0e8774f Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 11 Jun 2015 16:07:15 -0700
Subject: [PATCH 458/525] [SPARK-8286] Rewrite UTF8String in Java and move it
 into unsafe package.

Unit test is still in Scala.

Author: Reynold Xin <rxin@databricks.com>

Closes #6738 from rxin/utf8string-java and squashes the following commits:

562dc6e [Reynold Xin] Flag...
98e600b [Reynold Xin] Another try with encoding setting ..
cfa6bdf [Reynold Xin] Merge branch 'master' into utf8string-java
a3b124d [Reynold Xin] Try different UTF-8 encoded characters.
1ff7c82 [Reynold Xin] Enable UTF-8 encoding.
82d58cc [Reynold Xin] Reset run-tests.
2cb3c69 [Reynold Xin] Use utf-8 encoding in set bytes.
53f8ef4 [Reynold Xin] Hack Jenkins to run one test.
9a48e8d [Reynold Xin] Fixed runtime compilation error.
911c450 [Reynold Xin] Moved unit test also to Java.
4eff7bd [Reynold Xin] Improved unit test coverage.
8e89a3c [Reynold Xin] Fixed tests.
77c64bd [Reynold Xin] Fixed string type codegen.
ffedb62 [Reynold Xin] Code review feedback.
0967ce6 [Reynold Xin] Fixed import ordering.
45a123d [Reynold Xin] [SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.
---
 project/SparkBuild.scala                      |   4 +-
 .../sql/catalyst/expressions/UnsafeRow.java   |   2 +-
 .../sql/catalyst/CatalystTypeConverters.scala |   3 +-
 .../spark/sql/catalyst/ScalaReflection.scala  |   1 +
 .../spark/sql/catalyst/expressions/Cast.scala |   9 +-
 .../expressions/SpecificMutableRow.scala      |   4 +-
 .../expressions/UnsafeRowConverter.scala      |   1 +
 .../expressions/codegen/CodeGenerator.scala   |   2 +
 .../sql/catalyst/expressions/literals.scala   |   3 +-
 .../spark/sql/catalyst/expressions/rows.scala |   7 +-
 .../expressions/stringOperations.scala        |   3 +-
 .../apache/spark/sql/types/StringType.scala   |   1 +
 .../apache/spark/sql/types/UTF8String.scala   | 221 ------------------
 .../expressions/ComplexTypeSuite.scala        |   1 +
 .../UnsafeFixedWidthAggregationMapSuite.scala |  10 +-
 .../spark/sql/types/UTF8StringSuite.scala     |  70 ------
 .../spark/sql/columnar/ColumnStats.scala      |   1 +
 .../spark/sql/columnar/ColumnType.scala       |   3 +-
 .../sql/execution/SparkSqlSerializer2.scala   |   3 +-
 .../spark/sql/execution/debug/package.scala   |   1 +
 .../spark/sql/execution/pythonUdfs.scala      |   9 +-
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   |   1 +
 .../apache/spark/sql/json/JacksonParser.scala |  11 +-
 .../org/apache/spark/sql/json/JsonRDD.scala   |   8 +-
 .../spark/sql/parquet/ParquetConverter.scala  |   7 +-
 .../spark/sql/parquet/ParquetFilters.scala    |   1 +
 .../sql/parquet/ParquetTableSupport.scala     |   1 +
 .../sql/sources/DataSourceStrategy.scala      |   3 +-
 .../spark/sql/columnar/ColumnTypeSuite.scala  |   8 +-
 .../sql/columnar/ColumnarTestUtils.scala      |   7 +-
 .../spark/sql/hive/HiveInspectors.scala       |  13 +-
 .../apache/spark/unsafe/types/UTF8String.java | 212 +++++++++++++++++
 .../spark/unsafe/bitset/BitSetSuite.java      |   1 -
 .../spark/unsafe/types/UTF8StringSuite.java   |  93 ++++++++
 34 files changed, 390 insertions(+), 335 deletions(-)
 delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
 delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
 create mode 100644 unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
 create mode 100644 unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index aa75a64b63caf..41b7eba3a06c2 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -149,7 +149,9 @@ object SparkBuild extends PomBuild {
     javacOptions in (Compile, doc) ++= {
       val Array(major, minor, _) = System.getProperty("java.version").split("\\.", 3)
       if (major.toInt >= 1 && minor.toInt >= 8) Seq("-Xdoclint:all", "-Xdoclint:-missing") else Seq.empty
-    }
+    },
+
+    javacOptions in Compile ++= Seq("-encoding", "UTF-8")
   )
 
   def enable(settings: Seq[Setting[_]])(projectRef: ProjectRef) = {
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index ec97fe603c44f..143acc9f5e36f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -30,7 +30,7 @@
 import org.apache.spark.sql.BaseMutableRow;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.StructType;
-import org.apache.spark.sql.types.UTF8String;
+import org.apache.spark.unsafe.types.UTF8String;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.bitset.BitSetMethods;
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index beb82dbc08642..7e4b11a4951b8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -28,6 +28,7 @@ import scala.collection.mutable.HashMap
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * Functions to convert Scala types to Catalyst types and vice versa.
@@ -257,7 +258,7 @@ object CatalystTypeConverters {
 
   private object StringConverter extends CatalystTypeConverter[Any, String, Any] {
     override def toCatalystImpl(scalaValue: Any): UTF8String = scalaValue match {
-      case str: String => UTF8String(str)
+      case str: String => UTF8String.fromString(str)
       case utf8: UTF8String => utf8
     }
     override def toScala(catalystValue: Any): String = catalystValue match {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
index 6998cc8d9666d..90698cd572de4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst
 
+import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.Utils
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 037efd75580d3..4c7123fcb7fcc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -24,6 +24,7 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /** Cast the child expression to the target data type. */
 case class Cast(child: Expression, dataType: DataType) extends UnaryExpression with Logging {
@@ -111,11 +112,11 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
   // UDFToString
   private[this] def castToString(from: DataType): Any => Any = from match {
-    case BinaryType => buildCast[Array[Byte]](_, UTF8String(_))
-    case DateType => buildCast[Int](_, d => UTF8String(DateUtils.toString(d)))
+    case BinaryType => buildCast[Array[Byte]](_, UTF8String.fromBytes)
+    case DateType => buildCast[Int](_, d => UTF8String.fromString(DateUtils.toString(d)))
     case TimestampType => buildCast[Long](_,
-      t => UTF8String(timestampToString(DateUtils.toJavaTimestamp(t))))
-    case _ => buildCast[Any](_, o => UTF8String(o.toString))
+      t => UTF8String.fromString(timestampToString(DateUtils.toJavaTimestamp(t))))
+    case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString))
   }
 
   // BinaryConverter
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index 2c884517d62a7..98eda61a80b40 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * A parent class for mutable container objects that are reused when the values are changed,
@@ -240,7 +241,8 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
     }
   }
 
-  override def setString(ordinal: Int, value: String): Unit = update(ordinal, UTF8String(value))
+  override def setString(ordinal: Int, value: String): Unit =
+    update(ordinal, UTF8String.fromString(value))
 
   override def getString(ordinal: Int): String = apply(ordinal).toString
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
index 5b2c8572784bd..5350123bf4c01 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.array.ByteArrayMethods
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * Converts Rows into UnsafeRow format. This class is NOT thread-safe.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index ecf8e0d1a7a22..536e47733074a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -26,6 +26,8 @@ import org.codehaus.janino.ClassBodyEvaluator
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+
 
 // These classes are here to avoid issues with serialization and integration with quasiquotes.
 class IntegerHashSet extends org.apache.spark.util.collection.OpenHashSet[Int]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index ef50c50e13558..a33007bda1458 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 object Literal {
   def apply(v: Any): Literal = v match {
@@ -32,7 +33,7 @@ object Literal {
     case f: Float => Literal(f, FloatType)
     case b: Byte => Literal(b, ByteType)
     case s: Short => Literal(s, ShortType)
-    case s: String => Literal(UTF8String(s), StringType)
+    case s: String => Literal(UTF8String.fromString(s), StringType)
     case b: Boolean => Literal(b, BooleanType)
     case d: BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
     case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index 5fd892c42e69c..5d2d82077f0eb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -17,7 +17,8 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.types.{UTF8String, DataType, StructType, AtomicType}
+import org.apache.spark.sql.types.{DataType, StructType, AtomicType}
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * An extended interface to [[Row]] that allows the values for each column to be updated.  Setting
@@ -197,7 +198,9 @@ class GenericMutableRow(v: Array[Any]) extends GenericRow(v) with MutableRow {
   override def setFloat(ordinal: Int, value: Float): Unit = { values(ordinal) = value }
   override def setInt(ordinal: Int, value: Int): Unit = { values(ordinal) = value }
   override def setLong(ordinal: Int, value: Long): Unit = { values(ordinal) = value }
-  override def setString(ordinal: Int, value: String) { values(ordinal) = UTF8String(value)}
+  override def setString(ordinal: Int, value: String) {
+    values(ordinal) = UTF8String.fromString(value)
+  }
   override def setNullAt(i: Int): Unit = { values(i) = null }
 
   override def setShort(ordinal: Int, value: Short): Unit = { values(ordinal) = value }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 345038323ddc5..4f4c19526eeb6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -22,6 +22,7 @@ import java.util.regex.Pattern
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 trait StringRegexExpression extends ExpectsInputTypes {
   self: BinaryExpression =>
@@ -277,7 +278,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
           ba.slice(st, end)
         case s: UTF8String =>
           val (st, end) = slicePos(start, length, () => s.length())
-          s.slice(st, end)
+          s.substring(st, end)
       }
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala
index 134ab0af4e0de..1e9476ad06656 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala
@@ -22,6 +22,7 @@ import scala.reflect.runtime.universe.typeTag
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.ScalaReflectionLock
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * :: DeveloperApi ::
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
deleted file mode 100644
index f5d8fcced362b..0000000000000
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package org.apache.spark.sql.types
-
-import java.util.Arrays
-
-import org.apache.spark.annotation.DeveloperApi
-
-/**
- * :: DeveloperApi ::
- * A UTF-8 String, as internal representation of StringType in SparkSQL
- *
- * A String encoded in UTF-8 as an Array[Byte], which can be used for comparison,
- * search, see http://en.wikipedia.org/wiki/UTF-8 for details.
- *
- * Note: This is not designed for general use cases, should not be used outside SQL.
- */
-@DeveloperApi
-final class UTF8String extends Ordered[UTF8String] with Serializable {
-
-  private[this] var bytes: Array[Byte] = _
-
-  /**
-   * Update the UTF8String with String.
-   */
-  def set(str: String): UTF8String = {
-    bytes = str.getBytes("utf-8")
-    this
-  }
-
-  /**
-   * Update the UTF8String with Array[Byte], which should be encoded in UTF-8
-   */
-  def set(bytes: Array[Byte]): UTF8String = {
-    this.bytes = bytes
-    this
-  }
-
-  /**
-   * Return the number of bytes for a code point with the first byte as `b`
-   * @param b The first byte of a code point
-   */
-  @inline
-  private[this] def numOfBytes(b: Byte): Int = {
-    val offset = (b & 0xFF) - 192
-    if (offset >= 0) UTF8String.bytesOfCodePointInUTF8(offset) else 1
-  }
-
-  /**
-   * Return the number of code points in it.
-   *
-   * This is only used by Substring() when `start` is negative.
-   */
-  def length(): Int = {
-    var len = 0
-    var i: Int = 0
-    while (i < bytes.length) {
-      i += numOfBytes(bytes(i))
-      len += 1
-    }
-    len
-  }
-
-  def getBytes: Array[Byte] = {
-    bytes
-  }
-
-  /**
-   * Return a substring of this,
-   * @param start the position of first code point
-   * @param until the position after last code point
-   */
-  def slice(start: Int, until: Int): UTF8String = {
-    if (until <= start || start >= bytes.length || bytes == null) {
-      new UTF8String
-    }
-
-    var c = 0
-    var i: Int = 0
-    while (c < start && i < bytes.length) {
-      i += numOfBytes(bytes(i))
-      c += 1
-    }
-    var j = i
-    while (c < until && j < bytes.length) {
-      j += numOfBytes(bytes(j))
-      c += 1
-    }
-    UTF8String(Arrays.copyOfRange(bytes, i, j))
-  }
-
-  def contains(sub: UTF8String): Boolean = {
-    val b = sub.getBytes
-    if (b.length == 0) {
-      return true
-    }
-    var i: Int = 0
-    while (i <= bytes.length - b.length) {
-      // In worst case, it's O(N*K), but should works fine with SQL
-      if (bytes(i) == b(0) && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) {
-        return true
-      }
-      i += 1
-    }
-    false
-  }
-
-  def startsWith(prefix: UTF8String): Boolean = {
-    val b = prefix.getBytes
-    if (b.length > bytes.length) {
-      return false
-    }
-    Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b)
-  }
-
-  def endsWith(suffix: UTF8String): Boolean = {
-    val b = suffix.getBytes
-    if (b.length > bytes.length) {
-      return false
-    }
-    Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b)
-  }
-
-  def toUpperCase(): UTF8String = {
-    // upper case depends on locale, fallback to String.
-    UTF8String(toString().toUpperCase)
-  }
-
-  def toLowerCase(): UTF8String = {
-    // lower case depends on locale, fallback to String.
-    UTF8String(toString().toLowerCase)
-  }
-
-  override def toString(): String = {
-    new String(bytes, "utf-8")
-  }
-
-  override def clone(): UTF8String = new UTF8String().set(this.bytes)
-
-  override def compare(other: UTF8String): Int = {
-    var i: Int = 0
-    val b = other.getBytes
-    while (i < bytes.length && i < b.length) {
-      val res = bytes(i).compareTo(b(i))
-      if (res != 0) return res
-      i += 1
-    }
-    bytes.length - b.length
-  }
-
-  override def compareTo(other: UTF8String): Int = {
-    compare(other)
-  }
-
-  override def equals(other: Any): Boolean = other match {
-    case s: UTF8String =>
-      Arrays.equals(bytes, s.getBytes)
-    case s: String =>
-      // This is only used for Catalyst unit tests
-      // fail fast
-      bytes.length >= s.length && length() == s.length && toString() == s
-    case _ =>
-      false
-  }
-
-  override def hashCode(): Int = {
-    Arrays.hashCode(bytes)
-  }
-}
-
-/**
- * :: DeveloperApi ::
- */
-@DeveloperApi
-object UTF8String {
-  // number of tailing bytes in a UTF8 sequence for a code point
-  // see http://en.wikipedia.org/wiki/UTF-8, 192-256 of Byte 1
-  private[types] val bytesOfCodePointInUTF8: Array[Int] = Array(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5,
-    6, 6, 6, 6)
-
-  /**
-   * Create a UTF-8 String from String
-   */
-  def apply(s: String): UTF8String = {
-    if (s != null) {
-      new UTF8String().set(s)
-    } else {
-      null
-    }
-  }
-
-  /**
-   * Create a UTF-8 String from Array[Byte], which should be encoded in UTF-8
-   */
-  def apply(bytes: Array[Byte]): UTF8String = {
-    if (bytes != null) {
-      new UTF8String().set(bytes)
-    } else {
-      null
-    }
-  }
-}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index f151dd2a47f78..bcc594cb7c193 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 
 class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
index 88a36aa121b55..72bbc4efeb8ef 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
@@ -20,11 +20,13 @@ package org.apache.spark.sql.catalyst.expressions
 import scala.collection.JavaConverters._
 import scala.util.Random
 
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, TaskMemoryManager, MemoryAllocator}
 import org.scalatest.{BeforeAndAfterEach, Matchers}
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, TaskMemoryManager, MemoryAllocator}
+import org.apache.spark.unsafe.types.UTF8String
+
 
 class UnsafeFixedWidthAggregationMapSuite
   extends SparkFunSuite
@@ -82,7 +84,7 @@ class UnsafeFixedWidthAggregationMapSuite
       1024, // initial capacity
       false // disable perf metrics
     )
-    val groupKey = new GenericRow(Array[Any](UTF8String("cats")))
+    val groupKey = new GenericRow(Array[Any](UTF8String.fromString("cats")))
 
     // Looking up a key stores a zero-entry in the map (like Python Counters or DefaultDicts)
     map.getAggregationBuffer(groupKey)
@@ -111,7 +113,7 @@ class UnsafeFixedWidthAggregationMapSuite
     val rand = new Random(42)
     val groupKeys: Set[String] = Seq.fill(512)(rand.nextString(1024)).toSet
     groupKeys.foreach { keyString =>
-      map.getAggregationBuffer(new GenericRow(Array[Any](UTF8String(keyString))))
+      map.getAggregationBuffer(new GenericRow(Array[Any](UTF8String.fromString(keyString))))
     }
     val seenKeys: Set[String] = map.iterator().asScala.map { entry =>
       entry.key.getString(0)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
deleted file mode 100644
index 81d7ab010f394..0000000000000
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package org.apache.spark.sql.types
-
-import org.apache.spark.SparkFunSuite
-
-// scalastyle:off
-class UTF8StringSuite extends SparkFunSuite {
-  test("basic") {
-    def check(str: String, len: Int) {
-
-      assert(UTF8String(str).length == len)
-      assert(UTF8String(str.getBytes("utf8")).length() == len)
-
-      assert(UTF8String(str) == str)
-      assert(UTF8String(str.getBytes("utf8")) == str)
-      assert(UTF8String(str).toString == str)
-      assert(UTF8String(str.getBytes("utf8")).toString == str)
-      assert(UTF8String(str.getBytes("utf8")) == UTF8String(str))
-
-      assert(UTF8String(str).hashCode() == UTF8String(str.getBytes("utf8")).hashCode())
-    }
-
-    check("hello", 5)
-    check("世 界", 3)
-  }
-
-  test("contains") {
-    assert(UTF8String("hello").contains(UTF8String("ello")))
-    assert(!UTF8String("hello").contains(UTF8String("vello")))
-    assert(UTF8String("大千世界").contains(UTF8String("千世")))
-    assert(!UTF8String("大千世界").contains(UTF8String("世千")))
-  }
-
-  test("prefix") {
-    assert(UTF8String("hello").startsWith(UTF8String("hell")))
-    assert(!UTF8String("hello").startsWith(UTF8String("ell")))
-    assert(UTF8String("大千世界").startsWith(UTF8String("大千")))
-    assert(!UTF8String("大千世界").startsWith(UTF8String("千")))
-  }
-
-  test("suffix") {
-    assert(UTF8String("hello").endsWith(UTF8String("ello")))
-    assert(!UTF8String("hello").endsWith(UTF8String("ellov")))
-    assert(UTF8String("大千世界").endsWith(UTF8String("世界")))
-    assert(!UTF8String("大千世界").endsWith(UTF8String("世")))
-  }
-
-  test("slice") {
-    assert(UTF8String("hello").slice(1, 3) == UTF8String("el"))
-    assert(UTF8String("大千世界").slice(0, 1) == UTF8String("大"))
-    assert(UTF8String("大千世界").slice(1, 3) == UTF8String("千世"))
-    assert(UTF8String("大千世界").slice(3, 5) == UTF8String("界"))
-  }
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
index 83881a3687090..11c79c865f11a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.columnar
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 private[sql] class ColumnStatisticsSchema(a: Attribute) extends Serializable {
   val upperBound = AttributeReference(a.name + ".upperBound", a.dataType, nullable = true)()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
index c9c4d630fb5f4..8e21020917768 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.MutableRow
 import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * An abstract class that represents type of a column. Used to append/extract Java objects into/from
@@ -320,7 +321,7 @@ private[sql] object STRING extends NativeColumnType(StringType, 7, 8) {
     val length = buffer.getInt()
     val stringBytes = new Array[Byte](length)
     buffer.get(stringBytes, 0, length)
-    UTF8String(stringBytes)
+    UTF8String.fromBytes(stringBytes)
   }
 
   override def setField(row: MutableRow, ordinal: Int, value: UTF8String): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index 60f3b2d539ffe..202e4488a64bf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -28,6 +28,7 @@ import org.apache.spark.serializer._
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, MutableRow, SpecificMutableRow}
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * The serialization stream for [[SparkSqlSerializer2]]. It assumes that the object passed in
@@ -434,7 +435,7 @@ private[sql] object SparkSqlSerializer2 {
                 val length = in.readInt()
                 val bytes = new Array[Byte](length)
                 in.readFully(bytes)
-                mutableRow.update(i, UTF8String(bytes))
+                mutableRow.update(i, UTF8String.fromBytes(bytes))
               }
 
             case BinaryType =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index 720b529d5946f..83c1f65d5c96f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.unsafe.types.UTF8String
 
 import scala.collection.mutable.HashSet
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
index b1333ec09a09a..2b45a83d145f5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
@@ -24,6 +24,7 @@ import scala.collection.JavaConverters._
 
 import net.razorvine.pickle.{Pickler, Unpickler}
 
+import org.apache.spark.{Accumulator, Logging => SparkLogging}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.api.python.{PythonBroadcast, PythonRDD}
 import org.apache.spark.broadcast.Broadcast
@@ -34,7 +35,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.{Accumulator, Logging => SparkLogging}
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * A serialized version of a Python lambda function.  Suitable for use in a [[PythonRDD]].
@@ -203,8 +204,10 @@ object EvaluatePython {
     case (c: Long, IntegerType) => c.toInt
     case (c: Int, LongType) => c.toLong
     case (c: Double, FloatType) => c.toFloat
-    case (c: String, StringType) => UTF8String(c)
-    case (c, StringType) if !c.isInstanceOf[String] => UTF8String(c.toString)
+    case (c: String, StringType) => UTF8String.fromString(c)
+    case (c, StringType) =>
+      // If we get here, c is not a string. Call toString on it.
+      UTF8String.fromString(c.toString)
 
     case (c, _) => c
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 9028d5ed72c92..e75e6681c5ff3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.{Row, SpecificMutableRow}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.sources._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * Data corresponding to one partition of a JDBCRDD.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
index 4e07cf36ae434..f16075ce58ffa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
@@ -28,6 +28,8 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.json.JacksonUtils.nextUntil
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+
 
 private[sql] object JacksonParser {
   def apply(
@@ -54,7 +56,7 @@ private[sql] object JacksonParser {
         convertField(factory, parser, schema)
 
       case (VALUE_STRING, StringType) =>
-        UTF8String(parser.getText)
+        UTF8String.fromString(parser.getText)
 
       case (VALUE_STRING, _) if parser.getTextLength < 1 =>
         // guard the non string type
@@ -74,7 +76,7 @@ private[sql] object JacksonParser {
         val generator = factory.createGenerator(writer, JsonEncoding.UTF8)
         generator.copyCurrentStructure(parser)
         generator.close()
-        UTF8String(writer.toByteArray)
+        UTF8String.fromBytes(writer.toByteArray)
 
       case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, FloatType) =>
         parser.getFloatValue
@@ -152,7 +154,8 @@ private[sql] object JacksonParser {
       valueType: DataType): Map[UTF8String, Any] = {
     val builder = Map.newBuilder[UTF8String, Any]
     while (nextUntil(parser, JsonToken.END_OBJECT)) {
-      builder += UTF8String(parser.getCurrentName) -> convertField(factory, parser, valueType)
+      builder +=
+        UTF8String.fromString(parser.getCurrentName) -> convertField(factory, parser, valueType)
     }
 
     builder.result()
@@ -180,7 +183,7 @@ private[sql] object JacksonParser {
       val row = new GenericMutableRow(schema.length)
       for (corruptIndex <- schema.getFieldIndex(columnNameOfCorruptRecords)) {
         require(schema(corruptIndex).dataType == StringType)
-        row.update(corruptIndex, UTF8String(record))
+        row.update(corruptIndex, UTF8String.fromString(record))
       }
 
       Seq(row)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index fb0d137bdbfdb..e4acf1ddaf173 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -30,6 +30,8 @@ import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+
 
 private[sql] object JsonRDD extends Logging {
 
@@ -317,7 +319,7 @@ private[sql] object JsonRDD extends Logging {
           parsed
         } catch {
           case e: JsonProcessingException =>
-            Map(columnNameOfCorruptRecords -> UTF8String(record)) :: Nil
+            Map(columnNameOfCorruptRecords -> UTF8String.fromString(record)) :: Nil
         }
       }
     })
@@ -409,7 +411,7 @@ private[sql] object JsonRDD extends Logging {
       null
     } else {
       desiredType match {
-        case StringType => UTF8String(toString(value))
+        case StringType => UTF8String.fromString(toString(value))
         case _ if value == null || value == "" => null // guard the non string type
         case IntegerType => value.asInstanceOf[IntegerType.InternalType]
         case LongType => toLong(value)
@@ -423,7 +425,7 @@ private[sql] object JsonRDD extends Logging {
           val map = value.asInstanceOf[Map[String, Any]]
           map.map {
             case (k, v) =>
-              (UTF8String(k), enforceCorrectType(v, valueType))
+              (UTF8String.fromString(k), enforceCorrectType(v, valueType))
           }.map(identity)
         case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct)
         case DateType => toDate(value)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index ddc5097f88fb1..ab9f878d1e936 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.parquet.CatalystConverter.FieldType
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.parquet.timestamp.NanoTime
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * Collection of converters of Parquet types (group and primitive types) that
@@ -222,7 +223,7 @@ private[parquet] abstract class CatalystConverter extends GroupConverter {
     updateField(fieldIndex, value.getBytes)
 
   protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit =
-    updateField(fieldIndex, UTF8String(value))
+    updateField(fieldIndex, UTF8String.fromBytes(value))
 
   protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit =
     updateField(fieldIndex, readTimestamp(value))
@@ -423,7 +424,7 @@ private[parquet] class CatalystPrimitiveRowConverter(
     current.update(fieldIndex, value.getBytes)
 
   override protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit =
-    current.update(fieldIndex, UTF8String(value))
+    current.update(fieldIndex, UTF8String.fromBytes(value))
 
   override protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit =
     current.setLong(fieldIndex, readTimestamp(value))
@@ -719,7 +720,7 @@ private[parquet] class CatalystNativeArrayConverter(
 
   override protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit = {
     checkGrowBuffer()
-    buffer(elements) = UTF8String(value).asInstanceOf[NativeType]
+    buffer(elements) = UTF8String.fromBytes(value).asInstanceOf[NativeType]
     elements += 1
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
index 88ae88e9684c8..4d659f261a3b7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
@@ -31,6 +31,7 @@ import org.apache.spark.SparkEnv
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.sources
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 private[sql] object ParquetFilters {
   val PARQUET_FILTER_DATA = "org.apache.spark.sql.parquet.row.filter"
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index e03dbdec0491d..c62c592b3f3e4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -31,6 +31,7 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * A `parquet.io.api.RecordMaterializer` for Rows.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index c6a4dabbab05e..edda3f2017fe8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -26,9 +26,10 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.types.{StringType, StructType, UTF8String}
+import org.apache.spark.sql.types.{StringType, StructType}
 import org.apache.spark.sql.{SaveMode, Strategy, execution, sources}
 import org.apache.spark.util.Utils
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * A Strategy for planning scans over data sources defined using the sources API.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
index 8421e670ff05d..6daddfb2c4804 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
@@ -22,12 +22,14 @@ import java.nio.ByteBuffer
 import com.esotericsoftware.kryo.io.{Input, Output}
 import com.esotericsoftware.kryo.{Kryo, Serializer}
 
+import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.KryoRegistrator
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.columnar.ColumnarTestUtils._
 import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.sql.types._
-import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
+import org.apache.spark.unsafe.types.UTF8String
+
 
 class ColumnTypeSuite extends SparkFunSuite with Logging {
   val DEFAULT_BUFFER_SIZE = 512
@@ -66,7 +68,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
     checkActualSize(FLOAT, Float.MaxValue, 4)
     checkActualSize(FIXED_DECIMAL(15, 10), Decimal(0, 15, 10), 8)
     checkActualSize(BOOLEAN, true, 1)
-    checkActualSize(STRING, UTF8String("hello"), 4 + "hello".getBytes("utf-8").length)
+    checkActualSize(STRING, UTF8String.fromString("hello"), 4 + "hello".getBytes("utf-8").length)
     checkActualSize(DATE, 0, 4)
     checkActualSize(TIMESTAMP, 0L, 8)
 
@@ -118,7 +120,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
       val length = buffer.getInt()
       val bytes = new Array[Byte](length)
       buffer.get(bytes)
-      UTF8String(bytes)
+      UTF8String.fromBytes(bytes)
     })
 
   testColumnType[BinaryType.type, Array[Byte]](
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
index c5d38595c0bec..1bc7eb36311bb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
@@ -22,7 +22,10 @@ import scala.util.Random
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
-import org.apache.spark.sql.types.{AtomicType, DataType, Decimal, UTF8String}
+import org.apache.spark.sql.types.{AtomicType, DataType, Decimal}
+import org.apache.spark.sql.types.{DataType, Decimal, AtomicType}
+import org.apache.spark.unsafe.types.UTF8String
+
 
 object ColumnarTestUtils {
   def makeNullRow(length: Int): GenericMutableRow = {
@@ -46,7 +49,7 @@ object ColumnarTestUtils {
       case FLOAT => Random.nextFloat()
       case DOUBLE => Random.nextDouble()
       case FIXED_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
-      case STRING => UTF8String(Random.nextString(Random.nextInt(32)))
+      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
       case BOOLEAN => Random.nextBoolean()
       case BINARY => randomBytes(Random.nextInt(32))
       case DATE => Random.nextInt()
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 1f14cba78f479..fd01a8722bce6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /* Implicit conversions */
 import scala.collection.JavaConversions._
@@ -242,9 +243,9 @@ private[hive] trait HiveInspectors {
   def unwrap(data: Any, oi: ObjectInspector): Any = oi match {
     case coi: ConstantObjectInspector if coi.getWritableConstantValue == null => null
     case poi: WritableConstantStringObjectInspector =>
-      UTF8String(poi.getWritableConstantValue.toString)
+      UTF8String.fromString(poi.getWritableConstantValue.toString)
     case poi: WritableConstantHiveVarcharObjectInspector =>
-      UTF8String(poi.getWritableConstantValue.getHiveVarchar.getValue)
+      UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue)
     case poi: WritableConstantHiveDecimalObjectInspector =>
       HiveShim.toCatalystDecimal(
         PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector,
@@ -288,13 +289,13 @@ private[hive] trait HiveInspectors {
     case pi: PrimitiveObjectInspector => pi match {
       // We think HiveVarchar is also a String
       case hvoi: HiveVarcharObjectInspector if hvoi.preferWritable() =>
-        UTF8String(hvoi.getPrimitiveWritableObject(data).getHiveVarchar.getValue)
+        UTF8String.fromString(hvoi.getPrimitiveWritableObject(data).getHiveVarchar.getValue)
       case hvoi: HiveVarcharObjectInspector =>
-        UTF8String(hvoi.getPrimitiveJavaObject(data).getValue)
+        UTF8String.fromString(hvoi.getPrimitiveJavaObject(data).getValue)
       case x: StringObjectInspector if x.preferWritable() =>
-        UTF8String(x.getPrimitiveWritableObject(data).toString)
+        UTF8String.fromString(x.getPrimitiveWritableObject(data).toString)
       case x: StringObjectInspector =>
-        UTF8String(x.getPrimitiveJavaObject(data))
+        UTF8String.fromString(x.getPrimitiveJavaObject(data))
       case x: IntObjectInspector if x.preferWritable() => x.get(data)
       case x: BooleanObjectInspector if x.preferWritable() => x.get(data)
       case x: FloatObjectInspector if x.preferWritable() => x.get(data)
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
new file mode 100644
index 0000000000000..a35168019549e
--- /dev/null
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types;
+
+import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import javax.annotation.Nullable;
+
+import org.apache.spark.unsafe.PlatformDependent;
+
+/**
+ * A UTF-8 String for internal Spark use.
+ * <p>
+ * A String encoded in UTF-8 as an Array[Byte], which can be used for comparison,
+ * search, see http://en.wikipedia.org/wiki/UTF-8 for details.
+ * <p>
+ * Note: This is not designed for general use cases, should not be used outside SQL.
+ */
+public final class UTF8String implements Comparable<UTF8String>, Serializable {
+
+  @Nullable
+  private byte[] bytes;
+
+  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5,
+    6, 6, 6, 6};
+
+  public static UTF8String fromBytes(byte[] bytes) {
+    return (bytes != null) ? new UTF8String().set(bytes) : null;
+  }
+
+  public static UTF8String fromString(String str) {
+    return (str != null) ? new UTF8String().set(str) : null;
+  }
+
+  /**
+   * Updates the UTF8String with String.
+   */
+  public UTF8String set(final String str) {
+    try {
+      bytes = str.getBytes("utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // Turn the exception into unchecked so we can find out about it at runtime, but
+      // don't need to add lots of boilerplate code everywhere.
+      PlatformDependent.throwException(e);
+    }
+    return this;
+  }
+
+  /**
+   * Updates the UTF8String with byte[], which should be encoded in UTF-8.
+   */
+  public UTF8String set(final byte[] bytes) {
+    this.bytes = bytes;
+    return this;
+  }
+
+  /**
+   * Returns the number of bytes for a code point with the first byte as `b`
+   * @param b The first byte of a code point
+   */
+  public int numBytes(final byte b) {
+    final int offset = (b & 0xFF) - 192;
+    return (offset >= 0) ? bytesOfCodePointInUTF8[offset] : 1;
+  }
+
+  /**
+   * Returns the number of code points in it.
+   *
+   * This is only used by Substring() when `start` is negative.
+   */
+  public int length() {
+    int len = 0;
+    for (int i = 0; i < bytes.length; i+= numBytes(bytes[i])) {
+      len += 1;
+    }
+    return len;
+  }
+
+  public byte[] getBytes() {
+    return bytes;
+  }
+
+  /**
+   * Returns a substring of this.
+   * @param start the position of first code point
+   * @param until the position after last code point, exclusive.
+   */
+  public UTF8String substring(final int start, final int until) {
+    if (until <= start || start >= bytes.length) {
+      return UTF8String.fromBytes(new byte[0]);
+    }
+
+    int i = 0;
+    int c = 0;
+    for (; i < bytes.length && c < start; i += numBytes(bytes[i])) {
+      c += 1;
+    }
+
+    int j = i;
+    for (; j < bytes.length && c < until; j += numBytes(bytes[i])) {
+      c += 1;
+    }
+
+    return UTF8String.fromBytes(Arrays.copyOfRange(bytes, i, j));
+  }
+
+  public boolean contains(final UTF8String substring) {
+    final byte[] b = substring.getBytes();
+    if (b.length == 0) {
+      return true;
+    }
+
+    for (int i = 0; i <= bytes.length - b.length; i++) {
+      // TODO: Avoid copying.
+      if (bytes[i] == b[0] && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public boolean startsWith(final UTF8String prefix) {
+    final byte[] b = prefix.getBytes();
+    // TODO: Avoid copying.
+    return b.length <= bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b);
+  }
+
+  public boolean endsWith(final UTF8String suffix) {
+    final byte[] b = suffix.getBytes();
+    return b.length <= bytes.length &&
+      Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b);
+  }
+
+  public UTF8String toUpperCase() {
+    return UTF8String.fromString(toString().toUpperCase());
+  }
+
+  public UTF8String toLowerCase() {
+    return UTF8String.fromString(toString().toLowerCase());
+  }
+
+  @Override
+  public String toString() {
+    try {
+      return new String(bytes, "utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // Turn the exception into unchecked so we can find out about it at runtime, but
+      // don't need to add lots of boilerplate code everywhere.
+      PlatformDependent.throwException(e);
+      return "unknown";  // we will never reach here.
+    }
+  }
+
+  @Override
+  public UTF8String clone() {
+    return new UTF8String().set(bytes);
+  }
+
+  @Override
+  public int compareTo(final UTF8String other) {
+    final byte[] b = other.getBytes();
+    for (int i = 0; i < bytes.length && i < b.length; i++) {
+      int res = bytes[i] - b[i];
+      if (res != 0) {
+        return res;
+      }
+    }
+    return bytes.length - b.length;
+  }
+
+  public int compare(final UTF8String other) {
+    return compareTo(other);
+  }
+
+  @Override
+  public boolean equals(final Object other) {
+    if (other instanceof UTF8String) {
+      return Arrays.equals(bytes, ((UTF8String) other).getBytes());
+    } else if (other instanceof String) {
+      // Used only in unit tests.
+      String s = (String) other;
+      return bytes.length >= s.length() && length() == s.length() && toString().equals(s);
+    } else {
+      return false;
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    return Arrays.hashCode(bytes);
+  }
+}
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
index 18393db9f382f..a93fc0ee297c4 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
@@ -18,7 +18,6 @@
 package org.apache.spark.unsafe.bitset;
 
 import junit.framework.Assert;
-import org.apache.spark.unsafe.bitset.BitSet;
 import org.junit.Test;
 
 import org.apache.spark.unsafe.memory.MemoryBlock;
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
new file mode 100644
index 0000000000000..80c179a1b5e75
--- /dev/null
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -0,0 +1,93 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.unsafe.types;
+
+import java.io.UnsupportedEncodingException;
+
+import junit.framework.Assert;
+import org.junit.Test;
+
+public class UTF8StringSuite {
+
+  private void checkBasic(String str, int len) throws UnsupportedEncodingException {
+    Assert.assertEquals(UTF8String.fromString(str).length(), len);
+    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).length(), len);
+
+    Assert.assertEquals(UTF8String.fromString(str), str);
+    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), str);
+    Assert.assertEquals(UTF8String.fromString(str).toString(), str);
+    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).toString(), str);
+    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), UTF8String.fromString(str));
+
+    Assert.assertEquals(UTF8String.fromString(str).hashCode(),
+      UTF8String.fromBytes(str.getBytes("utf8")).hashCode());
+  }
+
+  @Test
+  public void basicTest() throws UnsupportedEncodingException {
+    checkBasic("hello", 5);
+    checkBasic("世 界", 3);
+  }
+
+  @Test
+  public void contains() {
+    Assert.assertTrue(UTF8String.fromString("hello").contains(UTF8String.fromString("ello")));
+    Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("vello")));
+    Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo")));
+    Assert.assertTrue(UTF8String.fromString("大千世界").contains(UTF8String.fromString("千世")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").contains(UTF8String.fromString("世千")));
+    Assert.assertFalse(
+      UTF8String.fromString("大千世界").contains(UTF8String.fromString("大千世界好")));
+  }
+
+  @Test
+  public void startsWith() {
+    Assert.assertTrue(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell")));
+    Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell")));
+    Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo")));
+    Assert.assertTrue(UTF8String.fromString("数据砖头").startsWith(UTF8String.fromString("数据")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千")));
+    Assert.assertFalse(
+      UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千世界好")));
+  }
+
+  @Test
+  public void endsWith() {
+    Assert.assertTrue(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello")));
+    Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov")));
+    Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello")));
+    Assert.assertTrue(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世")));
+    Assert.assertFalse(
+      UTF8String.fromString("数据砖头").endsWith(UTF8String.fromString("我的数据砖头")));
+  }
+
+  @Test
+  public void substring() {
+    Assert.assertEquals(
+      UTF8String.fromString("hello").substring(0, 0), UTF8String.fromString(""));
+    Assert.assertEquals(
+      UTF8String.fromString("hello").substring(1, 3), UTF8String.fromString("el"));
+    Assert.assertEquals(
+      UTF8String.fromString("数据砖头").substring(0, 1), UTF8String.fromString("数"));
+    Assert.assertEquals(
+      UTF8String.fromString("数据砖头").substring(1, 3), UTF8String.fromString("据砖"));
+    Assert.assertEquals(
+      UTF8String.fromString("数据砖头").substring(3, 5), UTF8String.fromString("头"));
+  }
+}

From 7914c720bf7447e8c9d96d564eafd6b687d2fc1a Mon Sep 17 00:00:00 2001
From: Zhongshuai Pei <799203320@qq.com>
Date: Thu, 11 Jun 2015 17:01:02 -0700
Subject: [PATCH 459/525] [SPARK-7824] [SQL] Collapse operator reordering and
 constant folding into a single batch.

SQL
```
select * from tableA join tableB on (a > 3 and b = d) or (a > 3 and b = e)
```
Plan before modify
```
== Optimized Logical Plan ==
Project [a#293,b#294,c#295,d#296,e#297]
 Join Inner, Some(((a#293 > 3) && ((b#294 = d#296) || (b#294 = e#297))))
  MetastoreRelation default, tablea, None
  MetastoreRelation default, tableb, None
```
Plan after modify
```
== Optimized Logical Plan ==
Project [a#293,b#294,c#295,d#296,e#297]
 Join Inner, Some(((b#294 = d#296) || (b#294 = e#297)))
  Filter (a#293 > 3)
   MetastoreRelation default, tablea, None
  MetastoreRelation default, tableb, None
```

CombineLimits ==> Limit(If(LessThan(ne, le), ne, le), grandChild) and LessThan is in BooleanSimplification ,  so CombineLimits  must before BooleanSimplification and BooleanSimplification must before PushPredicateThroughJoin.

Author: Zhongshuai Pei <799203320@qq.com>
Author: DoingDone9 <799203320@qq.com>

Closes #6351 from DoingDone9/master and squashes the following commits:

20de7be [Zhongshuai Pei] Update Optimizer.scala
7bc7d28 [Zhongshuai Pei] Merge pull request #17 from apache/master
0ba5f42 [Zhongshuai Pei] Update Optimizer.scala
f8b9314 [Zhongshuai Pei] Update FilterPushdownSuite.scala
c529d9f [Zhongshuai Pei] Update FilterPushdownSuite.scala
ae3af6d [Zhongshuai Pei] Update FilterPushdownSuite.scala
a04ffae [Zhongshuai Pei] Update Optimizer.scala
11beb61 [Zhongshuai Pei] Update FilterPushdownSuite.scala
f2ee5fe [Zhongshuai Pei] Update Optimizer.scala
be6b1d5 [Zhongshuai Pei] Update Optimizer.scala
b01e622 [Zhongshuai Pei] Merge pull request #15 from apache/master
8df716a [Zhongshuai Pei] Update FilterPushdownSuite.scala
d98bc35 [Zhongshuai Pei] Update FilterPushdownSuite.scala
fa65718 [Zhongshuai Pei] Update Optimizer.scala
ab8e9a6 [Zhongshuai Pei] Merge pull request #14 from apache/master
14952e2 [Zhongshuai Pei] Merge pull request #13 from apache/master
f03fe7f [Zhongshuai Pei] Merge pull request #12 from apache/master
f12fa50 [Zhongshuai Pei] Merge pull request #10 from apache/master
f61210c [Zhongshuai Pei] Merge pull request #9 from apache/master
34b1a9a [Zhongshuai Pei] Merge pull request #8 from apache/master
802261c [DoingDone9] Merge pull request #7 from apache/master
d00303b [DoingDone9] Merge pull request #6 from apache/master
98b134f [DoingDone9] Merge pull request #5 from apache/master
161cae3 [DoingDone9] Merge pull request #4 from apache/master
c87e8b6 [DoingDone9] Merge pull request #3 from apache/master
cb1852d [DoingDone9] Merge pull request #2 from apache/master
c3f046f [DoingDone9] Merge pull request #1 from apache/master
---
 .../sql/catalyst/optimizer/Optimizer.scala    |  7 ++---
 .../optimizer/FilterPushdownSuite.scala       | 31 ++++++++++++++-----
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index c16f08d389955..f8f1efcc7e990 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -38,21 +38,20 @@ object DefaultOptimizer extends Optimizer {
       EliminateSubQueries) ::
     Batch("Distinct", FixedPoint(100),
       ReplaceDistinctWithAggregate) ::
-    Batch("Operator Reordering", FixedPoint(100),
+    Batch("Operator Optimizations", FixedPoint(100),
       UnionPushdown,
       CombineFilters,
       PushPredicateThroughProject,
-      PushPredicateThroughJoin,
       PushPredicateThroughGenerate,
       ColumnPruning,
       ProjectCollapsing,
-      CombineLimits) ::
-    Batch("ConstantFolding", FixedPoint(100),
+      CombineLimits,
       NullPropagation,
       OptimizeIn,
       ConstantFolding,
       LikeSimplification,
       BooleanSimplification,
+      PushPredicateThroughJoin,
       SimplifyFilters,
       SimplifyCasts,
       SimplifyCaseConversionExpressions) ::
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
index 17dc9124749e8..ffdc673cdc455 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -36,6 +36,7 @@ class FilterPushdownSuite extends PlanTest {
       Batch("Filter Pushdown", Once,
         CombineFilters,
         PushPredicateThroughProject,
+        BooleanSimplification,
         PushPredicateThroughJoin,
         PushPredicateThroughGenerate,
         ColumnPruning,
@@ -156,11 +157,9 @@ class FilterPushdownSuite extends PlanTest {
         .where('a === 1 && 'a === 2)
         .select('a).analyze
 
-
     comparePlans(optimized, correctAnswer)
   }
 
-
   test("joins: push to either side") {
     val x = testRelation.subquery('x)
     val y = testRelation.subquery('y)
@@ -198,6 +197,25 @@ class FilterPushdownSuite extends PlanTest {
     comparePlans(optimized, correctAnswer)
   }
 
+  test("joins: push to one side after transformCondition") {
+    val x = testRelation.subquery('x)
+    val y = testRelation1.subquery('y)
+
+    val originalQuery = {
+      x.join(y)
+       .where(("x.a".attr === 1 && "y.d".attr === "x.b".attr) ||
+              ("x.a".attr === 1 && "y.d".attr === "x.c".attr))
+    }
+
+    val optimized = Optimize.execute(originalQuery.analyze)
+    val left = testRelation.where('a === 1)
+    val right = testRelation1
+    val correctAnswer =
+      left.join(right, condition = Some("d".attr === "b".attr || "d".attr === "c".attr)).analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
+
   test("joins: rewrite filter to push to either side") {
     val x = testRelation.subquery('x)
     val y = testRelation.subquery('y)
@@ -563,17 +581,16 @@ class FilterPushdownSuite extends PlanTest {
     // push down invalid
     val originalQuery1 = {
       x.select('a, 'b)
-        .sortBy(SortOrder('a, Ascending))
-        .select('b)
+       .sortBy(SortOrder('a, Ascending))
+       .select('b)
     }
 
     val optimized1 = Optimize.execute(originalQuery1.analyze)
     val correctAnswer1 =
       x.select('a, 'b)
-        .sortBy(SortOrder('a, Ascending))
-        .select('b).analyze
+       .sortBy(SortOrder('a, Ascending))
+       .select('b).analyze
 
     comparePlans(optimized1, analysis.EliminateSubQueries(correctAnswer1))
-
   }
 }

From 337c16d57e40cb4967bf85269baae14745f161db Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 11 Jun 2015 17:06:21 -0700
Subject: [PATCH 460/525] [SQL] Miscellaneous SQL/DF expression changes.

SPARK-8201 conditional function: if
SPARK-8205 conditional function: nvl
SPARK-8208 math function: ceiling
SPARK-8210 math function: degrees
SPARK-8211 math function: radians
SPARK-8219 math function: negative
SPARK-8216 math function: rename log -> ln
SPARK-8222 math function: alias power / pow
SPARK-8225 math function: alias sign / signum
SPARK-8228 conditional function: isnull
SPARK-8229 conditional function: isnotnull
SPARK-8250 string function: alias lower/lcase
SPARK-8251 string function: alias upper / ucase

Author: Reynold Xin <rxin@databricks.com>

Closes #6754 from rxin/expressions-misc and squashes the following commits:

35fce15 [Reynold Xin] Removed println.
2647067 [Reynold Xin] Promote to string type.
3c32bbc [Reynold Xin] Fixed if.
de827ac [Reynold Xin] Fixed style
b201cd4 [Reynold Xin] Removed if.
6b21a9b [Reynold Xin] [SQL] Miscellaneous SQL/DF expression changes.
---
 .../catalyst/analysis/FunctionRegistry.scala  | 20 ++++++--
 .../catalyst/analysis/HiveTypeCoercion.scala  | 30 +++++++++++
 .../analysis/HiveTypeCoercionSuite.scala      | 13 +++++
 .../ConditionalExpressionSuite.scala          | 43 +++++++++++++++-
 .../spark/sql/ColumnExpressionSuite.scala     | 16 ++++++
 .../spark/sql/DataFrameFunctionsSuite.scala   | 29 +++++------
 .../spark/sql/MathExpressionsSuite.scala      | 51 ++++++++++++++++---
 7 files changed, 175 insertions(+), 27 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index a7816e327526f..45bcbf73fae98 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -84,43 +84,51 @@ object FunctionRegistry {
   type FunctionBuilder = Seq[Expression] => Expression
 
   val expressions: Map[String, FunctionBuilder] = Map(
-    // Non aggregate functions
+    // misc non-aggregate functions
     expression[Abs]("abs"),
     expression[CreateArray]("array"),
     expression[Coalesce]("coalesce"),
     expression[Explode]("explode"),
+    expression[If]("if"),
+    expression[IsNull]("isnull"),
+    expression[IsNotNull]("isnotnull"),
+    expression[Coalesce]("nvl"),
     expression[Rand]("rand"),
     expression[Randn]("randn"),
     expression[CreateStruct]("struct"),
     expression[Sqrt]("sqrt"),
 
-    // Math functions
+    // math functions
     expression[Acos]("acos"),
     expression[Asin]("asin"),
     expression[Atan]("atan"),
     expression[Atan2]("atan2"),
     expression[Cbrt]("cbrt"),
     expression[Ceil]("ceil"),
+    expression[Ceil]("ceiling"),
     expression[Cos]("cos"),
     expression[EulerNumber]("e"),
     expression[Exp]("exp"),
     expression[Expm1]("expm1"),
     expression[Floor]("floor"),
     expression[Hypot]("hypot"),
-    expression[Log]("log"),
+    expression[Log]("ln"),
     expression[Log10]("log10"),
     expression[Log1p]("log1p"),
+    expression[UnaryMinus]("negative"),
     expression[Pi]("pi"),
     expression[Log2]("log2"),
     expression[Pow]("pow"),
+    expression[Pow]("power"),
     expression[Rint]("rint"),
+    expression[Signum]("sign"),
     expression[Signum]("signum"),
     expression[Sin]("sin"),
     expression[Sinh]("sinh"),
     expression[Tan]("tan"),
     expression[Tanh]("tanh"),
-    expression[ToDegrees]("todegrees"),
-    expression[ToRadians]("toradians"),
+    expression[ToDegrees]("degrees"),
+    expression[ToRadians]("radians"),
 
     // aggregate functions
     expression[Average]("avg"),
@@ -132,10 +140,12 @@ object FunctionRegistry {
     expression[Sum]("sum"),
 
     // string functions
+    expression[Lower]("lcase"),
     expression[Lower]("lower"),
     expression[StringLength]("length"),
     expression[Substring]("substr"),
     expression[Substring]("substring"),
+    expression[Upper]("ucase"),
     expression[Upper]("upper")
   )
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 737905c3582ba..6ed192360dd62 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -58,6 +58,15 @@ object HiveTypeCoercion {
     case _ => None
   }
 
+  /** Similar to [[findTightestCommonType]], but can promote all the way to StringType. */
+  private def findTightestCommonTypeToString(left: DataType, right: DataType): Option[DataType] = {
+    findTightestCommonTypeOfTwo(left, right).orElse((left, right) match {
+      case (StringType, t2: AtomicType) if t2 != BinaryType && t2 != BooleanType => Some(StringType)
+      case (t1: AtomicType, StringType) if t1 != BinaryType && t1 != BooleanType => Some(StringType)
+      case _ => None
+    })
+  }
+
   /**
    * Find the tightest common type of a set of types by continuously applying
    * `findTightestCommonTypeOfTwo` on these types.
@@ -91,6 +100,7 @@ trait HiveTypeCoercion {
     StringToIntegralCasts ::
     FunctionArgumentConversion ::
     CaseWhenCoercion ::
+    IfCoercion ::
     Division ::
     PropagateTypes ::
     ExpectedInputConversion ::
@@ -652,6 +662,26 @@ trait HiveTypeCoercion {
     }
   }
 
+  /**
+   * Coerces the type of different branches of If statement to a common type.
+   */
+  object IfCoercion extends Rule[LogicalPlan] {
+    def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+      // Find tightest common type for If, if the true value and false value have different types.
+      case i @ If(pred, left, right) if left.dataType != right.dataType =>
+        findTightestCommonTypeToString(left.dataType, right.dataType).map { widestType =>
+          val newLeft = if (left.dataType == widestType) left else Cast(left, widestType)
+          val newRight = if (right.dataType == widestType) right else Cast(right, widestType)
+          i.makeCopy(Array(pred, newLeft, newRight))
+        }.getOrElse(i)  // If there is no applicable conversion, leave expression unchanged.
+
+      // Convert If(null literal, _, _) into boolean type.
+      // In the optimizer, we should short-circuit this directly into false value.
+      case i @ If(pred, left, right) if pred.dataType == NullType =>
+        i.makeCopy(Array(Literal.create(null, BooleanType), left, right))
+    }
+  }
+
   /**
    * Casts types according to the expected input types for Expressions that have the trait
    * `ExpectsInputTypes`.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 9977f7af00f6b..f7b8e21bed490 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -134,6 +134,19 @@ class HiveTypeCoercionSuite extends PlanTest {
         :: Nil))
   }
 
+  test("type coercion for If") {
+    val rule = new HiveTypeCoercion { }.IfCoercion
+    ruleTest(rule,
+      If(Literal(true), Literal(1), Literal(1L)),
+      If(Literal(true), Cast(Literal(1), LongType), Literal(1L))
+    )
+
+    ruleTest(rule,
+      If(Literal.create(null, NullType), Literal(1), Literal(1)),
+      If(Literal.create(null, BooleanType), Literal(1), Literal(1))
+    )
+  }
+
   test("type coercion for CaseKeyWhen") {
     val cwc = new HiveTypeCoercion {}.CaseWhenCoercion
     ruleTest(cwc,
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
index 152c4e4111244..372848ea9a596 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
@@ -19,11 +19,52 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.{IntegerType, BooleanType}
+import org.apache.spark.sql.types._
 
 
 class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
 
+  test("if") {
+    val testcases = Seq[(java.lang.Boolean, Integer, Integer, Integer)](
+      (true, 1, 2, 1),
+      (false, 1, 2, 2),
+      (null, 1, 2, 2),
+      (true, null, 2, null),
+      (false, 1, null, null),
+      (null, null, 2, 2),
+      (null, 1, null, null)
+    )
+
+    // dataType must match T.
+    def testIf(convert: (Integer => Any), dataType: DataType): Unit = {
+      for ((predicate, trueValue, falseValue, expected) <- testcases) {
+        val trueValueConverted = if (trueValue == null) null else convert(trueValue)
+        val falseValueConverted = if (falseValue == null) null else convert(falseValue)
+        val expectedConverted = if (expected == null) null else convert(expected)
+
+        checkEvaluation(
+          If(Literal.create(predicate, BooleanType),
+            Literal.create(trueValueConverted, dataType),
+            Literal.create(falseValueConverted, dataType)),
+          expectedConverted)
+      }
+    }
+
+    testIf(_ == 1, BooleanType)
+    testIf(_.toShort, ShortType)
+    testIf(identity, IntegerType)
+    testIf(_.toLong, LongType)
+
+    testIf(_.toFloat, FloatType)
+    testIf(_.toDouble, DoubleType)
+    testIf(Decimal(_), DecimalType.Unlimited)
+
+    testIf(identity, DateType)
+    testIf(_.toLong, TimestampType)
+
+    testIf(_.toString, StringType)
+  }
+
   test("case when") {
     val row = create_row(null, false, true, "a", "b", "c")
     val c1 = 'a.boolean.at(0)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 4f5484f1368d1..efcdae5bce031 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -185,12 +185,20 @@ class ColumnExpressionSuite extends QueryTest {
     checkAnswer(
       nullStrings.toDF.where($"s".isNull),
       nullStrings.collect().toSeq.filter(r => r.getString(1) eq null))
+
+    checkAnswer(
+      ctx.sql("select isnull(null), isnull(1)"),
+      Row(true, false))
   }
 
   test("isNotNull") {
     checkAnswer(
       nullStrings.toDF.where($"s".isNotNull),
       nullStrings.collect().toSeq.filter(r => r.getString(1) ne null))
+
+    checkAnswer(
+      ctx.sql("select isnotnull(null), isnotnull('a')"),
+      Row(false, true))
   }
 
   test("===") {
@@ -393,6 +401,10 @@ class ColumnExpressionSuite extends QueryTest {
       testData.select(upper(lit(null))),
       (1 to 100).map(n => Row(null))
     )
+
+    checkAnswer(
+      ctx.sql("SELECT upper('aB'), ucase('cDe')"),
+      Row("AB", "CDE"))
   }
 
   test("lower") {
@@ -410,6 +422,10 @@ class ColumnExpressionSuite extends QueryTest {
       testData.select(lower(lit(null))),
       (1 to 100).map(n => Row(null))
     )
+
+    checkAnswer(
+      ctx.sql("SELECT lower('aB'), lcase('cDe')"),
+      Row("ab", "cde"))
   }
 
   test("monotonicallyIncreasingId") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 659b64c185f43..cfd23867a9bba 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -110,7 +110,20 @@ class DataFrameFunctionsSuite extends QueryTest {
       testData2.collect().toSeq.map(r => Row(~r.getInt(0))))
   }
 
-  test("length") {
+  test("if function") {
+    val df = Seq((1, 2)).toDF("a", "b")
+    checkAnswer(
+      df.selectExpr("if(a = 1, 'one', 'not_one')", "if(b = 1, 'one', 'not_one')"),
+      Row("one", "not_one"))
+  }
+
+  test("nvl function") {
+    checkAnswer(
+      ctx.sql("SELECT nvl(null, 'x'), nvl('y', 'x'), nvl(null, null)"),
+      Row("x", "y", null))
+  }
+
+  test("string length function") {
     checkAnswer(
       nullStrings.select(strlen($"s"), strlen("s")),
       nullStrings.collect().toSeq.map { r =>
@@ -127,18 +140,4 @@ class DataFrameFunctionsSuite extends QueryTest {
         Row(l)
       })
   }
-
-  test("log2 functions test") {
-    val df = Seq((1, 2)).toDF("a", "b")
-    checkAnswer(
-      df.select(log2("b") + log2("a")),
-      Row(1))
-
-    checkAnswer(
-      ctx.sql("SELECT LOG2(8)"),
-      Row(3))
-    checkAnswer(
-      ctx.sql("SELECT LOG2(null)"),
-      Row(null))
-  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index 0a38af2b4c889..6561c3b2322c7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql
 
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.functions.{log => logarithm}
 
 
 private object MathExpressionsTestData {
@@ -151,20 +152,31 @@ class MathExpressionsSuite extends QueryTest {
     testOneToOneMathFunction(tanh, math.tanh)
   }
 
-  test("toDeg") {
+  test("toDegrees") {
     testOneToOneMathFunction(toDegrees, math.toDegrees)
+    checkAnswer(
+      ctx.sql("SELECT degrees(0), degrees(1), degrees(1.5)"),
+      Seq((1, 2)).toDF().select(toDegrees(lit(0)), toDegrees(lit(1)), toDegrees(lit(1.5)))
+    )
   }
 
-  test("toRad") {
+  test("toRadians") {
     testOneToOneMathFunction(toRadians, math.toRadians)
+    checkAnswer(
+      ctx.sql("SELECT radians(0), radians(1), radians(1.5)"),
+      Seq((1, 2)).toDF().select(toRadians(lit(0)), toRadians(lit(1)), toRadians(lit(1.5)))
+    )
   }
 
   test("cbrt") {
     testOneToOneMathFunction(cbrt, math.cbrt)
   }
 
-  test("ceil") {
+  test("ceil and ceiling") {
     testOneToOneMathFunction(ceil, math.ceil)
+    checkAnswer(
+      ctx.sql("SELECT ceiling(0), ceiling(1), ceiling(1.5)"),
+      Row(0.0, 1.0, 2.0))
   }
 
   test("floor") {
@@ -183,12 +195,21 @@ class MathExpressionsSuite extends QueryTest {
     testOneToOneMathFunction(expm1, math.expm1)
   }
 
-  test("signum") {
+  test("signum / sign") {
     testOneToOneMathFunction[Double](signum, math.signum)
+
+    checkAnswer(
+      ctx.sql("SELECT sign(10), signum(-11)"),
+      Row(1, -1))
   }
 
-  test("pow") {
+  test("pow / power") {
     testTwoToOneMathFunction(pow, pow, math.pow)
+
+    checkAnswer(
+      ctx.sql("SELECT pow(1, 2), power(2, 1)"),
+      Seq((1, 2)).toDF().select(pow(lit(1), lit(2)), pow(lit(2), lit(1)))
+    )
   }
 
   test("hypot") {
@@ -199,8 +220,12 @@ class MathExpressionsSuite extends QueryTest {
     testTwoToOneMathFunction(atan2, atan2, math.atan2)
   }
 
-  test("log") {
+  test("log / ln") {
     testOneToOneNonNegativeMathFunction(org.apache.spark.sql.functions.log, math.log)
+    checkAnswer(
+      ctx.sql("SELECT ln(0), ln(1), ln(1.5)"),
+      Seq((1, 2)).toDF().select(logarithm(lit(0)), logarithm(lit(1)), logarithm(lit(1.5)))
+    )
   }
 
   test("log10") {
@@ -211,4 +236,18 @@ class MathExpressionsSuite extends QueryTest {
     testOneToOneNonNegativeMathFunction(log1p, math.log1p)
   }
 
+  test("log2") {
+    val df = Seq((1, 2)).toDF("a", "b")
+    checkAnswer(
+      df.select(log2("b") + log2("a")),
+      Row(1))
+
+    checkAnswer(ctx.sql("SELECT LOG2(8), LOG2(null)"), Row(3, null))
+  }
+
+  test("negative") {
+    checkAnswer(
+      ctx.sql("SELECT negative(1), negative(0), negative(-1)"),
+      Row(-1, 0, 1))
+  }
 }

From 767cc94ca6d397ba19226996ccb3c8e57083c549 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Thu, 11 Jun 2015 18:01:32 -0700
Subject: [PATCH 461/525] [SPARK-7158] [SQL] Fix bug of cached data cannot be
 used in collect() after cache()

When df.cache() method called, the `withCachedData` of `QueryExecution` has been created, which mean it will not look up the cached tables when action method called afterward.

Author: Cheng Hao <hao.cheng@intel.com>

Closes #5714 from chenghao-intel/SPARK-7158 and squashes the following commits:

58ea8aa [Cheng Hao] style issue
2bf740f [Cheng Hao] create new QueryExecution instance for CacheManager
a5647d9 [Cheng Hao] hide the queryExecution of DataFrame
fbfd3c5 [Cheng Hao] make the DataFrame.queryExecution mutable for cache/persist/unpersist
---
 .../spark/sql/execution/CacheManager.scala    |  2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 26 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
index 5fcc48a67948b..a4b38d364d54a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
@@ -103,7 +103,7 @@ private[sql] class CacheManager(sqlContext: SQLContext) extends Logging {
             sqlContext.conf.useCompression,
             sqlContext.conf.columnBatchSize,
             storageLevel,
-            query.queryExecution.executedPlan,
+            sqlContext.executePlan(query.logicalPlan).executedPlan,
             tableName))
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 3ca5ff347dd0c..14ecd4e9a77dc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -123,6 +123,32 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     )
   }
 
+  test("SPARK-7158 collect and take return different results") {
+    import java.util.UUID
+    import org.apache.spark.sql.types._
+
+    val df = Seq(Tuple1(1), Tuple1(2), Tuple1(3)).toDF("index")
+    // we except the id is materialized once
+    def id: () => String = () => { UUID.randomUUID().toString() }
+
+    val dfWithId = df.withColumn("id", callUDF(id, StringType))
+    // Make a new DataFrame (actually the same reference to the old one)
+    val cached = dfWithId.cache()
+    // Trigger the cache
+    val d0 = dfWithId.collect()
+    val d1 = cached.collect()
+    val d2 = cached.collect()
+
+    // Since the ID is only materialized once, then all of the records
+    // should come from the cache, not by re-computing. Otherwise, the ID
+    // will be different
+    assert(d0.map(_(0)) === d2.map(_(0)))
+    assert(d0.map(_(1)) === d2.map(_(1)))
+
+    assert(d1.map(_(0)) === d2.map(_(0)))
+    assert(d1.map(_(1)) === d2.map(_(1)))
+  }
+
   test("grouping on nested fields") {
     sqlContext.read.json(sqlContext.sparkContext.parallelize(
       """{"nested": {"attribute": 1}, "value": 2}""" :: Nil))

From b9d177c5110cd054fdb9bcbeeb5f4ca9aa645dc1 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 11 Jun 2015 22:15:15 -0700
Subject: [PATCH 462/525] [SPARK-8317] [SQL] Do not push sort into shuffle in
 Exchange operator

In some cases, Spark SQL pushes sorting operations into the shuffle layer by specifying a key ordering as part of the shuffle dependency. I think that we should not do this:

- Since we do not delegate aggregation to Spark's shuffle, specifying the keyOrdering as part of the shuffle has no effect on the shuffle map side.
- By performing the shuffle ourselves (by inserting a sort operator after the shuffle instead), we can use the Exchange planner to choose specialized sorting implementations based on the types of rows being sorted.
- We can remove some complexity from SqlSerializer2 by not requiring it to know about sort orderings, since SQL's own sort operators will already perform the necessary defensive copying.

This patch removes Exchange's `canSortWithShuffle` path and the associated code in `SqlSerializer2`.  Shuffles that used to go through the `canSortWithShuffle` path would always wind up using Spark's `ExternalSorter` (inside of `HashShuffleReader`); to avoid a performance regression as a result of handling these shuffles ourselves, I've changed the SQLConf defaults so that external sorting is enabled by default.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6772 from JoshRosen/SPARK-8317 and squashes the following commits:

ebf9c0f [Josh Rosen] Do not push sort into shuffle in Exchange operator
bf3b4c8 [Josh Rosen] Enable external sort by default
---
 .../scala/org/apache/spark/sql/SQLConf.scala  |  2 +-
 .../apache/spark/sql/execution/Exchange.scala | 54 ++++++-------------
 .../sql/execution/SparkSqlSerializer2.scala   | 22 +++-----
 3 files changed, 24 insertions(+), 54 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index be786f9b7f49e..87f40482e31bb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -161,7 +161,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
     getConf(HIVE_VERIFY_PARTITIONPATH, "true").toBoolean
 
   /** When true the planner will use the external sort, which may spill to disk. */
-  private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT, "false").toBoolean
+  private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT, "true").toBoolean
 
   /**
    * Sort merge join would sort the two side of join first, and then iterate both sides together
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index f25d10fec0411..6fa7ccc6cc89b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -31,16 +31,6 @@ import org.apache.spark.sql.types.DataType
 import org.apache.spark.sql.{SQLContext, Row}
 import org.apache.spark.util.MutablePair
 
-object Exchange {
-  /**
-   * Returns true when the ordering expressions are a subset of the key.
-   * if true, ShuffledRDD can use `setKeyOrdering(orderingKey)` to sort within [[Exchange]].
-   */
-  def canSortWithShuffle(partitioning: Partitioning, desiredOrdering: Seq[SortOrder]): Boolean = {
-    desiredOrdering.map(_.child).toSet.subsetOf(partitioning.keyExpressions.toSet)
-  }
-}
-
 /**
  * :: DeveloperApi ::
  * Performs a shuffle that will result in the desired `newPartitioning`.  Optionally sorts each
@@ -143,7 +133,6 @@ case class Exchange(
   private def getSerializer(
       keySchema: Array[DataType],
       valueSchema: Array[DataType],
-      hasKeyOrdering: Boolean,
       numPartitions: Int): Serializer = {
     // It is true when there is no field that needs to be write out.
     // For now, we will not use SparkSqlSerializer2 when noField is true.
@@ -159,7 +148,7 @@ case class Exchange(
 
     val serializer = if (useSqlSerializer2) {
       logInfo("Using SparkSqlSerializer2.")
-      new SparkSqlSerializer2(keySchema, valueSchema, hasKeyOrdering)
+      new SparkSqlSerializer2(keySchema, valueSchema)
     } else {
       logInfo("Using SparkSqlSerializer.")
       new SparkSqlSerializer(sparkConf)
@@ -173,7 +162,7 @@ case class Exchange(
       case HashPartitioning(expressions, numPartitions) =>
         val keySchema = expressions.map(_.dataType).toArray
         val valueSchema = child.output.map(_.dataType).toArray
-        val serializer = getSerializer(keySchema, valueSchema, newOrdering.nonEmpty, numPartitions)
+        val serializer = getSerializer(keySchema, valueSchema, numPartitions)
         val part = new HashPartitioner(numPartitions)
 
         val rdd = if (needToCopyObjectsBeforeShuffle(part, serializer)) {
@@ -189,15 +178,12 @@ case class Exchange(
           }
         }
         val shuffled = new ShuffledRDD[Row, Row, Row](rdd, part)
-        if (newOrdering.nonEmpty) {
-          shuffled.setKeyOrdering(keyOrdering)
-        }
         shuffled.setSerializer(serializer)
         shuffled.map(_._2)
 
       case RangePartitioning(sortingExpressions, numPartitions) =>
         val keySchema = child.output.map(_.dataType).toArray
-        val serializer = getSerializer(keySchema, null, newOrdering.nonEmpty, numPartitions)
+        val serializer = getSerializer(keySchema, null, numPartitions)
 
         val childRdd = child.execute()
         val part: Partitioner = {
@@ -222,15 +208,12 @@ case class Exchange(
         }
 
         val shuffled = new ShuffledRDD[Row, Null, Null](rdd, part)
-        if (newOrdering.nonEmpty) {
-          shuffled.setKeyOrdering(keyOrdering)
-        }
         shuffled.setSerializer(serializer)
         shuffled.map(_._1)
 
       case SinglePartition =>
         val valueSchema = child.output.map(_.dataType).toArray
-        val serializer = getSerializer(null, valueSchema, hasKeyOrdering = false, 1)
+        val serializer = getSerializer(null, valueSchema, numPartitions = 1)
         val partitioner = new HashPartitioner(1)
 
         val rdd = if (needToCopyObjectsBeforeShuffle(partitioner, serializer)) {
@@ -306,29 +289,24 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
           child: SparkPlan): SparkPlan = {
         val needSort = rowOrdering.nonEmpty && child.outputOrdering != rowOrdering
         val needsShuffle = child.outputPartitioning != partitioning
-        val canSortWithShuffle = Exchange.canSortWithShuffle(partitioning, rowOrdering)
 
-        if (needSort && needsShuffle && canSortWithShuffle) {
-          Exchange(partitioning, rowOrdering, child)
+        val withShuffle = if (needsShuffle) {
+          Exchange(partitioning, Nil, child)
         } else {
-          val withShuffle = if (needsShuffle) {
-            Exchange(partitioning, Nil, child)
-          } else {
-            child
-          }
+          child
+        }
 
-          val withSort = if (needSort) {
-            if (sqlContext.conf.externalSortEnabled) {
-              ExternalSort(rowOrdering, global = false, withShuffle)
-            } else {
-              Sort(rowOrdering, global = false, withShuffle)
-            }
+        val withSort = if (needSort) {
+          if (sqlContext.conf.externalSortEnabled) {
+            ExternalSort(rowOrdering, global = false, withShuffle)
           } else {
-            withShuffle
+            Sort(rowOrdering, global = false, withShuffle)
           }
-
-          withSort
+        } else {
+          withShuffle
         }
+
+        withSort
       }
 
       if (meetsRequirements && compatible && !needsAnySort) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index 202e4488a64bf..15b6936acd59b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -86,7 +86,6 @@ private[sql] class Serializer2SerializationStream(
 private[sql] class Serializer2DeserializationStream(
     keySchema: Array[DataType],
     valueSchema: Array[DataType],
-    hasKeyOrdering: Boolean,
     in: InputStream)
   extends DeserializationStream with Logging  {
 
@@ -96,14 +95,9 @@ private[sql] class Serializer2DeserializationStream(
     if (schema == null) {
       () => null
     } else {
-      if (hasKeyOrdering) {
-        // We have key ordering specified in a ShuffledRDD, it is not safe to reuse a mutable row.
-        () => new GenericMutableRow(schema.length)
-      } else {
-        // It is safe to reuse the mutable row.
-        val mutableRow = new SpecificMutableRow(schema)
-        () => mutableRow
-      }
+      // It is safe to reuse the mutable row.
+      val mutableRow = new SpecificMutableRow(schema)
+      () => mutableRow
     }
   }
 
@@ -133,8 +127,7 @@ private[sql] class Serializer2DeserializationStream(
 
 private[sql] class SparkSqlSerializer2Instance(
     keySchema: Array[DataType],
-    valueSchema: Array[DataType],
-    hasKeyOrdering: Boolean)
+    valueSchema: Array[DataType])
   extends SerializerInstance {
 
   def serialize[T: ClassTag](t: T): ByteBuffer =
@@ -151,7 +144,7 @@ private[sql] class SparkSqlSerializer2Instance(
   }
 
   def deserializeStream(s: InputStream): DeserializationStream = {
-    new Serializer2DeserializationStream(keySchema, valueSchema, hasKeyOrdering, s)
+    new Serializer2DeserializationStream(keySchema, valueSchema, s)
   }
 }
 
@@ -164,14 +157,13 @@ private[sql] class SparkSqlSerializer2Instance(
  */
 private[sql] class SparkSqlSerializer2(
     keySchema: Array[DataType],
-    valueSchema: Array[DataType],
-    hasKeyOrdering: Boolean)
+    valueSchema: Array[DataType])
   extends Serializer
   with Logging
   with Serializable{
 
   def newInstance(): SerializerInstance =
-    new SparkSqlSerializer2Instance(keySchema, valueSchema, hasKeyOrdering)
+    new SparkSqlSerializer2Instance(keySchema, valueSchema)
 
   override def supportsRelocationOfSerializedObjects: Boolean = {
     // SparkSqlSerializer2 is stateless and writes no stream headers

From 2dd7f93080ee882afcc2aac1a419802a19a668ce Mon Sep 17 00:00:00 2001
From: "zhichao.li" <zhichao.li@intel.com>
Date: Thu, 11 Jun 2015 22:28:28 -0700
Subject: [PATCH 463/525] [SPARK-7862] [SQL] Fix the deadlock in script
 transformation for stderr

[Related PR SPARK-7044] (https://github.com/apache/spark/pull/5671)

Author: zhichao.li <zhichao.li@intel.com>

Closes #6404 from zhichao-li/transform and squashes the following commits:

8418c97 [zhichao.li] add comments and remove useless failAfter logic
d9677e1 [zhichao.li] redirect the error desitination to be the same as the current process
---
 .../sql/hive/execution/ScriptTransformation.scala    |  7 +++++++
 .../spark/sql/hive/execution/SQLQuerySuite.scala     | 12 ++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index fd623370cc407..28792db7686b5 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.hive.execution
 
 import java.io.{BufferedReader, DataInputStream, DataOutputStream, EOFException, InputStreamReader}
+import java.lang.ProcessBuilder.Redirect
 import java.util.Properties
 
 import scala.collection.JavaConversions._
@@ -58,6 +59,12 @@ case class ScriptTransformation(
     child.execute().mapPartitions { iter =>
       val cmd = List("/bin/bash", "-c", script)
       val builder = new ProcessBuilder(cmd)
+      // redirectError(Redirect.INHERIT) would consume the error output from buffer and
+      // then print it to stderr (inherit the target from the current Scala process).
+      // If without this there would be 2 issues:
+      // 1) The error msg generated by the script process would be hidden.
+      // 2) If the error msg is too big to chock up the buffer, the input logic would be hung
+      builder.redirectError(Redirect.INHERIT)
       val proc = builder.start()
       val inputStream = proc.getInputStream
       val outputStream = proc.getOutputStream
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 8bd4900497c4f..c8e5e246322df 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -645,12 +645,20 @@ class SQLQuerySuite extends QueryTest {
       .queryExecution.analyzed
   }
 
-  test("test script transform") {
+  test("test script transform for stdout") {
     val data = (1 to 100000).map { i => (i, i, i) }
     data.toDF("d1", "d2", "d3").registerTempTable("script_trans")
     assert(100000 ===
       sql("SELECT TRANSFORM (d1, d2, d3) USING 'cat' AS (a,b,c) FROM script_trans")
-      .queryExecution.toRdd.count())
+        .queryExecution.toRdd.count())
+  }
+
+  test("test script transform for stderr") {
+    val data = (1 to 100000).map { i => (i, i, i) }
+    data.toDF("d1", "d2", "d3").registerTempTable("script_trans")
+    assert(0 ===
+      sql("SELECT TRANSFORM (d1, d2, d3) USING 'cat 1>&2' AS (a,b,c) FROM script_trans")
+        .queryExecution.toRdd.count())
   }
 
   test("window function: udaf with aggregate expressin") {

From e428b3a951377d47aa80d5f26d6bab979e72e8ab Mon Sep 17 00:00:00 2001
From: Yash Datta <Yash.Datta@guavus.com>
Date: Fri, 12 Jun 2015 13:44:09 +0800
Subject: [PATCH 464/525] [SPARK-6566] [SQL] Related changes for newer parquet
 version

This brings in major improvement in that footers are not read on the driver. This also cleans up the code in parquetTableOperations, where we had to override getSplits to eliminate multiple listStatus calls.

cc liancheng

are there any other changes we need for this ?

Author: Yash Datta <Yash.Datta@guavus.com>

Closes #5889 from saucam/parquet_1.6 and squashes the following commits:

d1bf41e [Yash Datta] SPARK-7340: Fix scalastyle and incorporate review comments
c9aa042 [Yash Datta] SPARK-7340: Use the new user defined filter predicate for pushing down inset into parquet
56bc750 [Yash Datta] SPARK-7340: Change parquet version to latest release
---
 .../spark/sql/parquet/ParquetFilters.scala    |  42 +++-
 .../sql/parquet/ParquetTableOperations.scala  | 187 +-----------------
 2 files changed, 44 insertions(+), 185 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
index 4d659f261a3b7..d57b789f5c1c7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.parquet
 
+import java.io.Serializable
 import java.nio.ByteBuffer
 
 import com.google.common.io.BaseEncoding
@@ -24,7 +25,8 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.parquet.filter2.compat.FilterCompat
 import org.apache.parquet.filter2.compat.FilterCompat._
 import org.apache.parquet.filter2.predicate.FilterApi._
-import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
+import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Statistics}
+import org.apache.parquet.filter2.predicate.UserDefinedPredicate
 import org.apache.parquet.io.api.Binary
 
 import org.apache.spark.SparkEnv
@@ -42,6 +44,18 @@ private[sql] object ParquetFilters {
     }.reduceOption(FilterApi.and).map(FilterCompat.get)
   }
 
+  case class SetInFilter[T <: Comparable[T]](
+    valueSet: Set[T]) extends UserDefinedPredicate[T] with Serializable {
+
+    override def keep(value: T): Boolean = {
+      value != null && valueSet.contains(value)
+    }
+
+    override def canDrop(statistics: Statistics[T]): Boolean = false
+
+    override def inverseCanDrop(statistics: Statistics[T]): Boolean = false
+  }
+
   private val makeEq: PartialFunction[DataType, (String, Any) => FilterPredicate] = {
     case BooleanType =>
       (n: String, v: Any) => FilterApi.eq(booleanColumn(n), v.asInstanceOf[java.lang.Boolean])
@@ -154,6 +168,29 @@ private[sql] object ParquetFilters {
         FilterApi.gtEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]]))
   }
 
+  private val makeInSet: PartialFunction[DataType, (String, Set[Any]) => FilterPredicate] = {
+    case IntegerType =>
+      (n: String, v: Set[Any]) =>
+        FilterApi.userDefined(intColumn(n), SetInFilter(v.asInstanceOf[Set[java.lang.Integer]]))
+    case LongType =>
+      (n: String, v: Set[Any]) =>
+        FilterApi.userDefined(longColumn(n), SetInFilter(v.asInstanceOf[Set[java.lang.Long]]))
+    case FloatType =>
+      (n: String, v: Set[Any]) =>
+        FilterApi.userDefined(floatColumn(n), SetInFilter(v.asInstanceOf[Set[java.lang.Float]]))
+    case DoubleType =>
+      (n: String, v: Set[Any]) =>
+        FilterApi.userDefined(doubleColumn(n), SetInFilter(v.asInstanceOf[Set[java.lang.Double]]))
+    case StringType =>
+      (n: String, v: Set[Any]) =>
+        FilterApi.userDefined(binaryColumn(n),
+          SetInFilter(v.map(e => Binary.fromByteArray(e.asInstanceOf[UTF8String].getBytes))))
+    case BinaryType =>
+      (n: String, v: Set[Any]) =>
+        FilterApi.userDefined(binaryColumn(n),
+          SetInFilter(v.map(e => Binary.fromByteArray(e.asInstanceOf[Array[Byte]]))))
+  }
+
   /**
    * Converts data sources filters to Parquet filter predicates.
    */
@@ -285,6 +322,9 @@ private[sql] object ParquetFilters {
       case Not(pred) =>
         createFilter(pred).map(FilterApi.not)
 
+      case InSet(NamedExpression(name, dataType), valueSet) =>
+        makeInSet.lift(dataType).map(_(name, valueSet))
+
       case _ => None
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 1e694f2feabee..272608d4e2a09 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -117,6 +117,9 @@ private[sql] case class ParquetTableScan(
       SQLConf.PARQUET_CACHE_METADATA,
       sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true"))
 
+    // Use task side metadata in parquet
+    conf.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true);
+
     val baseRDD =
       new org.apache.spark.rdd.NewHadoopRDD(
         sc,
@@ -453,190 +456,6 @@ private[parquet] class FilteringParquetRowInputFormat
     }
   }
 
-  // This is only a temporary solution sicne we need to use fileStatuses in
-  // both getClientSideSplits and getTaskSideSplits. It can be removed once we get rid of these
-  // two methods.
-  override def getSplits(jobContext: JobContext): JList[InputSplit] = {
-    // First set fileStatuses.
-    val statuses = listStatus(jobContext)
-    fileStatuses = statuses.map(file => file.getPath -> file).toMap
-
-    super.getSplits(jobContext)
-  }
-
-  // TODO Remove this method and related code once PARQUET-16 is fixed
-  // This method together with the `getFooters` method and the `fileStatuses` field are just used
-  // to mimic this PR: https://github.com/apache/incubator-parquet-mr/pull/17
-  override def getSplits(
-      configuration: Configuration,
-      footers: JList[Footer]): JList[ParquetInputSplit] = {
-
-    // Use task side strategy by default
-    val taskSideMetaData = configuration.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
-    val maxSplitSize: JLong = configuration.getLong("mapred.max.split.size", Long.MaxValue)
-    val minSplitSize: JLong =
-      Math.max(getFormatMinSplitSize, configuration.getLong("mapred.min.split.size", 0L))
-    if (maxSplitSize < 0 || minSplitSize < 0) {
-      throw new ParquetDecodingException(
-        s"maxSplitSize or minSplitSie should not be negative: maxSplitSize = $maxSplitSize;" +
-          s" minSplitSize = $minSplitSize")
-    }
-
-    // Uses strict type checking by default
-    val getGlobalMetaData =
-      classOf[ParquetFileWriter].getDeclaredMethod("getGlobalMetaData", classOf[JList[Footer]])
-    getGlobalMetaData.setAccessible(true)
-    var globalMetaData = getGlobalMetaData.invoke(null, footers).asInstanceOf[GlobalMetaData]
-
-    if (globalMetaData == null) {
-     val splits = mutable.ArrayBuffer.empty[ParquetInputSplit]
-     return splits
-    }
-
-    val metadata = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
-    val mergedMetadata = globalMetaData
-      .getKeyValueMetaData
-      .updated(RowReadSupport.SPARK_METADATA_KEY, setAsJavaSet(Set(metadata)))
-
-    globalMetaData = new GlobalMetaData(globalMetaData.getSchema,
-      mergedMetadata, globalMetaData.getCreatedBy)
-
-    val readContext = ParquetInputFormat.getReadSupportInstance(configuration).init(
-      new InitContext(configuration,
-        globalMetaData.getKeyValueMetaData,
-        globalMetaData.getSchema))
-
-    if (taskSideMetaData){
-      logInfo("Using Task Side Metadata Split Strategy")
-      getTaskSideSplits(configuration,
-        footers,
-        maxSplitSize,
-        minSplitSize,
-        readContext)
-    } else {
-      logInfo("Using Client Side Metadata Split Strategy")
-      getClientSideSplits(configuration,
-        footers,
-        maxSplitSize,
-        minSplitSize,
-        readContext)
-    }
-
-  }
-
-  def getClientSideSplits(
-    configuration: Configuration,
-    footers: JList[Footer],
-    maxSplitSize: JLong,
-    minSplitSize: JLong,
-    readContext: ReadContext): JList[ParquetInputSplit] = {
-
-    import org.apache.parquet.filter2.compat.FilterCompat.Filter
-    import org.apache.parquet.filter2.compat.RowGroupFilter
-
-    import org.apache.spark.sql.parquet.FilteringParquetRowInputFormat.blockLocationCache
-
-    val cacheMetadata = configuration.getBoolean(SQLConf.PARQUET_CACHE_METADATA, true)
-
-    val splits = mutable.ArrayBuffer.empty[ParquetInputSplit]
-    val filter: Filter = ParquetInputFormat.getFilter(configuration)
-    var rowGroupsDropped: Long = 0
-    var totalRowGroups: Long = 0
-
-    // Ugly hack, stuck with it until PR:
-    // https://github.com/apache/incubator-parquet-mr/pull/17
-    // is resolved
-    val generateSplits =
-      Class.forName("org.apache.parquet.hadoop.ClientSideMetadataSplitStrategy")
-       .getDeclaredMethods.find(_.getName == "generateSplits").getOrElse(
-         sys.error(s"Failed to reflectively invoke ClientSideMetadataSplitStrategy.generateSplits"))
-    generateSplits.setAccessible(true)
-
-    for (footer <- footers) {
-      val fs = footer.getFile.getFileSystem(configuration)
-      val file = footer.getFile
-      val status = fileStatuses.getOrElse(file, fs.getFileStatus(file))
-      val parquetMetaData = footer.getParquetMetadata
-      val blocks = parquetMetaData.getBlocks
-      totalRowGroups = totalRowGroups + blocks.size
-      val filteredBlocks = RowGroupFilter.filterRowGroups(
-        filter,
-        blocks,
-        parquetMetaData.getFileMetaData.getSchema)
-      rowGroupsDropped = rowGroupsDropped + (blocks.size - filteredBlocks.size)
-
-      if (!filteredBlocks.isEmpty){
-          var blockLocations: Array[BlockLocation] = null
-          if (!cacheMetadata) {
-            blockLocations = fs.getFileBlockLocations(status, 0, status.getLen)
-          } else {
-            blockLocations = blockLocationCache.get(status, new Callable[Array[BlockLocation]] {
-              def call(): Array[BlockLocation] = fs.getFileBlockLocations(status, 0, status.getLen)
-            })
-          }
-          splits.addAll(
-            generateSplits.invoke(
-              null,
-              filteredBlocks,
-              blockLocations,
-              status,
-              readContext.getRequestedSchema.toString,
-              readContext.getReadSupportMetadata,
-              minSplitSize,
-              maxSplitSize).asInstanceOf[JList[ParquetInputSplit]])
-        }
-    }
-
-    if (rowGroupsDropped > 0 && totalRowGroups > 0){
-      val percentDropped = ((rowGroupsDropped/totalRowGroups.toDouble) * 100).toInt
-      logInfo(s"Dropping $rowGroupsDropped row groups that do not pass filter predicate "
-        + s"($percentDropped %) !")
-    }
-    else {
-      logInfo("There were no row groups that could be dropped due to filter predicates")
-    }
-    splits
-
-  }
-
-  def getTaskSideSplits(
-    configuration: Configuration,
-    footers: JList[Footer],
-    maxSplitSize: JLong,
-    minSplitSize: JLong,
-    readContext: ReadContext): JList[ParquetInputSplit] = {
-
-    val splits = mutable.ArrayBuffer.empty[ParquetInputSplit]
-
-    // Ugly hack, stuck with it until PR:
-    // https://github.com/apache/incubator-parquet-mr/pull/17
-    // is resolved
-    val generateSplits =
-      Class.forName("org.apache.parquet.hadoop.TaskSideMetadataSplitStrategy")
-       .getDeclaredMethods.find(_.getName == "generateTaskSideMDSplits").getOrElse(
-         sys.error(
-           s"Failed to reflectively invoke TaskSideMetadataSplitStrategy.generateTaskSideMDSplits"))
-    generateSplits.setAccessible(true)
-
-    for (footer <- footers) {
-      val file = footer.getFile
-      val fs = file.getFileSystem(configuration)
-      val status = fileStatuses.getOrElse(file, fs.getFileStatus(file))
-      val blockLocations = fs.getFileBlockLocations(status, 0, status.getLen)
-      splits.addAll(
-        generateSplits.invoke(
-         null,
-         blockLocations,
-         status,
-         readContext.getRequestedSchema.toString,
-         readContext.getReadSupportMetadata,
-         minSplitSize,
-         maxSplitSize).asInstanceOf[JList[ParquetInputSplit]])
-    }
-
-    splits
-  }
-
 }
 
 private[parquet] object FilteringParquetRowInputFormat {

From c19c78577a211eefe1112ebd4670a4ce7c3cc3be Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Fri, 12 Jun 2015 16:38:28 +0800
Subject: [PATCH 465/525] [SQL] [MINOR] correct semanticEquals logic

It's a follow up of https://github.com/apache/spark/pull/6173, for expressions like `Coalesce` that have a `Seq[Expression]`, when we do semantic equal check for it, we need to do semantic equal check for all of its children.
Also we can just use `Seq[(Expression, NamedExpression)]` instead of `Map[Expression, NamedExpression]` as we only search it with `find`.

chenghao-intel, I agree that we probably never knows `semanticEquals` in a general way, but I think we have done that in `TreeNode`, so we can use similar logic. Then we can handle something like `Coalesce(children: Seq[Expression])` correctly.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #6261 from cloud-fan/tmp and squashes the following commits:

4daef88 [Wenchen Fan] address comments
dd8fbd9 [Wenchen Fan] correct semanticEquals
---
 .../sql/catalyst/expressions/Expression.scala  | 13 +++++++++----
 .../spark/sql/catalyst/planning/patterns.scala | 18 ++++++++----------
 .../sql/execution/GeneratedAggregate.scala     | 14 +++++++-------
 .../org/apache/spark/sql/SQLQuerySuite.scala   |  2 +-
 4 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 8c1e4d74f9df1..0b9f621fed7cf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -136,12 +136,17 @@ abstract class Expression extends TreeNode[Expression] {
    * cosmetically (i.e. capitalization of names in attributes may be different).
    */
   def semanticEquals(other: Expression): Boolean = this.getClass == other.getClass && {
+    def checkSemantic(elements1: Seq[Any], elements2: Seq[Any]): Boolean = {
+      elements1.length == elements2.length && elements1.zip(elements2).forall {
+        case (e1: Expression, e2: Expression) => e1 semanticEquals e2
+        case (Some(e1: Expression), Some(e2: Expression)) => e1 semanticEquals e2
+        case (t1: Traversable[_], t2: Traversable[_]) => checkSemantic(t1.toSeq, t2.toSeq)
+        case (i1, i2) => i1 == i2
+      }
+    }
     val elements1 = this.productIterator.toSeq
     val elements2 = other.asInstanceOf[Product].productIterator.toSeq
-    elements1.length == elements2.length && elements1.zip(elements2).forall {
-      case (e1: Expression, e2: Expression) => e1 semanticEquals e2
-      case (i1, i2) => i1 == i2
-    }
+    checkSemantic(elements1, elements2)
   }
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
index 1dd75a8846303..3b6f8bfd9ff9b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
@@ -143,11 +143,11 @@ object PartialAggregation {
         // We need to pass all grouping expressions though so the grouping can happen a second
         // time. However some of them might be unnamed so we alias them allowing them to be
         // referenced in the second aggregation.
-        val namedGroupingExpressions: Map[Expression, NamedExpression] =
+        val namedGroupingExpressions: Seq[(Expression, NamedExpression)] =
           groupingExpressions.filter(!_.isInstanceOf[Literal]).map {
             case n: NamedExpression => (n, n)
             case other => (other, Alias(other, "PartialGroup")())
-          }.toMap
+          }
 
         // Replace aggregations with a new expression that computes the result from the already
         // computed partial evaluations and grouping values.
@@ -160,17 +160,15 @@ object PartialAggregation {
             // resolving struct field accesses, because `GetField` is not a `NamedExpression`.
             // (Should we just turn `GetField` into a `NamedExpression`?)
             val trimmed = e.transform { case Alias(g: ExtractValue, _) => g }
-            namedGroupingExpressions
-              .find { case (k, v) => k semanticEquals trimmed }
-              .map(_._2.toAttribute)
-              .getOrElse(e)
+            namedGroupingExpressions.collectFirst {
+              case (expr, ne) if expr semanticEquals trimmed => ne.toAttribute
+            }.getOrElse(e)
         }).asInstanceOf[Seq[NamedExpression]]
 
-        val partialComputation =
-          (namedGroupingExpressions.values ++
-            partialEvaluations.values.flatMap(_.partialEvaluations)).toSeq
+        val partialComputation = namedGroupingExpressions.map(_._2) ++
+          partialEvaluations.values.flatMap(_.partialEvaluations)
 
-        val namedGroupingAttributes = namedGroupingExpressions.values.map(_.toAttribute).toSeq
+        val namedGroupingAttributes = namedGroupingExpressions.map(_._2.toAttribute)
 
         Some(
           (namedGroupingAttributes,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index af3791734d0c9..1c40a9209f6d5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -214,18 +214,18 @@ case class GeneratedAggregate(
       }.toMap
 
     val namedGroups = groupingExpressions.zipWithIndex.map {
-      case (ne: NamedExpression, _) => (ne, ne)
-      case (e, i) => (e, Alias(e, s"GroupingExpr$i")())
+      case (ne: NamedExpression, _) => (ne, ne.toAttribute)
+      case (e, i) => (e, Alias(e, s"GroupingExpr$i")().toAttribute)
     }
 
-    val groupMap: Map[Expression, Attribute] =
-      namedGroups.map { case (k, v) => k -> v.toAttribute}.toMap
-
     // The set of expressions that produce the final output given the aggregation buffer and the
     // grouping expressions.
     val resultExpressions = aggregateExpressions.map(_.transform {
       case e: Expression if resultMap.contains(new TreeNodeRef(e)) => resultMap(new TreeNodeRef(e))
-      case e: Expression if groupMap.contains(e) => groupMap(e)
+      case e: Expression =>
+        namedGroups.collectFirst {
+          case (expr, attr) if expr semanticEquals e => attr
+        }.getOrElse(e)
     })
 
     val aggregationBufferSchema: StructType = StructType.fromAttributes(computationSchema)
@@ -265,7 +265,7 @@ case class GeneratedAggregate(
       val resultProjectionBuilder =
         newMutableProjection(
           resultExpressions,
-          (namedGroups.map(_._2.toAttribute) ++ computationSchema).toSeq)
+          namedGroups.map(_._2) ++ computationSchema)
       log.info(s"Result Projection: ${resultExpressions.mkString(",")}")
 
       val joinedRow = new JoinedRow3
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 14ecd4e9a77dc..6898d584414ba 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -697,7 +697,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
         row => Seq.fill(16)(Row.merge(row, row))).collect().toSeq)
   }
 
-  ignore("cartesian product join") {
+  test("cartesian product join") {
     checkAnswer(
       testData3.join(testData3),
       Row(1, null, 1, null) ::

From 71cc17bfa7ff32f820742fdc2c45237b624c5370 Mon Sep 17 00:00:00 2001
From: Mark Smith <mark.smith@bronto.com>
Date: Fri, 12 Jun 2015 08:19:03 -0700
Subject: [PATCH 466/525] =?UTF-8?q?[SPARK-8322]=20[EC2]=20Added=20spark=20?=
 =?UTF-8?q?1.4.0=20into=20the=20VALID=5FSPARK=5FVERSIONS=20and=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

… SPARK_TACHYON_MAP

This contribution is my original work and I license the work to the project under the project's open source license.

Author: Mark Smith <mark.smith@bronto.com>

Closes #6776 from markmsmith/SPARK-8322 and squashes the following commits:

d744244 [Mark Smith] [SPARK-8322][EC2] Fixed tachyon mapp entry to point to 0.6.4
e4f14d3 [Mark Smith] [SPARK-8322][EC2] Added spark 1.4.0 into the VALID_SPARK_VERSIONS and SPARK_TACHYON_MAP
---
 ec2/spark_ec2.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 58b24ae9ef500..56087499464e0 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -70,6 +70,7 @@
     "1.2.1",
     "1.3.0",
     "1.3.1",
+    "1.4.0",
 ])
 
 SPARK_TACHYON_MAP = {
@@ -82,6 +83,7 @@
     "1.2.1": "0.5.0",
     "1.3.0": "0.5.0",
     "1.3.1": "0.5.0",
+    "1.4.0": "0.6.4",
 }
 
 DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION

From 19834fa9184f0365a160bcb54bcd33eaa87c70dc Mon Sep 17 00:00:00 2001
From: akhilthatipamula <130050068@iitb.ac.in>
Date: Fri, 12 Jun 2015 10:40:28 -0700
Subject: [PATCH 467/525] [SPARK-7993] [SQL] Improved DataFrame.show() output

Closes #6633

Author: akhilthatipamula <130050068@iitb.ac.in>
Author: zsxwing <zsxwing@gmail.com>

Closes #6784 from zsxwing/pr6633 and squashes the following commits:

5da1c51 [zsxwing] Address comments and add unit tests
17eab7b [akhilthatipamula] refactored code
19874b3 [akhilthatipamula] Update DataFrame.scala
0a76a5e [akhilthatipamula] Optimised showString()
e3dd03f [akhilthatipamula] Modified showString() method
a21012b [akhilthatipamula] improved the show()
4bb742f [akhilthatipamula] Modified dataframe.show() method
---
 .../org/apache/spark/sql/DataFrame.scala      | 30 ++++++++---
 .../org/apache/spark/sql/DataFrameSuite.scala | 51 +++++++++++++++++++
 2 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 59f64dd4bc648..f041fd397b04b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -169,23 +169,34 @@ class DataFrame private[sql](
 
   /**
    * Internal API for Python
-   * @param numRows Number of rows to show
+   * @param _numRows Number of rows to show
    */
-  private[sql] def showString(numRows: Int): String = {
+  private[sql] def showString(_numRows: Int): String = {
+    val numRows = _numRows.max(0)
     val sb = new StringBuilder
-    val data = take(numRows)
+    val takeResult = take(numRows + 1)
+    val hasMoreData = takeResult.length > numRows
+    val data = takeResult.take(numRows)
     val numCols = schema.fieldNames.length
 
+    // For array values, replace Seq and Array with square brackets
     // For cells that are beyond 20 characters, replace it with the first 17 and "..."
     val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
       row.toSeq.map { cell =>
-        val str = if (cell == null) "null" else cell.toString
+        val str = cell match {
+          case null => "null"
+          case array: Array[_] => array.mkString("[", ", ", "]")
+          case seq: Seq[_] => seq.mkString("[", ", ", "]")
+          case _ => cell.toString
+        }
         if (str.length > 20) str.substring(0, 17) + "..." else str
       }: Seq[String]
     }
 
+    // Initialise the width of each column to a minimum value of '3'
+    val colWidths = Array.fill(numCols)(3)
+
     // Compute the width of each column
-    val colWidths = Array.fill(numCols)(0)
     for (row <- rows) {
       for ((cell, i) <- row.zipWithIndex) {
         colWidths(i) = math.max(colWidths(i), cell.length)
@@ -197,7 +208,7 @@ class DataFrame private[sql](
 
     // column names
     rows.head.zipWithIndex.map { case (cell, i) =>
-      StringUtils.leftPad(cell.toString, colWidths(i))
+      StringUtils.leftPad(cell, colWidths(i))
     }.addString(sb, "|", "|", "|\n")
 
     sb.append(sep)
@@ -210,6 +221,13 @@ class DataFrame private[sql](
     }
 
     sb.append(sep)
+
+    // For Data that has more than "numRows" records
+    if (hasMoreData) {
+      val rowsString = if (numRows == 1) "row" else "rows"
+      sb.append(s"only showing top $numRows ${rowsString}\n")
+    }
+
     sb.toString()
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index bb8621abe64ad..84835c0db765d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -469,12 +469,63 @@ class DataFrameSuite extends QueryTest {
     testData.select($"*").show(1000)
   }
 
+  test("showString(negative)") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           |+---+-----+
+                           |only showing top 0 rows
+                           |""".stripMargin
+    assert(testData.select($"*").showString(-1) === expectedAnswer)
+  }
+
+  test("showString(0)") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           |+---+-----+
+                           |only showing top 0 rows
+                           |""".stripMargin
+    assert(testData.select($"*").showString(0) === expectedAnswer)
+  }
+
+  test("showString: array") {
+    val df = Seq(
+      (Array(1, 2, 3), Array(1, 2, 3)),
+      (Array(2, 3, 4), Array(2, 3, 4))
+    ).toDF()
+    val expectedAnswer = """+---------+---------+
+                           ||       _1|       _2|
+                           |+---------+---------+
+                           ||[1, 2, 3]|[1, 2, 3]|
+                           ||[2, 3, 4]|[2, 3, 4]|
+                           |+---------+---------+
+                           |""".stripMargin
+    assert(df.showString(10) === expectedAnswer)
+  }
+
+  test("showString: minimum column width") {
+    val df = Seq(
+      (1, 1),
+      (2, 2)
+    ).toDF()
+    val expectedAnswer = """+---+---+
+                           || _1| _2|
+                           |+---+---+
+                           ||  1|  1|
+                           ||  2|  2|
+                           |+---+---+
+                           |""".stripMargin
+    assert(df.showString(10) === expectedAnswer)
+  }
+
   test("SPARK-7319 showString") {
     val expectedAnswer = """+---+-----+
                            ||key|value|
                            |+---+-----+
                            ||  1|    1|
                            |+---+-----+
+                           |only showing top 1 row
                            |""".stripMargin
     assert(testData.select($"*").showString(1) === expectedAnswer)
   }

From 88604051511c788d7abb41a49e3eb3a8330c09a9 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 12 Jun 2015 11:14:55 -0700
Subject: [PATCH 468/525] [SPARK-8330] DAG visualization: trim whitespace from
 input

Safeguard against DOM rewriting.

Author: Andrew Or <andrew@databricks.com>

Closes #6787 from andrewor14/dag-viz-trim and squashes the following commits:

0fb4afe [Andrew Or] Trim input metadata from DOM
---
 .../resources/org/apache/spark/ui/static/spark-dag-viz.js    | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
index 7a0dec2a3eaec..9fa53baaf4212 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
@@ -140,7 +140,8 @@ function renderDagViz(forJob) {
 
   // Find cached RDDs and mark them as such
   metadataContainer().selectAll(".cached-rdd").each(function(v) {
-    var nodeId = VizConstants.nodePrefix + d3.select(this).text();
+    var rddId = d3.select(this).text().trim();
+    var nodeId = VizConstants.nodePrefix + rddId;
     svg.selectAll("g." + nodeId).classed("cached", true);
   });
 
@@ -150,7 +151,7 @@ function renderDagViz(forJob) {
 /* Render the RDD DAG visualization on the stage page. */
 function renderDagVizForStage(svgContainer) {
   var metadata = metadataContainer().select(".stage-metadata");
-  var dot = metadata.select(".dot-file").text();
+  var dot = metadata.select(".dot-file").text().trim();
   var containerId = VizConstants.graphPrefix + metadata.attr("stage-id");
   var container = svgContainer.append("g").attr("id", containerId);
   renderDot(dot, container, false);

From e9471d3414d327c7d0853e18f1844ab1bd09c8ed Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Fri, 12 Jun 2015 15:22:59 -0700
Subject: [PATCH 469/525] [SPARK-7284] [STREAMING] Updated streaming
 documentation

- Kinesis API updated
- Kafka version updated, and Python API for Direct Kafka added
- Added SQLContext.getOrCreate()
- Added information on how to get partitionId in foreachRDD

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #6781 from tdas/SPARK-7284 and squashes the following commits:

aac7be0 [Tathagata Das] Added information on how to get partition id
a66ec22 [Tathagata Das] Complete the line incomplete line,
a92ca39 [Tathagata Das] Updated streaming documentation
---
 docs/streaming-kafka-integration.md   | 12 ++++-
 docs/streaming-kinesis-integration.md | 24 +++++----
 docs/streaming-programming-guide.md   | 70 +++++++++------------------
 3 files changed, 50 insertions(+), 56 deletions(-)

diff --git a/docs/streaming-kafka-integration.md b/docs/streaming-kafka-integration.md
index 998c8c994e4b4..02bc95d0e95f9 100644
--- a/docs/streaming-kafka-integration.md
+++ b/docs/streaming-kafka-integration.md
@@ -118,6 +118,13 @@ Next, we discuss how to use this approach in your streaming application.
 	See the [API docs](api/java/index.html?org/apache/spark/streaming/kafka/KafkaUtils.html)
 	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java).
 
+	</div>
+	<div data-lang="python" markdown="1">
+		from pyspark.streaming.kafka import KafkaUtils
+		directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
+
+	By default, the Python API will decode Kafka data as UTF8 encoded strings. You can specify your custom decoding function to decode the byte arrays in Kafka records to any arbitrary data type. See the [API docs](api/python/pyspark.streaming.html#pyspark.streaming.kafka.KafkaUtils)
+	and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/python/streaming/direct_kafka_wordcount.py).
 	</div>
 	</div>
 
@@ -147,10 +154,13 @@ Next, we discuss how to use this approach in your streaming application.
 		    }
 		);
 	</div>
+	<div data-lang="python" markdown="1">
+	Not supported
 	</div>
+   	</div>
 
 	You can use this to update Zookeeper yourself if you want Zookeeper-based Kafka monitoring tools to show progress of the streaming application.
 
 	Another thing to note is that since this approach does not use Receivers, the standard receiver-related (that is, [configurations](configuration.html) of the form `spark.streaming.receiver.*` ) will not apply to the input DStreams created by this approach (will apply to other input DStreams though). Instead, use the [configurations](configuration.html) `spark.streaming.kafka.*`. An important one is `spark.streaming.kafka.maxRatePerPartition` which is the maximum rate at which each Kafka partition will be read by this direct API. 
 
-3. **Deploying:** Similar to the first approach, you can package `spark-streaming-kafka_{{site.SCALA_BINARY_VERSION}}` and its dependencies into the application JAR and the launch the application using `spark-submit`. Make sure `spark-core_{{site.SCALA_BINARY_VERSION}}` and `spark-streaming_{{site.SCALA_BINARY_VERSION}}` are marked as `provided` dependencies as those are already present in a Spark installation.
+3. **Deploying:** This is same as the first approach, for Scala, Java and Python.
diff --git a/docs/streaming-kinesis-integration.md b/docs/streaming-kinesis-integration.md
index 379eb513d521e..aa9749afbc867 100644
--- a/docs/streaming-kinesis-integration.md
+++ b/docs/streaming-kinesis-integration.md
@@ -32,7 +32,8 @@ A Kinesis stream can be set up at one of the valid Kinesis endpoints with 1 or m
 		import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
 
 		val kinesisStream = KinesisUtils.createStream(
-        	streamingContext, [Kinesis stream name], [endpoint URL], [checkpoint interval], [initial position])
+			streamingContext, [Kinesis app name], [Kinesis stream name], [endpoint URL],
+			[region name], [initial position], [checkpoint interval], StorageLevel.MEMORY_AND_DISK_2)
 
 	See the [API docs](api/scala/index.html#org.apache.spark.streaming.kinesis.KinesisUtils$)
 	and the [example]({{site.SPARK_GITHUB_URL}}/tree/master/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala). Refer to the Running the Example section for instructions on how to run the example.
@@ -44,7 +45,8 @@ A Kinesis stream can be set up at one of the valid Kinesis endpoints with 1 or m
 		import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream;
 
 		JavaReceiverInputDStream<byte[]> kinesisStream = KinesisUtils.createStream(
-        	streamingContext, [Kinesis stream name], [endpoint URL], [checkpoint interval], [initial position]);
+			streamingContext, [Kinesis app name], [Kinesis stream name], [endpoint URL],
+			[region name], [initial position], [checkpoint interval], StorageLevel.MEMORY_AND_DISK_2);
 
 	See the [API docs](api/java/index.html?org/apache/spark/streaming/kinesis/KinesisUtils.html)
 	and the [example]({{site.SPARK_GITHUB_URL}}/tree/master/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java). Refer to the next subsection for instructions to run the example.
@@ -54,19 +56,23 @@ A Kinesis stream can be set up at one of the valid Kinesis endpoints with 1 or m
 
     - `streamingContext`: StreamingContext containg an application name used by Kinesis to tie this Kinesis application to the Kinesis stream
 
-	- `[Kinesis stream name]`: The Kinesis stream that this streaming application receives from
-		- The application name used in the streaming context becomes the Kinesis application name
+	- `[Kineiss app name]`: The application name that will be used to checkpoint the Kinesis
+		sequence numbers in DynamoDB table.
 		- The application name must be unique for a given account and region.
-		- The Kinesis backend automatically associates the application name to the Kinesis stream using a DynamoDB table (always in the us-east-1 region) created during Kinesis Client Library initialization. 
-		- Changing the application name or stream name can lead to Kinesis errors in some cases.  If you see errors, you may need to manually delete the DynamoDB table.
+		- If the table exists but has incorrect checkpoint information (for a different stream, or
+		  old expired sequenced numbers), then there may be temporary errors.
 
+	- `[Kinesis stream name]`: The Kinesis stream that this streaming application will pull data from.
 
 	- `[endpoint URL]`: Valid Kinesis endpoints URL can be found [here](http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region).
 
+	- `[region name]`: Valid Kinesis region names can be found [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html).
+
 	- `[checkpoint interval]`: The interval (e.g., Duration(2000) = 2 seconds) at which the Kinesis Client Library saves its position in the stream.  For starters, set it to the same as the batch interval of the streaming application.
 
 	- `[initial position]`: Can be either `InitialPositionInStream.TRIM_HORIZON` or `InitialPositionInStream.LATEST` (see Kinesis Checkpointing section and Amazon Kinesis API documentation for more details).
 
+	In other versions of the API, you can also specify the AWS access key and secret key directly.
 
 3. **Deploying:** Package `spark-streaming-kinesis-asl_{{site.SCALA_BINARY_VERSION}}` and its dependencies (except `spark-core_{{site.SCALA_BINARY_VERSION}}` and `spark-streaming_{{site.SCALA_BINARY_VERSION}}` which are provided by `spark-submit`) into the application JAR. Then use `spark-submit` to launch your application (see [Deploying section](streaming-programming-guide.html#deploying-applications) in the main programming guide).
 
@@ -122,12 +128,12 @@ To run the example,
 	<div class="codetabs">
 	<div data-lang="scala" markdown="1">
 
-    	bin/run-example streaming.KinesisWordCountASL [Kinesis stream name] [endpoint URL]
+        bin/run-example streaming.KinesisWordCountASL [Kinesis app name] [Kinesis stream name] [endpoint URL]
 
 	</div>
 	<div data-lang="java" markdown="1">
 
-        bin/run-example streaming.JavaKinesisWordCountASL [Kinesis stream name] [endpoint URL]
+        bin/run-example streaming.JavaKinesisWordCountASL [Kinesis app name] [Kinesis stream name] [endpoint URL]
 
 	</div>
 	</div>
@@ -136,7 +142,7 @@ To run the example,
 
 - To generate random string data to put onto the Kinesis stream, in another terminal, run the associated Kinesis data producer.
 
-		bin/run-example streaming.KinesisWordCountProducerASL [Kinesis stream name] [endpoint URL] 1000 10
+		bin/run-example streaming.KinesisWordProducerASL [Kinesis stream name] [endpoint URL] 1000 10
 
 	This will push 1000 lines per second of 10 random numbers per line to the Kinesis stream.  This data should then be received and processed by the running example.
 
diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md
index 42b33947873b0..836f0473597d8 100644
--- a/docs/streaming-programming-guide.md
+++ b/docs/streaming-programming-guide.md
@@ -77,7 +77,7 @@ main entry point for all streaming functionality. We create a local StreamingCon
 {% highlight scala %}
 import org.apache.spark._
 import org.apache.spark.streaming._
-import org.apache.spark.streaming.StreamingContext._ // not necessary in Spark 1.3+
+import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3
 
 // Create a local StreamingContext with two working thread and batch interval of 1 second.
 // The master requires 2 cores to prevent from a starvation scenario.
@@ -109,7 +109,7 @@ each line will be split into multiple words and the stream of words is represent
 `words` DStream.  Next, we want to count these words.
 
 {% highlight scala %}
-import org.apache.spark.streaming.StreamingContext._ // not necessary in Spark 1.3+
+import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3
 // Count each word in each batch
 val pairs = words.map(word => (word, 1))
 val wordCounts = pairs.reduceByKey(_ + _)
@@ -682,7 +682,7 @@ for Java, and [StreamingContext](api/python/pyspark.streaming.html#pyspark.strea
 ### Advanced Sources
 {:.no_toc}
 
-<span class="badge" style="background-color: grey">Python API</span> As of Spark 1.3,
+<span class="badge" style="background-color: grey">Python API</span> As of Spark {{site.SPARK_VERSION_SHORT}},
 out of these sources, *only* Kafka is available in the Python API. We will add more advanced sources in the Python API in future.
 
 This category of sources require interfacing with external non-Spark libraries, some of them with
@@ -723,7 +723,7 @@ and it in the classpath.
 
 Some of these advanced sources are as follows.
 
-- **Kafka:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kafka 0.8.1.1. See the [Kafka Integration Guide](streaming-kafka-integration.html) for more details.
+- **Kafka:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kafka 0.8.2.1. See the [Kafka Integration Guide](streaming-kafka-integration.html) for more details.
 
 - **Flume:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Flume 1.4.0. See the [Flume Integration Guide](streaming-flume-integration.html) for more details.
 
@@ -991,8 +991,9 @@ cleanedDStream = wordCounts.transform(lambda rdd: rdd.join(spamInfoRDD).filter(.
 </div>
 </div>
 
-In fact, you can also use [machine learning](mllib-guide.html) and
-[graph computation](graphx-programming-guide.html) algorithms in the `transform` method.
+Note that the supplied function gets called in every batch interval. This allows you to do
+time-varying RDD operations, that is, RDD operations, number of partitions, broadcast variables,
+etc. can be changed between batches.
 
 #### Window Operations
 {:.no_toc}
@@ -1427,26 +1428,6 @@ You can easily use [DataFrames and SQL](sql-programming-guide.html) operations o
 <div data-lang="scala" markdown="1">
 {% highlight scala %}
 
-/** Lazily instantiated singleton instance of SQLContext */
-object SQLContextSingleton {
-  @transient private var instance: SQLContext = null
-
-  // Instantiate SQLContext on demand
-  def getInstance(sparkContext: SparkContext): SQLContext = synchronized {
-    if (instance == null) {
-      instance = new SQLContext(sparkContext)
-    }
-    instance
-  }
-}
-
-...
-
-/** Case class for converting RDD to DataFrame */
-case class Row(word: String)
-
-...
-
 /** DataFrame operations inside your streaming program */
 
 val words: DStream[String] = ...
@@ -1454,11 +1435,11 @@ val words: DStream[String] = ...
 words.foreachRDD { rdd =>
 
   // Get the singleton instance of SQLContext
-  val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
+  val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
   import sqlContext.implicits._
 
-  // Convert RDD[String] to RDD[case class] to DataFrame
-  val wordsDataFrame = rdd.map(w => Row(w)).toDF()
+  // Convert RDD[String] to DataFrame
+  val wordsDataFrame = rdd.toDF("word")
 
   // Register as table
   wordsDataFrame.registerTempTable("words")
@@ -1476,19 +1457,6 @@ See the full [source code]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/ma
 <div data-lang="java" markdown="1">
 {% highlight java %}
 
-/** Lazily instantiated singleton instance of SQLContext */
-class JavaSQLContextSingleton {
-  static private transient SQLContext instance = null;
-  static public SQLContext getInstance(SparkContext sparkContext) {
-    if (instance == null) {
-      instance = new SQLContext(sparkContext);
-    }
-    return instance;
-  }
-}
-
-...
-
 /** Java Bean class for converting RDD to DataFrame */
 public class JavaRow implements java.io.Serializable {
   private String word;
@@ -1512,7 +1480,9 @@ words.foreachRDD(
   new Function2<JavaRDD<String>, Time, Void>() {
     @Override
     public Void call(JavaRDD<String> rdd, Time time) {
-      SQLContext sqlContext = JavaSQLContextSingleton.getInstance(rdd.context());
+
+      // Get the singleton instance of SQLContext
+      SQLContext sqlContext = SQLContext.getOrCreate(rdd.context());
 
       // Convert RDD[String] to RDD[case class] to DataFrame
       JavaRDD<JavaRow> rowRDD = rdd.map(new Function<String, JavaRow>() {
@@ -2234,7 +2204,7 @@ The following table summarizes the semantics under failures:
 
 ### With Kafka Direct API
 {:.no_toc}
-In Spark 1.3, we have introduced a new Kafka Direct API, which can ensure that all the Kafka data is received by Spark Streaming exactly once. Along with this, if you implement exactly-once output operation, you can achieve end-to-end exactly-once guarantees. This approach (experimental as of Spark 1.3) is further discussed in the [Kafka Integration Guide](streaming-kafka-integration.html).
+In Spark 1.3, we have introduced a new Kafka Direct API, which can ensure that all the Kafka data is received by Spark Streaming exactly once. Along with this, if you implement exactly-once output operation, you can achieve end-to-end exactly-once guarantees. This approach (experimental as of Spark {{site.SPARK_VERSION_SHORT}}) is further discussed in the [Kafka Integration Guide](streaming-kafka-integration.html).
 
 ## Semantics of output operations
 {:.no_toc}
@@ -2248,8 +2218,16 @@ additional effort may be necessary to achieve exactly-once semantics. There are
 
 - *Transactional updates*: All updates are made transactionally so that updates are made exactly once atomically. One way to do this would be the following.
 
-    - Use the batch time (available in `foreachRDD`) and the partition index of the transformed RDD to create an identifier. This identifier uniquely identifies a blob data in the streaming application.
-    - Update external system with this blob transactionally (that is, exactly once, atomically) using the identifier. That is, if the identifier is not already committed, commit the partition data and the identifier atomically. Else if this was already committed, skip the update. 
+    - Use the batch time (available in `foreachRDD`) and the partition index of the RDD to create an identifier. This identifier uniquely identifies a blob data in the streaming application.
+    - Update external system with this blob transactionally (that is, exactly once, atomically) using the identifier. That is, if the identifier is not already committed, commit the partition data and the identifier atomically. Else if this was already committed, skip the update.
+
+          dstream.foreachRDD { (rdd, time) =>
+            rdd.foreachPartition { partitionIterator =>
+              val partitionId = TaskContext.get.partitionId()
+              val uniqueId = generateUniqueId(time.milliseconds, partitionId)
+              // use this uniqueId to transactionally commit the data in partitionIterator
+            }
+          }
 
 
 ***************************************************************************************************

From 6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34 Mon Sep 17 00:00:00 2001
From: Roger Menezes <rmenezes@netflix.com>
Date: Fri, 12 Jun 2015 18:29:58 -0700
Subject: [PATCH 470/525] [SPARK-8314][MLlib] improvement in performance of
 MLUtils.appendBias

MLUtils.appendBias method is heavily used in creating intercepts for linear models.
This method uses Breeze's vector concatenation which is very slow compared to the plain
System.arrayCopy. This improvement is to change the implementation to use System.arrayCopy.

I saw the following performance improvements after the change:
Benchmark with mnist dataset for 50 times:
MLUtils.appendBias (SparseVector Before): 47320 ms
MLUtils.appendBias (SparseVector After): 1935 ms
MLUtils.appendBias (DenseVector Before): 5340 ms
MLUtils.appendBias (DenseVector After): 4080 ms
This is almost a 24 times performance boost for SparseVectors.

Author: Roger Menezes <rmenezes@netflix.com>

Closes #6768 from rogermenezes/improve-append-bias and squashes the following commits:

4e42f75 [Roger Menezes] address feedback
e999d79 [Roger Menezes] first commit
---
 .../ml/LogisticRegressionExample.scala        |  4 +--
 .../org/apache/spark/mllib/linalg/BLAS.scala  |  4 +--
 .../org/apache/spark/mllib/util/MLUtils.scala | 26 +++++++++++++++----
 3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
index b12f833ce94c8..3cf193f353fbc 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -145,9 +145,9 @@ object LogisticRegressionExample {
     val elapsedTime = (System.nanoTime() - startTime) / 1e9
     println(s"Training time: $elapsedTime seconds")
 
-    val lirModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
+    val lorModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
     // Print the weights and intercept for logistic regression.
-    println(s"Weights: ${lirModel.weights} Intercept: ${lirModel.intercept}")
+    println(s"Weights: ${lorModel.weights} Intercept: ${lorModel.intercept}")
 
     println("Training data results:")
     DecisionTreeExample.evaluateClassificationModel(pipelineModel, training, "indexedLabel")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index 557119f7b1cd1..3523f1804325d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -213,9 +213,9 @@ private[spark] object BLAS extends Serializable with Logging {
   def scal(a: Double, x: Vector): Unit = {
     x match {
       case sx: SparseVector =>
-        f2jBLAS.dscal(sx.values.size, a, sx.values, 1)
+        f2jBLAS.dscal(sx.values.length, a, sx.values, 1)
       case dx: DenseVector =>
-        f2jBLAS.dscal(dx.values.size, a, dx.values, 1)
+        f2jBLAS.dscal(dx.values.length, a, dx.values, 1)
       case _ =>
         throw new IllegalArgumentException(s"scal doesn't support vector type ${x.getClass}.")
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 52d6468a72af7..7c5cfa7bd84ce 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -270,12 +270,28 @@ object MLUtils {
    * Returns a new vector with `1.0` (bias) appended to the input vector.
    */
   def appendBias(vector: Vector): Vector = {
-    val vector1 = vector.toBreeze match {
-      case dv: BDV[Double] => BDV.vertcat(dv, new BDV[Double](Array(1.0)))
-      case sv: BSV[Double] => BSV.vertcat(sv, new BSV[Double](Array(0), Array(1.0), 1))
-      case v: Any => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
+    vector match {
+      case dv: DenseVector =>
+        val inputValues = dv.values
+        val inputLength = inputValues.length
+        val outputValues = Array.ofDim[Double](inputLength + 1)
+        System.arraycopy(inputValues, 0, outputValues, 0, inputLength)
+        outputValues(inputLength) = 1.0
+        Vectors.dense(outputValues)
+      case sv: SparseVector =>
+        val inputValues = sv.values
+        val inputIndices = sv.indices
+        val inputValuesLength = inputValues.length
+        val dim = sv.size
+        val outputValues = Array.ofDim[Double](inputValuesLength + 1)
+        val outputIndices = Array.ofDim[Int](inputValuesLength + 1)
+        System.arraycopy(inputValues, 0, outputValues, 0, inputValuesLength)
+        System.arraycopy(inputIndices, 0, outputIndices, 0, inputValuesLength)
+        outputValues(inputValuesLength) = 1.0
+        outputIndices(inputValuesLength) = dim
+        Vectors.sparse(dim + 1, outputIndices, outputValues)
+      case _ => throw new IllegalArgumentException(s"Do not support vector type ${vector.getClass}")
     }
-    Vectors.fromBreeze(vector1)
   }
 
   /**

From d46f8e5d4b5c1278e0fae3ad133b2229ac01b197 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 12 Jun 2015 23:06:31 -0700
Subject: [PATCH 471/525] [SPARK-7186] [SQL] Decouple internal Row from
 external Row

Currently, we use o.a.s.sql.Row both internally and externally. The external interface is wider than what the internal needs because it is designed to facilitate end-user programming. This design has proven to be very error prone and cumbersome for internal Row implementations.

As a first step, we create an InternalRow interface in the catalyst module, which is identical to the current Row interface. And we switch all internal operators/expressions to use this InternalRow instead. When we need to expose Row, we convert the InternalRow implementation into Row for users.

For all public API, we use Row (for example, data source APIs), which will be converted into/from InternalRow by CatalystTypeConverters.

For all internal data sources (Json, Parquet, JDBC, Hive), we use InternalRow for better performance, casted into Row in buildScan() (without change the public API). When create a PhysicalRDD, we cast them back to InternalRow.

cc rxin marmbrus JoshRosen

Author: Davies Liu <davies@databricks.com>

Closes #6792 from davies/internal_row and squashes the following commits:

f2abd13 [Davies Liu] fix scalastyle
a7e025c [Davies Liu] move InternalRow into catalyst
30db8ba [Davies Liu] Merge branch 'master' of github.com:apache/spark into internal_row
7cbced8 [Davies Liu] separate Row and InternalRow
---
 .../org/apache/spark/sql/BaseMutableRow.java  |   0
 .../org/apache/spark/sql/BaseRow.java         |   5 +-
 .../UnsafeFixedWidthAggregationMap.java       |   8 +-
 .../sql/catalyst/expressions/UnsafeRow.java   |   4 +-
 .../sql/catalyst/CatalystTypeConverters.scala |  54 +++++----
 .../spark/sql/catalyst/InternalRow.scala      |  57 +++++++++
 .../sql/catalyst/analysis/unresolved.scala    |  11 +-
 .../catalyst/expressions/BoundAttribute.scala |   4 +-
 .../spark/sql/catalyst/expressions/Cast.scala |   5 +-
 .../sql/catalyst/expressions/Expression.scala |   4 +-
 .../catalyst/expressions/ExtractValue.scala   |  12 +-
 .../sql/catalyst/expressions/Projection.scala | 104 ++++++++--------
 .../sql/catalyst/expressions/ScalaUdf.scala   |  51 ++++----
 .../sql/catalyst/expressions/SortOrder.scala  |   4 +-
 .../expressions/SpecificMutableRow.scala      |   2 +-
 .../expressions/UnsafeRowConverter.scala      |  66 ++++++++---
 .../sql/catalyst/expressions/aggregates.scala |  73 ++++++------
 .../sql/catalyst/expressions/arithmetic.scala |  15 +--
 .../expressions/codegen/CodeGenerator.scala   |   7 +-
 .../codegen/GenerateMutableProjection.scala   |  11 +-
 .../codegen/GenerateOrdering.scala            |  17 +--
 .../codegen/GeneratePredicate.scala           |  15 +--
 .../codegen/GenerateProjection.scala          |  13 +-
 .../catalyst/expressions/complexTypes.scala   |   7 +-
 .../catalyst/expressions/conditionals.scala   |   7 +-
 .../expressions/decimalFunctions.scala        |   7 +-
 .../sql/catalyst/expressions/generators.scala |  15 +--
 .../sql/catalyst/expressions/literals.scala   |   7 +-
 .../spark/sql/catalyst/expressions/math.scala |   9 +-
 .../expressions/namedExpressions.scala        |   7 +-
 .../catalyst/expressions/nullFunctions.scala  |   9 +-
 .../sql/catalyst/expressions/package.scala    |  28 +++--
 .../sql/catalyst/expressions/predicates.scala |  20 ++--
 .../sql/catalyst/expressions/random.scala     |   4 +-
 .../spark/sql/catalyst/expressions/rows.scala |  23 ++--
 .../spark/sql/catalyst/expressions/sets.scala |  10 +-
 .../expressions/stringOperations.scala        |  12 +-
 .../expressions/windowExpressions.scala       |  12 +-
 .../plans/logical/LocalRelation.scala         |   9 +-
 .../plans/physical/partitioning.scala         |   7 +-
 .../sql/catalyst/ScalaReflectionSuite.scala   |   8 +-
 .../sql/catalyst/expressions/CastSuite.scala  |  22 ++--
 .../expressions/ComplexTypeSuite.scala        |   4 +-
 .../expressions/ExpressionEvalHelper.scala    |  16 +--
 .../UnsafeFixedWidthAggregationMapSuite.scala |   2 +-
 .../expressions/UnsafeRowConverterSuite.scala |   4 +-
 .../ConvertToLocalRelationSuite.scala         |  10 +-
 .../sql/catalyst/trees/TreeNodeSuite.scala    |   2 +-
 .../sql/catalyst/util/DateUtilsSuite.scala    |   1 -
 .../org/apache/spark/sql/DataFrame.scala      |  18 +--
 .../apache/spark/sql/DataFrameReader.scala    |   2 +-
 .../org/apache/spark/sql/SQLContext.scala     |  26 ++--
 .../spark/sql/columnar/ColumnBuilder.scala    |   7 +-
 .../spark/sql/columnar/ColumnStats.scala      |  65 +++++-----
 .../columnar/InMemoryColumnarTableScan.scala  |  27 ++---
 .../sql/columnar/NullableColumnBuilder.scala  |   4 +-
 .../CompressibleColumnBuilder.scala           |   6 +-
 .../compression/CompressionScheme.scala       |   5 +-
 .../compression/compressionSchemes.scala      |  13 +-
 .../spark/sql/execution/Aggregate.scala       |  14 +--
 .../apache/spark/sql/execution/Exchange.scala |  24 ++--
 .../spark/sql/execution/ExistingRDD.scala     |  18 +--
 .../apache/spark/sql/execution/Expand.scala   |  13 +-
 .../apache/spark/sql/execution/Generate.scala |  10 +-
 .../sql/execution/GeneratedAggregate.scala    |  20 ++--
 .../spark/sql/execution/LocalTableScan.scala  |   8 +-
 .../spark/sql/execution/SparkPlan.scala       |  20 ++--
 .../spark/sql/execution/SparkStrategies.scala |   2 +-
 .../apache/spark/sql/execution/Window.scala   |  21 ++--
 .../spark/sql/execution/basicOperators.scala  |  46 ++++----
 .../apache/spark/sql/execution/commands.scala |  10 +-
 .../spark/sql/execution/debug/package.scala   |  13 +-
 .../MonotonicallyIncreasingID.scala           |   4 +-
 .../expressions/SparkPartitionID.scala        |   4 +-
 .../execution/joins/BroadcastHashJoin.scala   |  11 +-
 .../joins/BroadcastLeftSemiJoinHash.scala     |   8 +-
 .../joins/BroadcastNestedLoopJoin.scala       |  11 +-
 .../execution/joins/CartesianProduct.scala    |   4 +-
 .../spark/sql/execution/joins/HashJoin.scala  |  12 +-
 .../sql/execution/joins/HashOuterJoin.scala   |  43 ++++---
 .../sql/execution/joins/HashedRelation.scala  |  25 ++--
 .../sql/execution/joins/LeftSemiJoinBNL.scala |   2 +-
 .../execution/joins/LeftSemiJoinHash.scala    |   8 +-
 .../execution/joins/ShuffledHashJoin.scala    |   4 +-
 .../sql/execution/joins/SortMergeJoin.scala   |  22 ++--
 .../spark/sql/execution/pythonUdfs.scala      |   9 +-
 .../sql/execution/stat/FrequentItems.scala    |   5 +-
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   |  18 +--
 .../apache/spark/sql/jdbc/JDBCRelation.scala  |   5 +-
 .../apache/spark/sql/json/JSONRelation.scala  |  14 +--
 .../spark/sql/json/JacksonGenerator.scala     |   2 +-
 .../apache/spark/sql/json/JacksonParser.scala |  13 +-
 .../org/apache/spark/sql/json/JsonRDD.scala   |   4 +-
 .../spark/sql/parquet/ParquetConverter.scala  |  16 +--
 .../sql/parquet/ParquetTableOperations.scala  |  38 +++---
 .../sql/parquet/ParquetTableSupport.scala     |  18 +--
 .../apache/spark/sql/parquet/newParquet.scala |  14 +--
 .../sql/sources/DataSourceStrategy.scala      | 111 ++++++++++--------
 .../spark/sql/sources/PartitioningUtils.scala |   7 +-
 .../apache/spark/sql/sources/commands.scala   |  74 +++++-------
 .../org/apache/spark/sql/sources/ddl.scala    |  10 +-
 .../apache/spark/sql/sources/interfaces.scala |  19 ++-
 .../apache/spark/sql/CachedTableSuite.scala   |   2 +-
 .../spark/sql/columnar/ColumnStatsSuite.scala |  27 +++--
 .../sql/columnar/ColumnarTestUtils.scala      |   9 +-
 .../columnar/InMemoryColumnarQuerySuite.scala |   3 +-
 .../compression/BooleanBitSetSuite.scala      |   6 +-
 .../spark/sql/execution/PlannerSuite.scala    |   5 +-
 .../execution/joins/HashedRelationSuite.scala |  26 ++--
 .../sql/parquet/ParquetFilterSuite.scala      |   2 +-
 .../spark/sql/parquet/ParquetIOSuite.scala    |   3 +-
 .../ParquetPartitionDiscoverySuite.scala      |  57 ++++++---
 .../spark/sql/parquet/ParquetQuerySuite.scala |   4 +-
 .../spark/sql/sources/DDLTestSuite.scala      |   7 +-
 .../spark/sql/sources/TableScanSuite.scala    |  27 +++--
 .../spark/sql/hive/HiveInspectors.scala       |   8 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala |   2 +-
 .../spark/sql/hive/HiveStrategies.scala       |   6 +-
 .../apache/spark/sql/hive/TableReader.scala   |  18 +--
 .../hive/execution/CreateTableAsSelect.scala  |   6 +-
 .../execution/DescribeHiveTableCommand.scala  |  12 +-
 .../hive/execution/HiveNativeCommand.scala    |  68 +++++------
 .../sql/hive/execution/HiveTableScan.scala    |   2 +-
 .../hive/execution/InsertIntoHiveTable.scala  |  16 +--
 .../hive/execution/ScriptTransformation.scala |  10 +-
 .../spark/sql/hive/execution/commands.scala   |  32 ++---
 .../org/apache/spark/sql/hive/hiveUdfs.scala  |  24 ++--
 .../spark/sql/hive/HiveInspectorSuite.scala   |   9 +-
 .../spark/sql/hive/HiveParquetSuite.scala     |   3 +-
 .../hive/orc/OrcPartitionDiscoverySuite.scala |   2 +-
 .../spark/sql/hive/orc/OrcQuerySuite.scala    |   2 +-
 .../apache/spark/sql/hive/parquetSuites.scala |   4 +-
 132 files changed, 1160 insertions(+), 973 deletions(-)
 rename sql/catalyst/src/main/{scala => java}/org/apache/spark/sql/BaseMutableRow.java (100%)
 rename sql/catalyst/src/main/{scala => java}/org/apache/spark/sql/BaseRow.java (97%)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseMutableRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseMutableRow.java
similarity index 100%
rename from sql/catalyst/src/main/scala/org/apache/spark/sql/BaseMutableRow.java
rename to sql/catalyst/src/main/java/org/apache/spark/sql/BaseMutableRow.java
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
similarity index 97%
rename from sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
rename to sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
index e91daf17f8085..611e02d8fb666 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/BaseRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
@@ -25,10 +25,11 @@
 import scala.collection.Seq;
 import scala.collection.mutable.ArraySeq;
 
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericRow;
 import org.apache.spark.sql.types.StructType;
 
-public abstract class BaseRow implements Row {
+public abstract class BaseRow extends InternalRow {
 
   @Override
   final public int length() {
@@ -176,7 +177,7 @@ public boolean equals(Object other) {
   }
 
   @Override
-  public Row copy() {
+  public InternalRow copy() {
     final int n = size();
     Object[] arr = new Object[n];
     for (int i = 0; i < n; i++) {
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
index 299ff3728a6d9..b23e0efc83332 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
@@ -20,7 +20,7 @@
 import java.util.Arrays;
 import java.util.Iterator;
 
-import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.unsafe.PlatformDependent;
@@ -107,7 +107,7 @@ public static boolean supportsAggregationBufferSchema(StructType schema) {
    * @param enablePerfMetrics if true, performance metrics will be recorded (has minor perf impact)
    */
   public UnsafeFixedWidthAggregationMap(
-      Row emptyAggregationBuffer,
+      InternalRow emptyAggregationBuffer,
       StructType aggregationBufferSchema,
       StructType groupingKeySchema,
       TaskMemoryManager memoryManager,
@@ -125,7 +125,7 @@ public UnsafeFixedWidthAggregationMap(
   /**
    * Convert a Java object row into an UnsafeRow, allocating it into a new long array.
    */
-  private static long[] convertToUnsafeRow(Row javaRow, StructType schema) {
+  private static long[] convertToUnsafeRow(InternalRow javaRow, StructType schema) {
     final UnsafeRowConverter converter = new UnsafeRowConverter(schema);
     final long[] unsafeRow = new long[converter.getSizeRequirement(javaRow)];
     final long writtenLength =
@@ -138,7 +138,7 @@ private static long[] convertToUnsafeRow(Row javaRow, StructType schema) {
    * Return the aggregation buffer for the current group. For efficiency, all calls to this method
    * return the same object.
    */
-  public UnsafeRow getAggregationBuffer(Row groupingKey) {
+  public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
     final int groupingKeySize = groupingKeyToUnsafeRowConverter.getSizeRequirement(groupingKey);
     // Make sure that the buffer is large enough to hold the key. If it's not, grow it:
     if (groupingKeySize > groupingKeyConversionScratchSpace.length) {
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 143acc9f5e36f..aec88c9241d92 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -26,7 +26,7 @@
 import scala.collection.Seq;
 import scala.collection.mutable.ArraySeq;
 
-import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.BaseMutableRow;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.StructType;
@@ -334,7 +334,7 @@ public String getString(int i) {
 
 
   @Override
-  public Row copy() {
+  public InternalRow copy() {
     throw new UnsupportedOperationException();
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 7e4b11a4951b8..6175456c58214 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -27,6 +27,7 @@ import scala.collection.mutable.HashMap
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateUtils
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -105,7 +106,7 @@ object CatalystTypeConverters {
     /**
      * Given a Catalyst row, convert the value at column `column` to its Scala equivalent.
      */
-    final def toScala(row: Row, column: Int): ScalaOutputType = {
+    final def toScala(row: InternalRow, column: Int): ScalaOutputType = {
       if (row.isNullAt(column)) null.asInstanceOf[ScalaOutputType] else toScalaImpl(row, column)
     }
 
@@ -125,20 +126,20 @@ object CatalystTypeConverters {
      * Given a Catalyst row, convert the value at column `column` to its Scala equivalent.
      * This method will only be called on non-null columns.
      */
-    protected def toScalaImpl(row: Row, column: Int): ScalaOutputType
+    protected def toScalaImpl(row: InternalRow, column: Int): ScalaOutputType
   }
 
   private object IdentityConverter extends CatalystTypeConverter[Any, Any, Any] {
     override def toCatalystImpl(scalaValue: Any): Any = scalaValue
     override def toScala(catalystValue: Any): Any = catalystValue
-    override def toScalaImpl(row: Row, column: Int): Any = row(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Any = row(column)
   }
 
   private case class UDTConverter(
       udt: UserDefinedType[_]) extends CatalystTypeConverter[Any, Any, Any] {
     override def toCatalystImpl(scalaValue: Any): Any = udt.serialize(scalaValue)
     override def toScala(catalystValue: Any): Any = udt.deserialize(catalystValue)
-    override def toScalaImpl(row: Row, column: Int): Any = toScala(row(column))
+    override def toScalaImpl(row: InternalRow, column: Int): Any = toScala(row(column))
   }
 
   /** Converter for arrays, sequences, and Java iterables. */
@@ -170,7 +171,7 @@ object CatalystTypeConverters {
       }
     }
 
-    override def toScalaImpl(row: Row, column: Int): Seq[Any] =
+    override def toScalaImpl(row: InternalRow, column: Int): Seq[Any] =
       toScala(row(column).asInstanceOf[Seq[Any]])
   }
 
@@ -209,16 +210,16 @@ object CatalystTypeConverters {
       }
     }
 
-    override def toScalaImpl(row: Row, column: Int): Map[Any, Any] =
+    override def toScalaImpl(row: InternalRow, column: Int): Map[Any, Any] =
       toScala(row(column).asInstanceOf[Map[Any, Any]])
   }
 
   private case class StructConverter(
-      structType: StructType) extends CatalystTypeConverter[Any, Row, Row] {
+      structType: StructType) extends CatalystTypeConverter[Any, Row, InternalRow] {
 
     private[this] val converters = structType.fields.map { f => getConverterForType(f.dataType) }
 
-    override def toCatalystImpl(scalaValue: Any): Row = scalaValue match {
+    override def toCatalystImpl(scalaValue: Any): InternalRow = scalaValue match {
       case row: Row =>
         val ar = new Array[Any](row.size)
         var idx = 0
@@ -239,7 +240,7 @@ object CatalystTypeConverters {
         new GenericRowWithSchema(ar, structType)
     }
 
-    override def toScala(row: Row): Row = {
+    override def toScala(row: InternalRow): Row = {
       if (row == null) {
         null
       } else {
@@ -253,7 +254,8 @@ object CatalystTypeConverters {
       }
     }
 
-    override def toScalaImpl(row: Row, column: Int): Row = toScala(row(column).asInstanceOf[Row])
+    override def toScalaImpl(row: InternalRow, column: Int): Row =
+      toScala(row(column).asInstanceOf[InternalRow])
   }
 
   private object StringConverter extends CatalystTypeConverter[Any, String, Any] {
@@ -266,14 +268,14 @@ object CatalystTypeConverters {
       case str: String => str
       case utf8: UTF8String => utf8.toString()
     }
-    override def toScalaImpl(row: Row, column: Int): String = row(column).toString
+    override def toScalaImpl(row: InternalRow, column: Int): String = row(column).toString
   }
 
   private object DateConverter extends CatalystTypeConverter[Date, Date, Any] {
     override def toCatalystImpl(scalaValue: Date): Int = DateUtils.fromJavaDate(scalaValue)
     override def toScala(catalystValue: Any): Date =
       if (catalystValue == null) null else DateUtils.toJavaDate(catalystValue.asInstanceOf[Int])
-    override def toScalaImpl(row: Row, column: Int): Date = toScala(row.getInt(column))
+    override def toScalaImpl(row: InternalRow, column: Int): Date = toScala(row.getInt(column))
   }
 
   private object TimestampConverter extends CatalystTypeConverter[Timestamp, Timestamp, Any] {
@@ -282,7 +284,8 @@ object CatalystTypeConverters {
     override def toScala(catalystValue: Any): Timestamp =
       if (catalystValue == null) null
       else DateUtils.toJavaTimestamp(catalystValue.asInstanceOf[Long])
-    override def toScalaImpl(row: Row, column: Int): Timestamp = toScala(row.getLong(column))
+    override def toScalaImpl(row: InternalRow, column: Int): Timestamp =
+      toScala(row.getLong(column))
   }
 
   private object BigDecimalConverter extends CatalystTypeConverter[Any, JavaBigDecimal, Decimal] {
@@ -292,10 +295,11 @@ object CatalystTypeConverters {
       case d: Decimal => d
     }
     override def toScala(catalystValue: Decimal): JavaBigDecimal = catalystValue.toJavaBigDecimal
-    override def toScalaImpl(row: Row, column: Int): JavaBigDecimal = row.get(column) match {
-      case d: JavaBigDecimal => d
-      case d: Decimal => d.toJavaBigDecimal
-    }
+    override def toScalaImpl(row: InternalRow, column: Int): JavaBigDecimal =
+      row.get(column) match {
+        case d: JavaBigDecimal => d
+        case d: Decimal => d.toJavaBigDecimal
+      }
   }
 
   private abstract class PrimitiveConverter[T] extends CatalystTypeConverter[T, Any, Any] {
@@ -304,31 +308,31 @@ object CatalystTypeConverters {
   }
 
   private object BooleanConverter extends PrimitiveConverter[Boolean] {
-    override def toScalaImpl(row: Row, column: Int): Boolean = row.getBoolean(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Boolean = row.getBoolean(column)
   }
 
   private object ByteConverter extends PrimitiveConverter[Byte] {
-    override def toScalaImpl(row: Row, column: Int): Byte = row.getByte(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Byte = row.getByte(column)
   }
 
   private object ShortConverter extends PrimitiveConverter[Short] {
-    override def toScalaImpl(row: Row, column: Int): Short = row.getShort(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Short = row.getShort(column)
   }
 
   private object IntConverter extends PrimitiveConverter[Int] {
-    override def toScalaImpl(row: Row, column: Int): Int = row.getInt(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Int = row.getInt(column)
   }
 
   private object LongConverter extends PrimitiveConverter[Long] {
-    override def toScalaImpl(row: Row, column: Int): Long = row.getLong(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Long = row.getLong(column)
   }
 
   private object FloatConverter extends PrimitiveConverter[Float] {
-    override def toScalaImpl(row: Row, column: Int): Float = row.getFloat(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Float = row.getFloat(column)
   }
 
   private object DoubleConverter extends PrimitiveConverter[Double] {
-    override def toScalaImpl(row: Row, column: Int): Double = row.getDouble(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Double = row.getDouble(column)
   }
 
   /**
@@ -382,7 +386,7 @@ object CatalystTypeConverters {
     case d: BigDecimal => BigDecimalConverter.toCatalyst(d)
     case d: JavaBigDecimal => BigDecimalConverter.toCatalyst(d)
     case seq: Seq[Any] => seq.map(convertToCatalyst)
-    case r: Row => Row(r.toSeq.map(convertToCatalyst): _*)
+    case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*)
     case arr: Array[Any] => arr.toSeq.map(convertToCatalyst).toArray
     case m: Map[Any, Any] =>
       m.map { case (k, v) => (convertToCatalyst(k), convertToCatalyst(v)) }.toMap
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
new file mode 100644
index 0000000000000..e3c2cc243310b
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.expressions.GenericRow
+
+/**
+ * An abstract class for row used internal in Spark SQL, which only contain the columns as
+ * internal types.
+ */
+abstract class InternalRow extends Row {
+  // A default implementation to change the return type
+  override def copy(): InternalRow = {this}
+}
+
+object InternalRow {
+  def unapplySeq(row: InternalRow): Some[Seq[Any]] = Some(row.toSeq)
+
+  /**
+   * This method can be used to construct a [[Row]] with the given values.
+   */
+  def apply(values: Any*): InternalRow = new GenericRow(values.toArray)
+
+  /**
+   * This method can be used to construct a [[Row]] from a [[Seq]] of values.
+   */
+  def fromSeq(values: Seq[Any]): InternalRow = new GenericRow(values.toArray)
+
+  def fromTuple(tuple: Product): InternalRow = fromSeq(tuple.productIterator.toSeq)
+
+  /**
+   * Merge multiple rows into a single row, one after another.
+   */
+  def merge(rows: InternalRow*): InternalRow = {
+    // TODO: Improve the performance of this if used in performance critical part.
+    new GenericRow(rows.flatMap(_.toSeq).toArray)
+  }
+
+  /** Returns an empty row. */
+  val empty = apply()
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index bbb150c1e83c7..5de188d418924 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.{errors, trees}
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
@@ -67,7 +68,7 @@ case class UnresolvedAttribute(nameParts: Seq[String])
   override def withName(newName: String): UnresolvedAttribute = UnresolvedAttribute.quoted(newName)
 
   // Unresolved attributes are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): Any =
+  override def eval(input: catalyst.InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"'$name"
@@ -85,7 +86,7 @@ case class UnresolvedFunction(name: String, children: Seq[Expression]) extends E
   override lazy val resolved = false
 
   // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): Any =
+  override def eval(input: catalyst.InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"'$name(${children.mkString(",")})"
@@ -107,7 +108,7 @@ trait Star extends NamedExpression with trees.LeafNode[Expression] {
   override lazy val resolved = false
 
   // Star gets expanded at runtime so we never evaluate a Star.
-  override def eval(input: Row = null): Any =
+  override def eval(input: catalyst.InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression]
@@ -166,7 +167,7 @@ case class MultiAlias(child: Expression, names: Seq[String])
 
   override lazy val resolved = false
 
-  override def eval(input: Row = null): Any =
+  override def eval(input: catalyst.InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$child AS $names"
@@ -200,7 +201,7 @@ case class UnresolvedExtractValue(child: Expression, extraction: Expression)
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
 
-  override def eval(input: Row = null): Any =
+  override def eval(input: catalyst.InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$child[$extraction]"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index fcadf9595e768..c4dd11a4518cd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -21,7 +21,7 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.errors.attachTree
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.catalyst.trees
+import org.apache.spark.sql.catalyst.{InternalRow, trees}
 
 /**
  * A bound reference points to a specific slot in the input tuple, allowing the actual value
@@ -33,7 +33,7 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
 
   override def toString: String = s"input[$ordinal]"
 
-  override def eval(input: Row): Any = input(ordinal)
+  override def eval(input: InternalRow): Any = input(ordinal)
 
   override def name: String = s"i[$ordinal]"
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 4c7123fcb7fcc..afbf30af332d8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -21,6 +21,7 @@ import java.sql.{Date, Timestamp}
 import java.text.{DateFormat, SimpleDateFormat}
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
@@ -393,7 +394,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     }
     // TODO: Could be faster?
     val newRow = new GenericMutableRow(from.fields.size)
-    buildCast[Row](_, row => {
+    buildCast[catalyst.InternalRow](_, row => {
       var i = 0
       while (i < row.length) {
         val v = row(i)
@@ -425,7 +426,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
   private[this] lazy val cast: Any => Any = cast(child.dataType, dataType)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evaluated = child.eval(input)
     if (evaluated == null) null else cast(evaluated)
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 0b9f621fed7cf..61de34bfa4b3b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedAttribute}
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types._
@@ -59,7 +59,7 @@ abstract class Expression extends TreeNode[Expression] {
   def references: AttributeSet = AttributeSet(children.flatMap(_.references.iterator))
 
   /** Returns the result of evaluating this expression on a given input Row */
-  def eval(input: Row = null): Any
+  def eval(input: InternalRow = null): Any
 
   /**
    * Returns an [[GeneratedExpressionCode]], which contains Java source code that
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
index a1e0819e8a433..16f3ccc3d6b88 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import scala.collection.Map
 
-import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.{catalyst, AnalysisException}
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.types._
 
@@ -105,8 +105,8 @@ case class GetStructField(child: Expression, field: StructField, ordinal: Int)
   override def foldable: Boolean = child.foldable
   override def toString: String = s"$child.${field.name}"
 
-  override def eval(input: Row): Any = {
-    val baseValue = child.eval(input).asInstanceOf[Row]
+  override def eval(input: catalyst.InternalRow): Any = {
+    val baseValue = child.eval(input).asInstanceOf[catalyst.InternalRow]
     if (baseValue == null) null else baseValue(ordinal)
   }
 }
@@ -125,8 +125,8 @@ case class GetArrayStructFields(
   override def foldable: Boolean = child.foldable
   override def toString: String = s"$child.${field.name}"
 
-  override def eval(input: Row): Any = {
-    val baseValue = child.eval(input).asInstanceOf[Seq[Row]]
+  override def eval(input: catalyst.InternalRow): Any = {
+    val baseValue = child.eval(input).asInstanceOf[Seq[catalyst.InternalRow]]
     if (baseValue == null) null else {
       baseValue.map { row =>
         if (row == null) null else row(ordinal)
@@ -146,7 +146,7 @@ abstract class ExtractValueWithOrdinal extends ExtractValue {
   override def toString: String = s"$child[$ordinal]"
   override def children: Seq[Expression] = child :: ordinal :: Nil
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val value = child.eval(input)
     if (value == null) {
       null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index 8cae548279eb1..d6806f78ab3fd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst
+
 
 /**
  * A [[Projection]] that is calculated by calling the `eval` of each of the specified expressions.
@@ -30,7 +32,7 @@ class InterpretedProjection(expressions: Seq[Expression]) extends Projection {
   // null check is required for when Kryo invokes the no-arg constructor.
   protected val exprArray = if (expressions != null) expressions.toArray else null
 
-  def apply(input: Row): Row = {
+  def apply(input: catalyst.InternalRow): catalyst.InternalRow = {
     val outputArray = new Array[Any](exprArray.length)
     var i = 0
     while (i < exprArray.length) {
@@ -55,14 +57,14 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu
 
   private[this] val exprArray = expressions.toArray
   private[this] var mutableRow: MutableRow = new GenericMutableRow(exprArray.size)
-  def currentValue: Row = mutableRow
+  def currentValue: catalyst.InternalRow = mutableRow
 
   override def target(row: MutableRow): MutableProjection = {
     mutableRow = row
     this
   }
 
-  override def apply(input: Row): Row = {
+  override def apply(input: catalyst.InternalRow): catalyst.InternalRow = {
     var i = 0
     while (i < exprArray.length) {
       mutableRow(i) = exprArray(i).eval(input)
@@ -76,31 +78,31 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu
  * A mutable wrapper that makes two rows appear as a single concatenated row.  Designed to
  * be instantiated once per thread and reused.
  */
-class JoinedRow extends Row {
-  private[this] var row1: Row = _
-  private[this] var row2: Row = _
+class JoinedRow extends catalyst.InternalRow {
+  private[this] var row1: catalyst.InternalRow = _
+  private[this] var row2: catalyst.InternalRow = _
 
-  def this(left: Row, right: Row) = {
+  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: Row, r2: Row): Row = {
+  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: Row): Row = {
+  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: Row): Row = {
+  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
     row2 = newRight
     this
   }
@@ -142,7 +144,7 @@ class JoinedRow extends Row {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): Row = {
+  override def copy(): catalyst.InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -176,31 +178,31 @@ class JoinedRow extends Row {
  * Row will be referenced, increasing the opportunity for the JIT to play tricks.  This sounds
  * crazy but in benchmarks it had noticeable effects.
  */
-class JoinedRow2 extends Row {
-  private[this] var row1: Row = _
-  private[this] var row2: Row = _
+class JoinedRow2 extends catalyst.InternalRow {
+  private[this] var row1: catalyst.InternalRow = _
+  private[this] var row2: catalyst.InternalRow = _
 
-  def this(left: Row, right: Row) = {
+  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: Row, r2: Row): Row = {
+  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: Row): Row = {
+  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: Row): Row = {
+  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
     row2 = newRight
     this
   }
@@ -242,7 +244,7 @@ class JoinedRow2 extends Row {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): Row = {
+  override def copy(): catalyst.InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -270,31 +272,31 @@ class JoinedRow2 extends Row {
 /**
  * JIT HACK: Replace with macros
  */
-class JoinedRow3 extends Row {
-  private[this] var row1: Row = _
-  private[this] var row2: Row = _
+class JoinedRow3 extends catalyst.InternalRow {
+  private[this] var row1: catalyst.InternalRow = _
+  private[this] var row2: catalyst.InternalRow = _
 
-  def this(left: Row, right: Row) = {
+  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: Row, r2: Row): Row = {
+  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: Row): Row = {
+  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: Row): Row = {
+  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
     row2 = newRight
     this
   }
@@ -336,7 +338,7 @@ class JoinedRow3 extends Row {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): Row = {
+  override def copy(): catalyst.InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -364,31 +366,31 @@ class JoinedRow3 extends Row {
 /**
  * JIT HACK: Replace with macros
  */
-class JoinedRow4 extends Row {
-  private[this] var row1: Row = _
-  private[this] var row2: Row = _
+class JoinedRow4 extends catalyst.InternalRow {
+  private[this] var row1: catalyst.InternalRow = _
+  private[this] var row2: catalyst.InternalRow = _
 
-  def this(left: Row, right: Row) = {
+  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: Row, r2: Row): Row = {
+  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: Row): Row = {
+  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: Row): Row = {
+  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
     row2 = newRight
     this
   }
@@ -430,7 +432,7 @@ class JoinedRow4 extends Row {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): Row = {
+  override def copy(): catalyst.InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -458,31 +460,31 @@ class JoinedRow4 extends Row {
 /**
  * JIT HACK: Replace with macros
  */
-class JoinedRow5 extends Row {
-  private[this] var row1: Row = _
-  private[this] var row2: Row = _
+class JoinedRow5 extends catalyst.InternalRow {
+  private[this] var row1: catalyst.InternalRow = _
+  private[this] var row2: catalyst.InternalRow = _
 
-  def this(left: Row, right: Row) = {
+  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: Row, r2: Row): Row = {
+  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: Row): Row = {
+  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: Row): Row = {
+  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
     row2 = newRight
     this
   }
@@ -524,7 +526,7 @@ class JoinedRow5 extends Row {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): Row = {
+  override def copy(): catalyst.InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -552,31 +554,31 @@ class JoinedRow5 extends Row {
 /**
  * JIT HACK: Replace with macros
  */
-class JoinedRow6 extends Row {
-  private[this] var row1: Row = _
-  private[this] var row2: Row = _
+class JoinedRow6 extends catalyst.InternalRow {
+  private[this] var row1: catalyst.InternalRow = _
+  private[this] var row2: catalyst.InternalRow = _
 
-  def this(left: Row, right: Row) = {
+  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: Row, r2: Row): Row = {
+  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: Row): Row = {
+  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: Row): Row = {
+  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
     row2 = newRight
     this
   }
@@ -618,7 +620,7 @@ class JoinedRow6 extends Row {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): Row = {
+  override def copy(): catalyst.InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
index 5b45347872cca..40f235fc19536 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.types.DataType
 
@@ -45,7 +46,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       val func = function.asInstanceOf[($anys) => Any]
       $childs
       $converters
-      (input: Row) => {
+      (input: InternalRow) => {
         func(
           $evals)
       }
@@ -57,7 +58,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
   private[this] val f = children.size match {
     case 0 =>
       val func = function.asInstanceOf[() => Any]
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func()
       }
 
@@ -65,7 +66,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       val func = function.asInstanceOf[(Any) => Any]
       val child0 = children(0)
       lazy val converter0 = CatalystTypeConverters.createToScalaConverter(child0.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)))
       }
@@ -76,7 +77,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       val child1 = children(1)
       lazy val converter0 = CatalystTypeConverters.createToScalaConverter(child0.dataType)
       lazy val converter1 = CatalystTypeConverters.createToScalaConverter(child1.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)))
@@ -90,7 +91,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter0 = CatalystTypeConverters.createToScalaConverter(child0.dataType)
       lazy val converter1 = CatalystTypeConverters.createToScalaConverter(child1.dataType)
       lazy val converter2 = CatalystTypeConverters.createToScalaConverter(child2.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -107,7 +108,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter1 = CatalystTypeConverters.createToScalaConverter(child1.dataType)
       lazy val converter2 = CatalystTypeConverters.createToScalaConverter(child2.dataType)
       lazy val converter3 = CatalystTypeConverters.createToScalaConverter(child3.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -127,7 +128,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter2 = CatalystTypeConverters.createToScalaConverter(child2.dataType)
       lazy val converter3 = CatalystTypeConverters.createToScalaConverter(child3.dataType)
       lazy val converter4 = CatalystTypeConverters.createToScalaConverter(child4.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -150,7 +151,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter3 = CatalystTypeConverters.createToScalaConverter(child3.dataType)
       lazy val converter4 = CatalystTypeConverters.createToScalaConverter(child4.dataType)
       lazy val converter5 = CatalystTypeConverters.createToScalaConverter(child5.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -176,7 +177,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter4 = CatalystTypeConverters.createToScalaConverter(child4.dataType)
       lazy val converter5 = CatalystTypeConverters.createToScalaConverter(child5.dataType)
       lazy val converter6 = CatalystTypeConverters.createToScalaConverter(child6.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -205,7 +206,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter5 = CatalystTypeConverters.createToScalaConverter(child5.dataType)
       lazy val converter6 = CatalystTypeConverters.createToScalaConverter(child6.dataType)
       lazy val converter7 = CatalystTypeConverters.createToScalaConverter(child7.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -237,7 +238,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter6 = CatalystTypeConverters.createToScalaConverter(child6.dataType)
       lazy val converter7 = CatalystTypeConverters.createToScalaConverter(child7.dataType)
       lazy val converter8 = CatalystTypeConverters.createToScalaConverter(child8.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -272,7 +273,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter7 = CatalystTypeConverters.createToScalaConverter(child7.dataType)
       lazy val converter8 = CatalystTypeConverters.createToScalaConverter(child8.dataType)
       lazy val converter9 = CatalystTypeConverters.createToScalaConverter(child9.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -310,7 +311,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter8 = CatalystTypeConverters.createToScalaConverter(child8.dataType)
       lazy val converter9 = CatalystTypeConverters.createToScalaConverter(child9.dataType)
       lazy val converter10 = CatalystTypeConverters.createToScalaConverter(child10.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -351,7 +352,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter9 = CatalystTypeConverters.createToScalaConverter(child9.dataType)
       lazy val converter10 = CatalystTypeConverters.createToScalaConverter(child10.dataType)
       lazy val converter11 = CatalystTypeConverters.createToScalaConverter(child11.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -395,7 +396,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter10 = CatalystTypeConverters.createToScalaConverter(child10.dataType)
       lazy val converter11 = CatalystTypeConverters.createToScalaConverter(child11.dataType)
       lazy val converter12 = CatalystTypeConverters.createToScalaConverter(child12.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -442,7 +443,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter11 = CatalystTypeConverters.createToScalaConverter(child11.dataType)
       lazy val converter12 = CatalystTypeConverters.createToScalaConverter(child12.dataType)
       lazy val converter13 = CatalystTypeConverters.createToScalaConverter(child13.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -492,7 +493,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter12 = CatalystTypeConverters.createToScalaConverter(child12.dataType)
       lazy val converter13 = CatalystTypeConverters.createToScalaConverter(child13.dataType)
       lazy val converter14 = CatalystTypeConverters.createToScalaConverter(child14.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -545,7 +546,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter13 = CatalystTypeConverters.createToScalaConverter(child13.dataType)
       lazy val converter14 = CatalystTypeConverters.createToScalaConverter(child14.dataType)
       lazy val converter15 = CatalystTypeConverters.createToScalaConverter(child15.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -601,7 +602,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter14 = CatalystTypeConverters.createToScalaConverter(child14.dataType)
       lazy val converter15 = CatalystTypeConverters.createToScalaConverter(child15.dataType)
       lazy val converter16 = CatalystTypeConverters.createToScalaConverter(child16.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -660,7 +661,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter15 = CatalystTypeConverters.createToScalaConverter(child15.dataType)
       lazy val converter16 = CatalystTypeConverters.createToScalaConverter(child16.dataType)
       lazy val converter17 = CatalystTypeConverters.createToScalaConverter(child17.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -722,7 +723,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter16 = CatalystTypeConverters.createToScalaConverter(child16.dataType)
       lazy val converter17 = CatalystTypeConverters.createToScalaConverter(child17.dataType)
       lazy val converter18 = CatalystTypeConverters.createToScalaConverter(child18.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -787,7 +788,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter17 = CatalystTypeConverters.createToScalaConverter(child17.dataType)
       lazy val converter18 = CatalystTypeConverters.createToScalaConverter(child18.dataType)
       lazy val converter19 = CatalystTypeConverters.createToScalaConverter(child19.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -855,7 +856,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter18 = CatalystTypeConverters.createToScalaConverter(child18.dataType)
       lazy val converter19 = CatalystTypeConverters.createToScalaConverter(child19.dataType)
       lazy val converter20 = CatalystTypeConverters.createToScalaConverter(child20.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -926,7 +927,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter19 = CatalystTypeConverters.createToScalaConverter(child19.dataType)
       lazy val converter20 = CatalystTypeConverters.createToScalaConverter(child20.dataType)
       lazy val converter21 = CatalystTypeConverters.createToScalaConverter(child21.dataType)
-      (input: Row) => {
+      (input: catalyst.InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -955,6 +956,6 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
 
   // scalastyle:on
   private[this] val converter = CatalystTypeConverters.createToCatalystConverter(dataType)
-  override def eval(input: Row): Any = converter(f(input))
+  override def eval(input: catalyst.InternalRow): Any = converter(f(input))
 
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
index 99340a14c9ecc..8a3435599922f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.trees
+import org.apache.spark.sql.catalyst.{InternalRow, trees}
 import org.apache.spark.sql.types.DataType
 
 abstract sealed class SortDirection
@@ -36,7 +36,7 @@ case class SortOrder(child: Expression, direction: SortDirection) extends Expres
   override def nullable: Boolean = child.nullable
 
   // SortOrder itself is never evaluated.
-  override def eval(input: Row = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$child ${if (direction == Ascending) "ASC" else "DESC"}"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index 98eda61a80b40..05aab34559985 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -222,7 +222,7 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
 
   override def isNullAt(i: Int): Boolean = values(i).isNull
 
-  override def copy(): Row = {
+  override def copy(): InternalRow = {
     val newValues = new Array[Any](values.length)
     var i = 0
     while (i < values.length) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
index 5350123bf4c01..d771e454b5170 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
@@ -48,7 +48,7 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
   /**
    * Compute the amount of space, in bytes, required to encode the given row.
    */
-  def getSizeRequirement(row: Row): Int = {
+  def getSizeRequirement(row: InternalRow): Int = {
     var fieldNumber = 0
     var variableLengthFieldSize: Int = 0
     while (fieldNumber < writers.length) {
@@ -68,7 +68,7 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
    * @param baseOffset the base offset of the destination address
    * @return the number of bytes written. This should be equal to `getSizeRequirement(row)`.
    */
-  def writeRow(row: Row, baseObject: Object, baseOffset: Long): Long = {
+  def writeRow(row: InternalRow, baseObject: Object, baseOffset: Long): Long = {
     unsafeRow.pointTo(baseObject, baseOffset, writers.length, null)
     var fieldNumber = 0
     var appendCursor: Int = fixedLengthSize
@@ -99,12 +99,12 @@ private abstract class UnsafeColumnWriter {
    *                     used for calculating where variable-length data should be written
    * @return the number of variable-length bytes written
    */
-  def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int
+  def write(source: InternalRow, target: UnsafeRow, column: Int, appendCursor: Int): Int
 
   /**
    * Return the number of bytes that are needed to write this variable-length value.
    */
-  def getSize(source: Row, column: Int): Int
+  def getSize(source: InternalRow, column: Int): Int
 }
 
 private object UnsafeColumnWriter {
@@ -140,72 +140,108 @@ private object StringUnsafeColumnWriter extends StringUnsafeColumnWriter
 
 private abstract class PrimitiveUnsafeColumnWriter extends UnsafeColumnWriter {
   // Primitives don't write to the variable-length region:
-  def getSize(sourceRow: Row, column: Int): Int = 0
+  def getSize(sourceRow: InternalRow, column: Int): Int = 0
 }
 
 private class NullUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     target.setNullAt(column)
     0
   }
 }
 
 private class BooleanUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     target.setBoolean(column, source.getBoolean(column))
     0
   }
 }
 
 private class ByteUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     target.setByte(column, source.getByte(column))
     0
   }
 }
 
 private class ShortUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     target.setShort(column, source.getShort(column))
     0
   }
 }
 
 private class IntUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     target.setInt(column, source.getInt(column))
     0
   }
 }
 
 private class LongUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     target.setLong(column, source.getLong(column))
     0
   }
 }
 
 private class FloatUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     target.setFloat(column, source.getFloat(column))
     0
   }
 }
 
 private class DoubleUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     target.setDouble(column, source.getDouble(column))
     0
   }
 }
 
 private class StringUnsafeColumnWriter private() extends UnsafeColumnWriter {
-  def getSize(source: Row, column: Int): Int = {
+  def getSize(source: InternalRow, column: Int): Int = {
     val numBytes = source.get(column).asInstanceOf[UTF8String].getBytes.length
     8 + ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes)
   }
 
-  override def write(source: Row, target: UnsafeRow, column: Int, appendCursor: Int): Int = {
+  override def write(
+      source: InternalRow,
+      target: UnsafeRow,
+      column: Int,
+      appendCursor: Int): Int = {
     val value = source.get(column).asInstanceOf[UTF8String]
     val baseObject = target.getBaseObject
     val baseOffset = target.getBaseOffset
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index 0266084a6d174..f9e8150a689c1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -19,9 +19,10 @@ package org.apache.spark.sql.catalyst.expressions
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLog
 
-import org.apache.spark.sql.types._
-import org.apache.spark.sql.catalyst.trees
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
+import org.apache.spark.sql.catalyst.trees
+import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
 abstract class AggregateExpression extends Expression {
@@ -37,7 +38,7 @@ abstract class AggregateExpression extends Expression {
    * [[AggregateExpression.eval]] should never be invoked because [[AggregateExpression]]'s are
    * replaced with a physical aggregate operator at runtime.
    */
-  override def eval(input: Row = null): Any =
+  override def eval(input: catalyst.InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
@@ -80,7 +81,7 @@ abstract class AggregateFunction
   override def nullable: Boolean = base.nullable
   override def dataType: DataType = base.dataType
 
-  def update(input: Row): Unit
+  def update(input: catalyst.InternalRow): Unit
 
   // Do we really need this?
   override def newInstance(): AggregateFunction = {
@@ -108,7 +109,7 @@ case class MinFunction(expr: Expression, base: AggregateExpression) extends Aggr
   val currentMin: MutableLiteral = MutableLiteral(null, expr.dataType)
   val cmp = GreaterThan(currentMin, expr)
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     if (currentMin.value == null) {
       currentMin.value = expr.eval(input)
     } else if (cmp.eval(input) == true) {
@@ -116,7 +117,7 @@ case class MinFunction(expr: Expression, base: AggregateExpression) extends Aggr
     }
   }
 
-  override def eval(input: Row): Any = currentMin.value
+  override def eval(input: catalyst.InternalRow): Any = currentMin.value
 }
 
 case class Max(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
@@ -139,7 +140,7 @@ case class MaxFunction(expr: Expression, base: AggregateExpression) extends Aggr
   val currentMax: MutableLiteral = MutableLiteral(null, expr.dataType)
   val cmp = LessThan(currentMax, expr)
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     if (currentMax.value == null) {
       currentMax.value = expr.eval(input)
     } else if (cmp.eval(input) == true) {
@@ -147,7 +148,7 @@ case class MaxFunction(expr: Expression, base: AggregateExpression) extends Aggr
     }
   }
 
-  override def eval(input: Row): Any = currentMax.value
+  override def eval(input: catalyst.InternalRow): Any = currentMax.value
 }
 
 case class Count(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
@@ -205,14 +206,14 @@ case class CollectHashSetFunction(
   @transient
   val distinctValue = new InterpretedProjection(expr)
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val evaluatedExpr = distinctValue(input)
     if (!evaluatedExpr.anyNull) {
       seen.add(evaluatedExpr)
     }
   }
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     seen
   }
 }
@@ -238,7 +239,7 @@ case class CombineSetsAndCountFunction(
 
   val seen = new OpenHashSet[Any]()
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val inputSetEval = inputSet.eval(input).asInstanceOf[OpenHashSet[Any]]
     val inputIterator = inputSetEval.iterator
     while (inputIterator.hasNext) {
@@ -246,7 +247,7 @@ case class CombineSetsAndCountFunction(
     }
   }
 
-  override def eval(input: Row): Any = seen.size.toLong
+  override def eval(input: catalyst.InternalRow): Any = seen.size.toLong
 }
 
 /** The data type of ApproxCountDistinctPartition since its output is a HyperLogLog object. */
@@ -453,7 +454,7 @@ case class CombineSetsAndSumFunction(
 
   val seen = new OpenHashSet[Any]()
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val inputSetEval = inputSet.eval(input).asInstanceOf[OpenHashSet[Any]]
     val inputIterator = inputSetEval.iterator
     while (inputIterator.hasNext) {
@@ -461,8 +462,8 @@ case class CombineSetsAndSumFunction(
     }
   }
 
-  override def eval(input: Row): Any = {
-    val casted = seen.asInstanceOf[OpenHashSet[Row]]
+  override def eval(input: catalyst.InternalRow): Any = {
+    val casted = seen.asInstanceOf[OpenHashSet[catalyst.InternalRow]]
     if (casted.size == 0) {
       null
     } else {
@@ -524,7 +525,7 @@ case class AverageFunction(expr: Expression, base: AggregateExpression)
   private def addFunction(value: Any) = Add(sum,
     Cast(Literal.create(value, expr.dataType), calcType))
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     if (count == 0L) {
       null
     } else {
@@ -541,7 +542,7 @@ case class AverageFunction(expr: Expression, base: AggregateExpression)
     }
   }
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     if (evaluatedExpr != null) {
       count += 1
@@ -555,14 +556,14 @@ case class CountFunction(expr: Expression, base: AggregateExpression) extends Ag
 
   var count: Long = _
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     if (evaluatedExpr != null) {
       count += 1L
     }
   }
 
-  override def eval(input: Row): Any = count
+  override def eval(input: catalyst.InternalRow): Any = count
 }
 
 case class ApproxCountDistinctPartitionFunction(
@@ -574,14 +575,14 @@ case class ApproxCountDistinctPartitionFunction(
 
   private val hyperLogLog = new HyperLogLog(relativeSD)
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     if (evaluatedExpr != null) {
       hyperLogLog.offer(evaluatedExpr)
     }
   }
 
-  override def eval(input: Row): Any = hyperLogLog
+  override def eval(input: catalyst.InternalRow): Any = hyperLogLog
 }
 
 case class ApproxCountDistinctMergeFunction(
@@ -593,12 +594,12 @@ case class ApproxCountDistinctMergeFunction(
 
   private val hyperLogLog = new HyperLogLog(relativeSD)
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     hyperLogLog.addAll(evaluatedExpr.asInstanceOf[HyperLogLog])
   }
 
-  override def eval(input: Row): Any = hyperLogLog.cardinality()
+  override def eval(input: catalyst.InternalRow): Any = hyperLogLog.cardinality()
 }
 
 case class SumFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
@@ -619,11 +620,11 @@ case class SumFunction(expr: Expression, base: AggregateExpression) extends Aggr
   private val addFunction =
     Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum, zero))
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     sum.update(addFunction, input)
   }
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     expr.dataType match {
       case DecimalType.Fixed(_, _) =>
         Cast(sum, dataType).eval(null)
@@ -652,7 +653,7 @@ case class CombineSumFunction(expr: Expression, base: AggregateExpression)
   private val addFunction =
     Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum, zero))
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val result = expr.eval(input)
     // partial sum result can be null only when no input rows present
     if(result != null) {
@@ -660,7 +661,7 @@ case class CombineSumFunction(expr: Expression, base: AggregateExpression)
     }
   }
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     expr.dataType match {
       case DecimalType.Fixed(_, _) =>
         Cast(sum, dataType).eval(null)
@@ -676,14 +677,14 @@ case class SumDistinctFunction(expr: Expression, base: AggregateExpression)
 
   private val seen = new scala.collection.mutable.HashSet[Any]()
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     if (evaluatedExpr != null) {
       seen += evaluatedExpr
     }
   }
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     if (seen.size == 0) {
       null
     } else {
@@ -707,14 +708,14 @@ case class CountDistinctFunction(
   @transient
   val distinctValue = new InterpretedProjection(expr)
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     val evaluatedExpr = distinctValue(input)
     if (!evaluatedExpr.anyNull) {
       seen.add(evaluatedExpr)
     }
   }
 
-  override def eval(input: Row): Any = seen.size.toLong
+  override def eval(input: catalyst.InternalRow): Any = seen.size.toLong
 }
 
 case class FirstFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
@@ -722,13 +723,13 @@ case class FirstFunction(expr: Expression, base: AggregateExpression) extends Ag
 
   var result: Any = null
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     if (result == null) {
       result = expr.eval(input)
     }
   }
 
-  override def eval(input: Row): Any = result
+  override def eval(input: catalyst.InternalRow): Any = result
 }
 
 case class LastFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
@@ -736,11 +737,11 @@ case class LastFunction(expr: Expression, base: AggregateExpression) extends Agg
 
   var result: Any = null
 
-  override def update(input: Row): Unit = {
+  override def update(input: catalyst.InternalRow): Unit = {
     result = input
   }
 
-  override def eval(input: Row): Any = {
-    if (result != null) expr.eval(result.asInstanceOf[Row]) else null
+  override def eval(input: catalyst.InternalRow): Any = {
+    if (result != null) expr.eval(result.asInstanceOf[catalyst.InternalRow]) else null
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 124274c94203c..0ba2ff75aac5c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -17,8 +17,9 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
@@ -29,7 +30,7 @@ abstract class UnaryArithmetic extends UnaryExpression {
   override def nullable: Boolean = child.nullable
   override def dataType: DataType = child.dataType
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE = child.eval(input)
     if (evalE == null) {
       null
@@ -124,7 +125,7 @@ abstract class BinaryArithmetic extends BinaryExpression {
 
   protected def checkTypesInternal(t: DataType): TypeCheckResult
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE1 = left.eval(input)
     if(evalE1 == null) {
       null
@@ -219,7 +220,7 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
     case it: IntegralType => it.integral.asInstanceOf[Integral[Any]].quot
   }
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE2 = right.eval(input)
     if (evalE2 == null || evalE2 == 0) {
       null
@@ -279,7 +280,7 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
     case i: FractionalType => i.asIntegral.asInstanceOf[Integral[Any]]
   }
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE2 = right.eval(input)
     if (evalE2 == null || evalE2 == 0) {
       null
@@ -330,7 +331,7 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
 
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE1 = left.eval(input)
     val evalE2 = right.eval(input)
     if (evalE1 == null) {
@@ -384,7 +385,7 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
 
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE1 = left.eval(input)
     val evalE2 = right.eval(input)
     if (evalE1 == null) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 536e47733074a..244a06638f61f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -24,6 +24,7 @@ import com.google.common.cache.{CacheBuilder, CacheLoader}
 import org.codehaus.janino.ClassBodyEvaluator
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -34,7 +35,7 @@ class IntegerHashSet extends org.apache.spark.util.collection.OpenHashSet[Int]
 class LongHashSet extends org.apache.spark.util.collection.OpenHashSet[Long]
 
 /**
- * Java source for evaluating an [[Expression]] given a [[Row]] of input.
+ * Java source for evaluating an [[Expression]] given a [[catalyst.InternalRow]] of input.
  *
  * @param code The sequence of statements required to evaluate the expression.
  * @param isNull A term that holds a boolean value representing whether the expression evaluated
@@ -183,13 +184,13 @@ class CodeGenContext {
   }
 
   /**
-   * List of data types that have special accessors and setters in [[Row]].
+   * List of data types that have special accessors and setters in [[catalyst.InternalRow]].
    */
   val nativeTypes =
     Seq(IntegerType, BooleanType, LongType, DoubleType, FloatType, ShortType, ByteType)
 
   /**
-   * Returns true if the data type has a special accessor and setter in [[Row]].
+   * Returns true if the data type has a special accessor and setter in [[catalyst.InternalRow]].
    */
   def isNativeType(dt: DataType): Boolean = nativeTypes.contains(dt)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index ed3df547d1c90..35cb954c54308 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions.codegen
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.expressions._
 
 // MutableProjection is not accessible in Java
@@ -24,7 +25,7 @@ abstract class BaseMutableProjection extends MutableProjection {}
 
 /**
  * Generates byte code that produces a [[MutableRow]] object that can update itself based on a new
- * input [[Row]] for a fixed set of [[Expression Expressions]].
+ * input [[catalyst.InternalRow]] for a fixed set of [[Expression Expressions]].
  */
 object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => MutableProjection] {
 
@@ -47,7 +48,7 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
         """
     }.mkString("\n")
     val code = s"""
-      import org.apache.spark.sql.Row;
+      import org.apache.spark.sql.catalyst.InternalRow;
 
       public SpecificProjection generate($exprType[] expr) {
         return new SpecificProjection(expr);
@@ -69,12 +70,12 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
         }
 
         /* Provide immutable access to the last projected row. */
-        public Row currentValue() {
-          return mutableRow;
+        public InternalRow currentValue() {
+          return (InternalRow) mutableRow;
         }
 
         public Object apply(Object _i) {
-          Row i = (Row) _i;
+          InternalRow i = (InternalRow) _i;
           $projectionCode
 
           return mutableRow;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index 56ecc5fc06cc1..db5d570aeb6d4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -19,15 +19,15 @@ package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.Logging
 import org.apache.spark.annotation.Private
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.{catalyst, Row}
 import org.apache.spark.sql.catalyst.expressions._
 
 /**
  * Inherits some default implementation for Java from `Ordering[Row]`
  */
 @Private
-class BaseOrdering extends Ordering[Row] {
-  def compare(a: Row, b: Row): Int = {
+class BaseOrdering extends Ordering[catalyst.InternalRow] {
+  def compare(a: catalyst.InternalRow, b: catalyst.InternalRow): Int = {
     throw new UnsupportedOperationException
   }
 }
@@ -36,7 +36,8 @@ class BaseOrdering extends Ordering[Row] {
  * Generates bytecode for an [[Ordering]] of [[Row Rows]] for a given set of
  * [[Expression Expressions]].
  */
-object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] with Logging {
+object GenerateOrdering
+    extends CodeGenerator[Seq[SortOrder], Ordering[catalyst.InternalRow]] with Logging {
   import scala.reflect.runtime.universe._
 
   protected def canonicalize(in: Seq[SortOrder]): Seq[SortOrder] =
@@ -45,7 +46,7 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
   protected def bind(in: Seq[SortOrder], inputSchema: Seq[Attribute]): Seq[SortOrder] =
     in.map(BindReferences.bindReference(_, inputSchema))
 
-  protected def create(ordering: Seq[SortOrder]): Ordering[Row] = {
+  protected def create(ordering: Seq[SortOrder]): Ordering[catalyst.InternalRow] = {
     val a = newTermName("a")
     val b = newTermName("b")
     val ctx = newCodeGenContext()
@@ -75,7 +76,7 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
     }.mkString("\n")
 
     val code = s"""
-      import org.apache.spark.sql.Row;
+      import org.apache.spark.sql.catalyst.InternalRow;
 
       public SpecificOrdering generate($exprType[] expr) {
         return new SpecificOrdering(expr);
@@ -90,8 +91,8 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit
         }
 
         @Override
-        public int compare(Row a, Row b) {
-          Row i = null;  // Holds current row being evaluated.
+        public int compare(InternalRow a, InternalRow b) {
+          InternalRow i = null;  // Holds current row being evaluated.
           $comparisons
           return 0;
         }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
index 4a547b5ce9543..9e191dc2e9422 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
@@ -17,30 +17,31 @@
 
 package org.apache.spark.sql.catalyst.expressions.codegen
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.expressions._
 
 /**
  * Interface for generated predicate
  */
 abstract class Predicate {
-  def eval(r: Row): Boolean
+  def eval(r: catalyst.InternalRow): Boolean
 }
 
 /**
- * Generates bytecode that evaluates a boolean [[Expression]] on a given input [[Row]].
+ * Generates bytecode that evaluates a boolean [[Expression]] on a given input [[InternalRow]].
  */
-object GeneratePredicate extends CodeGenerator[Expression, (Row) => Boolean] {
+object GeneratePredicate extends CodeGenerator[Expression, (catalyst.InternalRow) => Boolean] {
 
   protected def canonicalize(in: Expression): Expression = ExpressionCanonicalizer.execute(in)
 
   protected def bind(in: Expression, inputSchema: Seq[Attribute]): Expression =
     BindReferences.bindReference(in, inputSchema)
 
-  protected def create(predicate: Expression): ((Row) => Boolean) = {
+  protected def create(predicate: Expression): ((catalyst.InternalRow) => Boolean) = {
     val ctx = newCodeGenContext()
     val eval = predicate.gen(ctx)
     val code = s"""
-      import org.apache.spark.sql.Row;
+      import org.apache.spark.sql.catalyst.InternalRow;
 
       public SpecificPredicate generate($exprType[] expr) {
         return new SpecificPredicate(expr);
@@ -53,7 +54,7 @@ object GeneratePredicate extends CodeGenerator[Expression, (Row) => Boolean] {
         }
 
         @Override
-        public boolean eval(Row i) {
+        public boolean eval(InternalRow i) {
           ${eval.code}
           return !${eval.isNull} && ${eval.primitive};
         }
@@ -65,6 +66,6 @@ object GeneratePredicate extends CodeGenerator[Expression, (Row) => Boolean] {
     // fetch the only one method `generate(Expression[])`
     val m = c.getDeclaredMethods()(0)
     val p = m.invoke(c.newInstance(), ctx.references.toArray).asInstanceOf[Predicate]
-    (r: Row) => p.eval(r)
+    (r: catalyst.InternalRow) => p.eval(r)
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 9b906c3ff5cde..8b5dc194be31f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -27,9 +27,10 @@ import org.apache.spark.sql.types._
 abstract class BaseProject extends Projection {}
 
 /**
- * Generates bytecode that produces a new [[Row]] object based on a fixed set of input
- * [[Expression Expressions]] and a given input [[Row]].  The returned [[Row]] object is custom
- * generated based on the output types of the [[Expression]] to avoid boxing of primitive values.
+ * Generates bytecode that produces a new [[InternalRow]] object based on a fixed set of input
+ * [[Expression Expressions]] and a given input [[InternalRow]].  The returned [[InternalRow]]
+ * object is custom generated based on the output types of the [[Expression]] to avoid boxing of
+ * primitive values.
  */
 object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
   import scala.reflect.runtime.universe._
@@ -146,7 +147,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
     }.mkString("\n")
 
     val code = s"""
-    import org.apache.spark.sql.Row;
+    import org.apache.spark.sql.catalyst.InternalRow;
 
     public SpecificProjection generate($exprType[] expr) {
       return new SpecificProjection(expr);
@@ -161,7 +162,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
       @Override
       public Object apply(Object r) {
-        return new SpecificRow(expressions, (Row) r);
+        return new SpecificRow(expressions, (InternalRow) r);
       }
     }
 
@@ -169,7 +170,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
       $columns
 
-      public SpecificRow($exprType[] expressions, Row i) {
+      public SpecificRow($exprType[] expressions, InternalRow i) {
         $initColumns
       }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
index 6398b8f9e4ed7..a6913cc03ca20 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.types._
 
 
@@ -41,7 +42,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
 
   override def nullable: Boolean = false
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     children.map(_.eval(input))
   }
 
@@ -69,7 +70,7 @@ case class CreateStruct(children: Seq[NamedExpression]) extends Expression {
 
   override def nullable: Boolean = false
 
-  override def eval(input: Row): Any = {
-    Row(children.map(_.eval(input)): _*)
+  override def eval(input: catalyst.InternalRow): Any = {
+    InternalRow(children.map(_.eval(input)): _*)
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 72b9f23456a54..a119c313007c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types.{BooleanType, DataType}
@@ -42,7 +43,7 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
 
   override def dataType: DataType = trueValue.dataType
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     if (true == predicate.eval(input)) {
       trueValue.eval(input)
     } else {
@@ -137,7 +138,7 @@ case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike {
   }
 
   /** Written in imperative fashion for performance considerations. */
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val len = branchesArr.length
     var i = 0
     // If all branches fail and an elseVal is not provided, the whole statement
@@ -229,7 +230,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
   }
 
   /** Written in imperative fashion for performance considerations. */
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evaluatedKey = key.eval(input)
     val len = branchesArr.length
     var i = 0
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
index 8ab6d977dd3a6..de8b66bc3bcbd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
@@ -17,7 +17,8 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.types._
 
 /** Return the unscaled Long value of a Decimal, assuming it fits in a Long */
@@ -28,7 +29,7 @@ case class UnscaledValue(child: Expression) extends UnaryExpression {
   override def nullable: Boolean = child.nullable
   override def toString: String = s"UnscaledValue($child)"
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val childResult = child.eval(input)
     if (childResult == null) {
       null
@@ -50,7 +51,7 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un
   override def nullable: Boolean = child.nullable
   override def toString: String = s"MakeDecimal($child,$precision,$scale)"
 
-  override def eval(input: Row): Decimal = {
+  override def eval(input: catalyst.InternalRow): Decimal = {
     val childResult = child.eval(input)
     if (childResult == null) {
       null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index b6191eafba71b..a80c255a296af 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import scala.collection.Map
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, trees}
 import org.apache.spark.sql.types._
 
@@ -53,13 +54,13 @@ abstract class Generator extends Expression {
   def elementTypes: Seq[(DataType, Boolean)]
 
   /** Should be implemented by child classes to perform specific Generators. */
-  override def eval(input: Row): TraversableOnce[Row]
+  override def eval(input: catalyst.InternalRow): TraversableOnce[catalyst.InternalRow]
 
   /**
    * Notifies that there are no more rows to process, clean up code, and additional
    * rows can be made here.
    */
-  def terminate(): TraversableOnce[Row] = Nil
+  def terminate(): TraversableOnce[catalyst.InternalRow] = Nil
 }
 
 /**
@@ -67,22 +68,22 @@ abstract class Generator extends Expression {
  */
 case class UserDefinedGenerator(
     elementTypes: Seq[(DataType, Boolean)],
-    function: Row => TraversableOnce[Row],
+    function: catalyst.InternalRow => TraversableOnce[catalyst.InternalRow],
     children: Seq[Expression])
   extends Generator {
 
   @transient private[this] var inputRow: InterpretedProjection = _
-  @transient private[this] var convertToScala: (Row) => Row = _
+  @transient private[this] var convertToScala: (catalyst.InternalRow) => catalyst.InternalRow = _
 
   private def initializeConverters(): Unit = {
     inputRow = new InterpretedProjection(children)
     convertToScala = {
       val inputSchema = StructType(children.map(e => StructField(e.simpleString, e.dataType, true)))
       CatalystTypeConverters.createToScalaConverter(inputSchema)
-    }.asInstanceOf[(Row => Row)]
+    }.asInstanceOf[(catalyst.InternalRow => catalyst.InternalRow)]
   }
 
-  override def eval(input: Row): TraversableOnce[Row] = {
+  override def eval(input: catalyst.InternalRow): TraversableOnce[catalyst.InternalRow] = {
     if (inputRow == null) {
       initializeConverters()
     }
@@ -108,7 +109,7 @@ case class Explode(child: Expression)
     case MapType(kt, vt, valueContainsNull) => (kt, false) :: (vt, valueContainsNull) :: Nil
   }
 
-  override def eval(input: Row): TraversableOnce[Row] = {
+  override def eval(input: catalyst.InternalRow): TraversableOnce[catalyst.InternalRow] = {
     child.dataType match {
       case ArrayType(_, _) =>
         val inputArray = child.eval(input).asInstanceOf[Seq[Any]]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index a33007bda1458..d8fff2b84d585 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.sql.{Date, Timestamp}
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateUtils
@@ -87,7 +88,7 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
     case _ => false
   }
 
-  override def eval(input: Row): Any = value
+  override def eval(input: catalyst.InternalRow): Any = value
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     // change the isNull and primitive to consts, to inline them
@@ -142,9 +143,9 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
 case class MutableLiteral(var value: Any, dataType: DataType, nullable: Boolean = true)
     extends LeafExpression {
 
-  def update(expression: Expression, input: Row): Unit = {
+  def update(expression: Expression, input: catalyst.InternalRow): Unit = {
     value = expression.eval(input)
   }
 
-  override def eval(input: Row): Any = value
+  override def eval(input: catalyst.InternalRow): Any = value
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 97e960b8d6422..6f90d607ddbcc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types.{DataType, DoubleType}
 
@@ -34,7 +35,7 @@ abstract class LeafMathExpression(c: Double, name: String)
   override def nullable: Boolean = false
   override def toString: String = s"$name()"
 
-  override def eval(input: Row): Any = c
+  override def eval(input: catalyst.InternalRow): Any = c
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     s"""
@@ -60,7 +61,7 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
   override def nullable: Boolean = true
   override def toString: String = s"$name($child)"
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE = child.eval(input)
     if (evalE == null) {
       null
@@ -103,7 +104,7 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
 
   override def dataType: DataType = DoubleType
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE1 = left.eval(input)
     if (evalE1 == null) {
       null
@@ -215,7 +216,7 @@ case class ToRadians(child: Expression) extends UnaryMathExpression(math.toRadia
 case class Atan2(left: Expression, right: Expression)
   extends BinaryMathExpression(math.atan2, "ATAN2") {
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     val evalE1 = left.eval(input)
     if (evalE1 == null) {
       null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 2e4b9ba678433..20505129e96c3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
@@ -114,7 +115,7 @@ case class Alias(child: Expression, name: String)(
   // Alias(Generator, xx) need to be transformed into Generate(generator, ...)
   override lazy val resolved = childrenResolved && !child.isInstanceOf[Generator]
 
-  override def eval(input: Row): Any = child.eval(input)
+  override def eval(input: catalyst.InternalRow): Any = child.eval(input)
 
   override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
 
@@ -230,7 +231,7 @@ case class AttributeReference(
   }
 
   // Unresolved attributes are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): Any =
+  override def eval(input: catalyst.InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$name#${exprId.id}$typeSuffix"
@@ -252,7 +253,7 @@ case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[E
   override def withName(newName: String): Attribute = throw new UnsupportedOperationException
   override def qualifiers: Seq[String] = throw new UnsupportedOperationException
   override def exprId: ExprId = throw new UnsupportedOperationException
-  override def eval(input: Row): Any = throw new UnsupportedOperationException
+  override def eval(input: catalyst.InternalRow): Any = throw new UnsupportedOperationException
   override def nullable: Boolean = throw new UnsupportedOperationException
   override def dataType: DataType = NullType
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index c2d1a4eadae29..292d626f019f4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
@@ -43,7 +44,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
       this, s"Coalesce cannot have children of different types. $childTypes")
   }
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     var i = 0
     var result: Any = null
     val childIterator = children.iterator
@@ -77,7 +78,7 @@ case class IsNull(child: Expression) extends Predicate with trees.UnaryNode[Expr
   override def foldable: Boolean = child.foldable
   override def nullable: Boolean = false
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     child.eval(input) == null
   }
 
@@ -96,7 +97,7 @@ case class IsNotNull(child: Expression) extends Predicate with trees.UnaryNode[E
   override def nullable: Boolean = false
   override def toString: String = s"IS NOT NULL $child"
 
-  override def eval(input: Row): Any = {
+  override def eval(input: catalyst.InternalRow): Any = {
     child.eval(input) != null
   }
 
@@ -118,7 +119,7 @@ case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate
 
   private[this] val childrenArray = children.toArray
 
-  override def eval(input: Row): Boolean = {
+  override def eval(input: catalyst.InternalRow): Boolean = {
     var numNonNulls = 0
     var i = 0
     while (i < childrenArray.length && numNonNulls < n) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
index fbc97b2e75312..c2e57b4715a79 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst
 
+import org.apache.spark.sql.catalyst
+
 /**
  * A set of classes that can be used to represent trees of relational expressions.  A key goal of
  * the expression library is to hide the details of naming and scoping from developers who want to
@@ -49,30 +51,30 @@ package org.apache.spark.sql.catalyst
  */
 package object expressions  {
 
-  type Row = org.apache.spark.sql.Row
+  type InternalRow = catalyst.InternalRow
 
-  val Row = org.apache.spark.sql.Row
+  val InternalRow = catalyst.InternalRow
 
   /**
-   * Converts a [[Row]] to another Row given a sequence of expression that define each column of the
-   * new row. If the schema of the input row is specified, then the given expression will be bound
-   * to that schema.
+   * Converts a [[InternalRow]] to another Row given a sequence of expression that define each
+   * column of the new row. If the schema of the input row is specified, then the given expression
+   * will be bound to that schema.
    */
-  abstract class Projection extends (Row => Row)
+  abstract class Projection extends (InternalRow => InternalRow)
 
   /**
-   * Converts a [[Row]] to another Row given a sequence of expression that define each column of the
-   * new row. If the schema of the input row is specified, then the given expression will be bound
-   * to that schema.
+   * Converts a [[InternalRow]] to another Row given a sequence of expression that define each
+   * column of the new row. If the schema of the input row is specified, then the given expression
+   * will be bound to that schema.
    *
    * In contrast to a normal projection, a MutableProjection reuses the same underlying row object
    * each time an input row is added.  This significantly reduces the cost of calculating the
-   * projection, but means that it is not safe to hold on to a reference to a [[Row]] after `next()`
-   * has been called on the [[Iterator]] that produced it. Instead, the user must call `Row.copy()`
-   * and hold on to the returned [[Row]] before calling `next()`.
+   * projection, but means that it is not safe to hold on to a reference to a [[InternalRow]] after
+   * `next()` has been called on the [[Iterator]] that produced it. Instead, the user must call
+   * `InternalRow.copy()` and hold on to the returned [[InternalRow]] before calling `next()`.
    */
   abstract class MutableProjection extends Projection {
-    def currentValue: Row
+    def currentValue: InternalRow
 
     /** Uses the given row to store the output of the projection. */
     def target(row: MutableRow): MutableProjection
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 7574d1cbda33e..082d72eb438fa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -24,11 +24,11 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.types._
 
 object InterpretedPredicate {
-  def create(expression: Expression, inputSchema: Seq[Attribute]): (Row => Boolean) =
+  def create(expression: Expression, inputSchema: Seq[Attribute]): (InternalRow => Boolean) =
     create(BindReferences.bindReference(expression, inputSchema))
 
-  def create(expression: Expression): (Row => Boolean) = {
-    (r: Row) => expression.eval(r).asInstanceOf[Boolean]
+  def create(expression: Expression): (InternalRow => Boolean) = {
+    (r: InternalRow) => expression.eval(r).asInstanceOf[Boolean]
   }
 }
 
@@ -77,7 +77,7 @@ case class Not(child: Expression) extends UnaryExpression with Predicate with Ex
 
   override def expectedChildTypes: Seq[DataType] = Seq(BooleanType)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     child.eval(input) match {
       case null => null
       case b: Boolean => !b
@@ -98,7 +98,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {
   override def nullable: Boolean = true // TODO: Figure out correct nullability semantics of IN.
   override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}"
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val evaluatedValue = value.eval(input)
     list.exists(e => e.eval(input) == evaluatedValue)
   }
@@ -117,7 +117,7 @@ case class InSet(value: Expression, hset: Set[Any])
   override def nullable: Boolean = true // TODO: Figure out correct nullability semantics of IN.
   override def toString: String = s"$value INSET ${hset.mkString("(", ",", ")")}"
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     hset.contains(value.eval(input))
   }
 }
@@ -129,7 +129,7 @@ case class And(left: Expression, right: Expression)
 
   override def symbol: String = "&&"
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val l = left.eval(input)
     if (l == false) {
        false
@@ -178,7 +178,7 @@ case class Or(left: Expression, right: Expression)
 
   override def symbol: String = "||"
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val l = left.eval(input)
     if (l == true) {
       true
@@ -235,7 +235,7 @@ abstract class BinaryComparison extends BinaryExpression with Predicate {
 
   protected def checkTypesInternal(t: DataType): TypeCheckResult
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE1 = left.eval(input)
     if (evalE1 == null) {
       null
@@ -288,7 +288,7 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
 
   override protected def checkTypesInternal(t: DataType) = TypeCheckResult.TypeCheckSuccess
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val l = left.eval(input)
     val r = right.eval(input)
     if (l == null && r == null) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 6e4e9cb1be090..7e8033307ea4e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -48,7 +48,7 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
 
 /** Generate a random column with i.i.d. uniformly distributed values in [0, 1). */
 case class Rand(seed: Long) extends RDG(seed) {
-  override def eval(input: Row): Double = rng.nextDouble()
+  override def eval(input: InternalRow): Double = rng.nextDouble()
 }
 
 object Rand {
@@ -62,7 +62,7 @@ object Rand {
 
 /** Generate a random column with i.i.d. gaussian random distribution. */
 case class Randn(seed: Long) extends RDG(seed) {
-  override def eval(input: Row): Double = rng.nextGaussian()
+  override def eval(input: InternalRow): Double = rng.nextGaussian()
 }
 
 object Randn {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index 5d2d82077f0eb..534dac1f92e89 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -21,10 +21,10 @@ import org.apache.spark.sql.types.{DataType, StructType, AtomicType}
 import org.apache.spark.unsafe.types.UTF8String
 
 /**
- * An extended interface to [[Row]] that allows the values for each column to be updated.  Setting
- * a value through a primitive function implicitly marks that column as not null.
+ * An extended interface to [[InternalRow]] that allows the values for each column to be updated.
+ * Setting a value through a primitive function implicitly marks that column as not null.
  */
-trait MutableRow extends Row {
+trait MutableRow extends InternalRow {
   def setNullAt(i: Int): Unit
 
   def update(ordinal: Int, value: Any)
@@ -37,13 +37,12 @@ trait MutableRow extends Row {
   def setByte(ordinal: Int, value: Byte)
   def setFloat(ordinal: Int, value: Float)
   def setString(ordinal: Int, value: String)
-  // TODO(davies): add setDate() and setDecimal()
 }
 
 /**
  * A row with no data.  Calling any methods will result in an error.  Can be used as a placeholder.
  */
-object EmptyRow extends Row {
+object EmptyRow extends InternalRow {
   override def apply(i: Int): Any = throw new UnsupportedOperationException
   override def toSeq: Seq[Any] = Seq.empty
   override def length: Int = 0
@@ -57,7 +56,7 @@ object EmptyRow extends Row {
   override def getByte(i: Int): Byte = throw new UnsupportedOperationException
   override def getString(i: Int): String = throw new UnsupportedOperationException
   override def getAs[T](i: Int): T = throw new UnsupportedOperationException
-  override def copy(): Row = this
+  override def copy(): InternalRow = this
 }
 
 /**
@@ -65,7 +64,7 @@ object EmptyRow extends Row {
  * the array is not copied, and thus could technically be mutated after creation, this is not
  * allowed.
  */
-class GenericRow(protected[sql] val values: Array[Any]) extends Row {
+class GenericRow(protected[sql] val values: Array[Any]) extends InternalRow {
   /** No-arg constructor for serialization. */
   protected def this() = this(null)
 
@@ -154,7 +153,7 @@ class GenericRow(protected[sql] val values: Array[Any]) extends Row {
   }
 
   override def equals(o: Any): Boolean = o match {
-    case other: Row =>
+    case other: InternalRow =>
       if (values.length != other.length) {
         return false
       }
@@ -174,7 +173,7 @@ class GenericRow(protected[sql] val values: Array[Any]) extends Row {
     case _ => false
   }
 
-  override def copy(): Row = this
+  override def copy(): InternalRow = this
 }
 
 class GenericRowWithSchema(values: Array[Any], override val schema: StructType)
@@ -207,15 +206,15 @@ class GenericMutableRow(v: Array[Any]) extends GenericRow(v) with MutableRow {
 
   override def update(ordinal: Int, value: Any): Unit = { values(ordinal) = value }
 
-  override def copy(): Row = new GenericRow(values.clone())
+  override def copy(): InternalRow = new GenericRow(values.clone())
 }
 
 
-class RowOrdering(ordering: Seq[SortOrder]) extends Ordering[Row] {
+class RowOrdering(ordering: Seq[SortOrder]) extends Ordering[InternalRow] {
   def this(ordering: Seq[SortOrder], inputSchema: Seq[Attribute]) =
     this(ordering.map(BindReferences.bindReference(_, inputSchema)))
 
-  def compare(a: Row, b: Row): Int = {
+  def compare(a: InternalRow, b: InternalRow): Int = {
     var i = 0
     while (i < ordering.size) {
       val order = ordering(i)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
index 2bcb960e9177e..30e41677b774b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
@@ -57,7 +57,7 @@ case class NewSet(elementType: DataType) extends LeafExpression {
 
   override def dataType: OpenHashSetUDT = new OpenHashSetUDT(elementType)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     new OpenHashSet[Any]()
   }
 
@@ -87,7 +87,7 @@ case class AddItemToSet(item: Expression, set: Expression) extends Expression {
 
   override def dataType: OpenHashSetUDT = set.dataType.asInstanceOf[OpenHashSetUDT]
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val itemEval = item.eval(input)
     val setEval = set.eval(input).asInstanceOf[OpenHashSet[Any]]
 
@@ -137,7 +137,7 @@ case class CombineSets(left: Expression, right: Expression) extends BinaryExpres
 
   override def symbol: String = "++="
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val leftEval = left.eval(input).asInstanceOf[OpenHashSet[Any]]
     if(leftEval != null) {
       val rightEval = right.eval(input).asInstanceOf[OpenHashSet[Any]]
@@ -183,7 +183,7 @@ case class CountSet(child: Expression) extends UnaryExpression {
 
   override def dataType: DataType = LongType
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val childEval = child.eval(input).asInstanceOf[OpenHashSet[Any]]
     if (childEval != null) {
       childEval.size.toLong
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 4f4c19526eeb6..8ca8d22bc4697 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -49,7 +49,7 @@ trait StringRegexExpression extends ExpectsInputTypes {
 
   protected def pattern(str: String) = if (cache == null) compile(str) else cache
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val l = left.eval(input)
     if (l == null) {
       null
@@ -121,7 +121,7 @@ trait CaseConversionExpression extends ExpectsInputTypes {
   override def dataType: DataType = StringType
   override def expectedChildTypes: Seq[DataType] = Seq(StringType)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val evaluated = child.eval(input)
     if (evaluated == null) {
       null
@@ -169,7 +169,7 @@ trait StringComparison extends ExpectsInputTypes {
 
   override def expectedChildTypes: Seq[DataType] = Seq(StringType, StringType)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val leftEval = left.eval(input)
     if(leftEval == null) {
       null
@@ -262,7 +262,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
     (start, end)
   }
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val string = str.eval(input)
     val po = pos.eval(input)
     val ln = len.eval(input)
@@ -303,7 +303,7 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
   override def dataType: DataType = IntegerType
   override def expectedChildTypes: Seq[DataType] = Seq(StringType)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     val string = child.eval(input)
     if (string == null) null else string.asInstanceOf[UTF8String].length
   }
@@ -314,5 +314,3 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
     defineCodeGen(ctx, ev, c => s"($c).length()")
   }
 }
-
-
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index 82c4d462cc322..056f170539884 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -74,7 +74,7 @@ case class WindowSpecDefinition(
 
   override def toString: String = simpleString
 
-  override def eval(input: Row): Any = throw new UnsupportedOperationException
+  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
   override def nullable: Boolean = true
   override def foldable: Boolean = false
   override def dataType: DataType = throw new UnsupportedOperationException
@@ -259,7 +259,7 @@ trait WindowFunction extends Expression {
 
   def reset(): Unit
 
-  def prepareInputParameters(input: Row): AnyRef
+  def prepareInputParameters(input: InternalRow): AnyRef
 
   def update(input: AnyRef): Unit
 
@@ -286,7 +286,7 @@ case class UnresolvedWindowFunction(
     throw new UnresolvedException(this, "init")
   override def reset(): Unit =
     throw new UnresolvedException(this, "reset")
-  override def prepareInputParameters(input: Row): AnyRef =
+  override def prepareInputParameters(input: InternalRow): AnyRef =
     throw new UnresolvedException(this, "prepareInputParameters")
   override def update(input: AnyRef): Unit =
     throw new UnresolvedException(this, "update")
@@ -297,7 +297,7 @@ case class UnresolvedWindowFunction(
   override def get(index: Int): Any =
     throw new UnresolvedException(this, "get")
   // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"'$name(${children.mkString(",")})"
@@ -316,7 +316,7 @@ case class UnresolvedWindowExpression(
   override lazy val resolved = false
 
   // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: Row = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
@@ -327,7 +327,7 @@ case class WindowExpression(
   override def children: Seq[Expression] =
     windowFunction :: windowSpec :: Nil
 
-  override def eval(input: Row): Any =
+  override def eval(input: InternalRow): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def dataType: DataType = windowFunction.dataType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
index e3e070f0ff307..2c946cd12f8d8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.sql.catalyst.plans.logical
 
-import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.{CatalystTypeConverters, analysis}
 import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.types.{StructType, StructField}
+import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters, analysis}
+import org.apache.spark.sql.types.{StructField, StructType}
 
 object LocalRelation {
   def apply(output: Attribute*): LocalRelation = new LocalRelation(output)
@@ -32,11 +31,11 @@ object LocalRelation {
   def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
     val schema = StructType.fromAttributes(output)
     val converter = CatalystTypeConverters.createToCatalystConverter(schema)
-    LocalRelation(output, data.map(converter(_).asInstanceOf[Row]))
+    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
   }
 }
 
-case class LocalRelation(output: Seq[Attribute], data: Seq[Row] = Nil)
+case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
   extends LeafNode with analysis.MultiInstanceRelation {
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
index 80ba57a082a60..42dead7c28425 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -17,8 +17,9 @@
 
 package org.apache.spark.sql.catalyst.plans.physical
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.expressions.{Expression, Row, SortOrder}
+import org.apache.spark.sql.catalyst.expressions.{Expression, SortOrder}
 import org.apache.spark.sql.types.{DataType, IntegerType}
 
 /**
@@ -169,7 +170,7 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
 
   override def keyExpressions: Seq[Expression] = expressions
 
-  override def eval(input: Row = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
@@ -213,6 +214,6 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
 
   override def keyExpressions: Seq[Expression] = ordering.map(_.child)
 
-  override def eval(input: Row): Any =
+  override def eval(input: InternalRow): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
index 9a24b23024e18..b4d5e013f3582 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
@@ -21,7 +21,7 @@ import java.math.BigInteger
 import java.sql.{Date, Timestamp}
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
 
 case class PrimitiveData(
@@ -257,7 +257,7 @@ class ScalaReflectionSuite extends SparkFunSuite {
 
   test("convert PrimitiveData to catalyst") {
     val data = PrimitiveData(1, 1, 1, 1, 1, 1, true)
-    val convertedData = Row(1, 1.toLong, 1.toDouble, 1.toFloat, 1.toShort, 1.toByte, true)
+    val convertedData = InternalRow(1, 1.toLong, 1.toDouble, 1.toFloat, 1.toShort, 1.toByte, true)
     val dataType = schemaFor[PrimitiveData].dataType
     assert(CatalystTypeConverters.convertToCatalyst(data, dataType) === convertedData)
   }
@@ -267,8 +267,8 @@ class ScalaReflectionSuite extends SparkFunSuite {
     val data = OptionalData(Some(2), Some(2), Some(2), Some(2), Some(2), Some(2), Some(true),
       Some(primitiveData))
     val dataType = schemaFor[OptionalData].dataType
-    val convertedData = Row(2, 2.toLong, 2.toDouble, 2.toFloat, 2.toShort, 2.toByte, true,
-      Row(1, 1, 1, 1, 1, 1, true))
+    val convertedData = InternalRow(2, 2.toLong, 2.toDouble, 2.toFloat, 2.toShort, 2.toByte, true,
+      InternalRow(1, 1, 1, 1, 1, 1, true))
     assert(CatalystTypeConverters.convertToCatalyst(data, dataType) === convertedData)
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 969c6cc15fdee..e407f6f166e86 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -437,14 +437,14 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("cast from struct") {
     val struct = Literal.create(
-      Row("123", "abc", "", null),
+      InternalRow("123", "abc", "", null),
       StructType(Seq(
         StructField("a", StringType, nullable = true),
         StructField("b", StringType, nullable = true),
         StructField("c", StringType, nullable = true),
         StructField("d", StringType, nullable = true))))
     val struct_notNull = Literal.create(
-      Row("123", "abc", ""),
+      InternalRow("123", "abc", ""),
       StructType(Seq(
         StructField("a", StringType, nullable = false),
         StructField("b", StringType, nullable = false),
@@ -457,7 +457,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
         StructField("c", IntegerType, nullable = true),
         StructField("d", IntegerType, nullable = true))))
       assert(ret.resolved === true)
-      checkEvaluation(ret, Row(123, null, null, null))
+      checkEvaluation(ret, InternalRow(123, null, null, null))
     }
     {
       val ret = cast(struct, StructType(Seq(
@@ -474,7 +474,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
         StructField("c", BooleanType, nullable = true),
         StructField("d", BooleanType, nullable = true))))
       assert(ret.resolved === true)
-      checkEvaluation(ret, Row(true, true, false, null))
+      checkEvaluation(ret, InternalRow(true, true, false, null))
     }
     {
       val ret = cast(struct, StructType(Seq(
@@ -491,7 +491,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
         StructField("b", IntegerType, nullable = true),
         StructField("c", IntegerType, nullable = true))))
       assert(ret.resolved === true)
-      checkEvaluation(ret, Row(123, null, null))
+      checkEvaluation(ret, InternalRow(123, null, null))
     }
     {
       val ret = cast(struct_notNull, StructType(Seq(
@@ -506,7 +506,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
         StructField("b", BooleanType, nullable = true),
         StructField("c", BooleanType, nullable = true))))
       assert(ret.resolved === true)
-      checkEvaluation(ret, Row(true, true, false))
+      checkEvaluation(ret, InternalRow(true, true, false))
     }
     {
       val ret = cast(struct_notNull, StructType(Seq(
@@ -514,7 +514,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
         StructField("b", BooleanType, nullable = true),
         StructField("c", BooleanType, nullable = false))))
       assert(ret.resolved === true)
-      checkEvaluation(ret, Row(true, true, false))
+      checkEvaluation(ret, InternalRow(true, true, false))
     }
 
     {
@@ -532,10 +532,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("complex casting") {
     val complex = Literal.create(
-      Row(
+      InternalRow(
         Seq("123", "abc", ""),
         Map("a" -> "123", "b" -> "abc", "c" -> ""),
-        Row(0)),
+        InternalRow(0)),
       StructType(Seq(
         StructField("a",
           ArrayType(StringType, containsNull = false), nullable = true),
@@ -555,10 +555,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
           StructField("l", LongType, nullable = true)))))))
 
     assert(ret.resolved === true)
-    checkEvaluation(ret, Row(
+    checkEvaluation(ret, InternalRow(
       Seq(123, null, null),
       Map("a" -> true, "b" -> true, "c" -> false),
-      Row(0L)))
+      InternalRow(0L)))
   }
 
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index bcc594cb7c193..2b0f4618b21e0 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -27,10 +27,10 @@ import org.apache.spark.unsafe.types.UTF8String
 class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("CreateStruct") {
-    val row = Row(1, 2, 3)
+    val row = InternalRow(1, 2, 3)
     val c1 = 'a.int.at(0).as("a")
     val c3 = 'c.int.at(2).as("c")
-    checkEvaluation(CreateStruct(Seq(c1, c3)), Row(1, 3), row)
+    checkEvaluation(CreateStruct(Seq(c1, c3)), InternalRow(1, 3), row)
   }
 
   test("complex type") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 4a241d3603570..12d2da8b33986 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -32,26 +32,26 @@ import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
 trait ExpressionEvalHelper {
   self: SparkFunSuite =>
 
-  protected def create_row(values: Any*): Row = {
+  protected def create_row(values: Any*): InternalRow = {
     new GenericRow(values.map(CatalystTypeConverters.convertToCatalyst).toArray)
   }
 
   protected def checkEvaluation(
-      expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
+      expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = {
     checkEvaluationWithoutCodegen(expression, expected, inputRow)
     checkEvaluationWithGeneratedMutableProjection(expression, expected, inputRow)
     checkEvaluationWithGeneratedProjection(expression, expected, inputRow)
     checkEvaluationWithOptimization(expression, expected, inputRow)
   }
 
-  protected def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = {
+  protected def evaluate(expression: Expression, inputRow: InternalRow = EmptyRow): Any = {
     expression.eval(inputRow)
   }
 
   protected def checkEvaluationWithoutCodegen(
       expression: Expression,
       expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
+      inputRow: InternalRow = EmptyRow): Unit = {
     val actual = try evaluate(expression, inputRow) catch {
       case e: Exception => fail(s"Exception evaluating $expression", e)
     }
@@ -66,7 +66,7 @@ trait ExpressionEvalHelper {
   protected def checkEvaluationWithGeneratedMutableProjection(
       expression: Expression,
       expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
+      inputRow: InternalRow = EmptyRow): Unit = {
 
     val plan = try {
       GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
@@ -92,7 +92,7 @@ trait ExpressionEvalHelper {
   protected def checkEvaluationWithGeneratedProjection(
       expression: Expression,
       expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
+      inputRow: InternalRow = EmptyRow): Unit = {
     val ctx = GenerateProjection.newCodeGenContext()
     lazy val evaluated = expression.gen(ctx)
 
@@ -128,7 +128,7 @@ trait ExpressionEvalHelper {
   protected def checkEvaluationWithOptimization(
       expression: Expression,
       expected: Any,
-      inputRow: Row = EmptyRow): Unit = {
+      inputRow: InternalRow = EmptyRow): Unit = {
     val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation)
     val optimizedPlan = DefaultOptimizer.execute(plan)
     checkEvaluationWithoutCodegen(optimizedPlan.expressions.head, expected, inputRow)
@@ -137,7 +137,7 @@ trait ExpressionEvalHelper {
   protected def checkDoubleEvaluation(
       expression: Expression,
       expected: Spread[Double],
-      inputRow: Row = EmptyRow): Unit = {
+      inputRow: InternalRow = EmptyRow): Unit = {
     val actual = try evaluate(expression, inputRow) catch {
       case e: Exception => fail(s"Exception evaluating $expression", e)
     }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
index 72bbc4efeb8ef..7aae2bbd8a0b8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
@@ -37,7 +37,7 @@ class UnsafeFixedWidthAggregationMapSuite
 
   private val groupKeySchema = StructType(StructField("product", StringType) :: Nil)
   private val aggBufferSchema = StructType(StructField("salePrice", IntegerType) :: Nil)
-  private def emptyAggregationBuffer: Row = new GenericRow(Array[Any](0))
+  private def emptyAggregationBuffer: InternalRow = new GenericRow(Array[Any](0))
 
   private var memoryManager: TaskMemoryManager = null
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index 61722f1ffa462..577c7a0de0160 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -86,7 +86,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       DoubleType)
     val converter = new UnsafeRowConverter(fieldTypes)
 
-    val rowWithAllNullColumns: Row = {
+    val rowWithAllNullColumns: InternalRow = {
       val r = new SpecificMutableRow(fieldTypes)
       for (i <- 0 to fieldTypes.length - 1) {
         r.setNullAt(i)
@@ -117,7 +117,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     // If we have an UnsafeRow with columns that are initially non-null and we null out those
     // columns, then the serialized row representation should be identical to what we would get by
     // creating an entirely null row via the converter
-    val rowWithNoNullColumns: Row = {
+    val rowWithNoNullColumns: InternalRow = {
       val r = new SpecificMutableRow(fieldTypes)
       r.setNullAt(0)
       r.setBoolean(1, false)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala
index 6841bd9890c97..54e8c6462e962 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
-import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
@@ -37,13 +37,11 @@ class ConvertToLocalRelationSuite extends PlanTest {
   test("Project on LocalRelation should be turned into a single LocalRelation") {
     val testRelation = LocalRelation(
       LocalRelation('a.int, 'b.int).output,
-      Row(1, 2) ::
-      Row(4, 5) :: Nil)
+      InternalRow(1, 2) :: InternalRow(4, 5) :: Nil)
 
     val correctAnswer = LocalRelation(
       LocalRelation('a1.int, 'b1.int).output,
-      Row(1, 3) ::
-      Row(4, 6) :: Nil)
+      InternalRow(1, 3) :: InternalRow(4, 6) :: Nil)
 
     val projectOnLocal = testRelation.select(
       UnresolvedAttribute("a").as("a1"),
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
index 8ec79c3d4d28d..bda217935cb05 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
@@ -28,7 +28,7 @@ case class Dummy(optKey: Option[Expression]) extends Expression {
   override def nullable: Boolean = true
   override def dataType: NullType = NullType
   override lazy val resolved = true
-  override def eval(input: Row): Any = null.asInstanceOf[Any]
+  override def eval(input: InternalRow): Any = null.asInstanceOf[Any]
 }
 
 case class ComplexPlan(exprs: Seq[Seq[Expression]])
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateUtilsSuite.scala
index a4245545ffc1d..4d8fe4ac5e78f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateUtilsSuite.scala
@@ -21,7 +21,6 @@ import java.sql.Timestamp
 
 import org.apache.spark.SparkFunSuite
 
-
 class DateUtilsSuite extends SparkFunSuite {
 
   test("timestamp") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index f041fd397b04b..f1acdfeea5793 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -20,7 +20,6 @@ package org.apache.spark.sql
 import java.io.CharArrayWriter
 import java.util.Properties
 
-import scala.collection.JavaConversions._
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
@@ -33,7 +32,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.SerDeUtil
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.analysis.{MultiAlias, ResolvedStar, UnresolvedAttribute, UnresolvedRelation}
+import org.apache.spark.sql.catalyst.analysis.{MultiAlias, ResolvedStar, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
@@ -1032,7 +1031,8 @@ class DataFrame private[sql](
     val names = schema.toAttributes.map(_.name)
 
     val rowFunction =
-      f.andThen(_.map(CatalystTypeConverters.convertToCatalyst(_, schema).asInstanceOf[Row]))
+      f.andThen(_.map(CatalystTypeConverters.convertToCatalyst(_, schema)
+        .asInstanceOf[InternalRow]))
     val generator = UserDefinedGenerator(elementTypes, rowFunction, input.map(_.expr))
 
     Generate(generator, join = true, outer = false,
@@ -1058,8 +1058,9 @@ class DataFrame private[sql](
     val elementTypes = attributes.map { attr => (attr.dataType, attr.nullable) }
     val names = attributes.map(_.name)
 
-    def rowFunction(row: Row): TraversableOnce[Row] = {
-      f(row(0).asInstanceOf[A]).map(o => Row(CatalystTypeConverters.convertToCatalyst(o, dataType)))
+    def rowFunction(row: Row): TraversableOnce[InternalRow] = {
+      f(row(0).asInstanceOf[A]).map(o =>
+        catalyst.InternalRow(CatalystTypeConverters.convertToCatalyst(o, dataType)))
     }
     val generator = UserDefinedGenerator(elementTypes, rowFunction, apply(inputColumn).expr :: Nil)
 
@@ -1221,7 +1222,7 @@ class DataFrame private[sql](
 
     val outputCols = (if (cols.isEmpty) numericColumns.map(_.prettyString) else cols).toList
 
-    val ret: Seq[Row] = if (outputCols.nonEmpty) {
+    val ret: Seq[InternalRow] = if (outputCols.nonEmpty) {
       val aggExprs = statistics.flatMap { case (_, colToAgg) =>
         outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c))
       }
@@ -1230,11 +1231,12 @@ class DataFrame private[sql](
 
       // Pivot the data so each summary is one row
       row.grouped(outputCols.size).toSeq.zip(statistics).map {
-        case (aggregation, (statistic, _)) => Row(statistic :: aggregation.toList: _*)
+        case (aggregation, (statistic, _)) =>
+          catalyst.InternalRow(statistic :: aggregation.toList: _*)
       }
     } else {
       // If there are no output columns, just output a single column that contains the stats.
-      statistics.map { case (name, _) => Row(name) }
+      statistics.map { case (name, _) => catalyst.InternalRow(name) }
     }
 
     // All columns are string type
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index b44d4c86ac5d3..1828ed1aab50b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -245,7 +245,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
         JsonRDD.nullTypeToStringType(
           JsonRDD.inferSchema(jsonRDD, 1.0, columnNameOfCorruptJsonRecord)))
       val rowRDD = JsonRDD.jsonStringToRow(jsonRDD, appliedSchema, columnNameOfCorruptJsonRecord)
-      sqlContext.createDataFrame(rowRDD, appliedSchema, needsConversion = false)
+      sqlContext.internalCreateDataFrame(rowRDD, appliedSchema)
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 5f758adf3dfc6..22d0e50e4ef6f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -31,7 +31,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst._
+import org.apache.spark.sql.catalyst.{InternalRow, _}
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.errors.DialectException
@@ -486,14 +486,26 @@ class SQLContext(@transient val sparkContext: SparkContext)
     // schema differs from the existing schema on any field data type.
     val catalystRows = if (needsConversion) {
       val converter = CatalystTypeConverters.createToCatalystConverter(schema)
-      rowRDD.map(converter(_).asInstanceOf[Row])
+      rowRDD.map(converter(_).asInstanceOf[InternalRow])
     } else {
-      rowRDD
+      rowRDD.map{r: Row => InternalRow.fromSeq(r.toSeq)}
     }
     val logicalPlan = LogicalRDD(schema.toAttributes, catalystRows)(self)
     DataFrame(this, logicalPlan)
   }
 
+  /**
+   * Creates a DataFrame from an RDD[Row]. User can specify whether the input rows should be
+   * converted to Catalyst rows.
+   */
+  private[sql]
+  def internalCreateDataFrame(catalystRows: RDD[InternalRow], schema: StructType) = {
+    // TODO: use MutableProjection when rowRDD is another DataFrame and the applied
+    // schema differs from the existing schema on any field data type.
+    val logicalPlan = LogicalRDD(schema.toAttributes, catalystRows)(self)
+    DataFrame(this, logicalPlan)
+  }
+
   /**
    * :: DeveloperApi ::
    * Creates a [[DataFrame]] from an [[JavaRDD]] containing [[Row]]s using the given schema.
@@ -531,7 +543,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
           extractors.zip(attributeSeq).map { case (e, attr) =>
             CatalystTypeConverters.convertToCatalyst(e.invoke(row), attr.dataType)
           }.toArray[Any]
-        ) : Row
+        ) : InternalRow
       }
     }
     DataFrame(this, LogicalRDD(attributeSeq, rowRdd)(this))
@@ -886,7 +898,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
   protected[sql] val planner = new SparkPlanner
 
   @transient
-  protected[sql] lazy val emptyResult = sparkContext.parallelize(Seq.empty[Row], 1)
+  protected[sql] lazy val emptyResult = sparkContext.parallelize(Seq.empty[InternalRow], 1)
 
   /**
    * Prepares a planned SparkPlan for execution by inserting shuffle operations as needed.
@@ -953,7 +965,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
     lazy val executedPlan: SparkPlan = prepareForExecution.execute(sparkPlan)
 
     /** Internal version of the RDD. Avoids copies and has no schema */
-    lazy val toRdd: RDD[Row] = executedPlan.execute()
+    lazy val toRdd: RDD[InternalRow] = executedPlan.execute()
 
     protected def stringOrError[A](f: => A): String =
       try f.toString catch { case e: Throwable => e.toString }
@@ -1035,7 +1047,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
     }
 
     val rowRdd = convertedRdd.mapPartitions { iter =>
-      iter.map { m => new GenericRow(m): Row}
+      iter.map { m => new GenericRow(m): InternalRow}
     }
 
     DataFrame(this, LogicalRDD(schema.toAttributes, rowRdd)(self))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
index aa10af400c815..cc7506dec1ee8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
@@ -18,8 +18,7 @@
 package org.apache.spark.sql.columnar
 
 import java.nio.{ByteBuffer, ByteOrder}
-
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.columnar.ColumnBuilder._
 import org.apache.spark.sql.columnar.compression.{AllCompressionSchemes, CompressibleColumnBuilder}
 import org.apache.spark.sql.types._
@@ -33,7 +32,7 @@ private[sql] trait ColumnBuilder {
   /**
    * Appends `row(ordinal)` to the column builder.
    */
-  def appendFrom(row: Row, ordinal: Int)
+  def appendFrom(row: InternalRow, ordinal: Int)
 
   /**
    * Column statistics information
@@ -68,7 +67,7 @@ private[sql] class BasicColumnBuilder[T <: DataType, JvmType](
     buffer.order(ByteOrder.nativeOrder()).putInt(columnType.typeId)
   }
 
-  override def appendFrom(row: Row, ordinal: Int): Unit = {
+  override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
     buffer = ensureFreeSpace(buffer, columnType.actualSize(row, ordinal))
     columnType.append(row, ordinal, buffer)
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
index 11c79c865f11a..1bce214d1d6c3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.columnar
 
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -53,7 +53,7 @@ private[sql] sealed trait ColumnStats extends Serializable {
   /**
    * Gathers statistics information from `row(ordinal)`.
    */
-  def gatherStats(row: Row, ordinal: Int): Unit = {
+  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     if (row.isNullAt(ordinal)) {
       nullCount += 1
       // 4 bytes for null position
@@ -66,23 +66,23 @@ private[sql] sealed trait ColumnStats extends Serializable {
    * Column statistics represented as a single row, currently including closed lower bound, closed
    * upper bound and null count.
    */
-  def collectedStatistics: Row
+  def collectedStatistics: InternalRow
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
 private[sql] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: Row, ordinal: Int): Unit = super.gatherStats(row, ordinal)
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
 
-  override def collectedStatistics: Row = Row(null, null, nullCount, count, 0L)
+  override def collectedStatistics: InternalRow = InternalRow(null, null, nullCount, count, 0L)
 }
 
 private[sql] class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row.getBoolean(ordinal)
@@ -92,14 +92,15 @@ private[sql] class BooleanColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row.getByte(ordinal)
@@ -109,14 +110,15 @@ private[sql] class ByteColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row.getShort(ordinal)
@@ -126,14 +128,15 @@ private[sql] class ShortColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class LongColumnStats extends ColumnStats {
   protected var upper = Long.MinValue
   protected var lower = Long.MaxValue
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row.getLong(ordinal)
@@ -143,14 +146,15 @@ private[sql] class LongColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class DoubleColumnStats extends ColumnStats {
   protected var upper = Double.MinValue
   protected var lower = Double.MaxValue
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row.getDouble(ordinal)
@@ -160,14 +164,15 @@ private[sql] class DoubleColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class FloatColumnStats extends ColumnStats {
   protected var upper = Float.MinValue
   protected var lower = Float.MaxValue
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row.getFloat(ordinal)
@@ -177,14 +182,15 @@ private[sql] class FloatColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class FixedDecimalColumnStats extends ColumnStats {
   protected var upper: Decimal = null
   protected var lower: Decimal = null
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row(ordinal).asInstanceOf[Decimal]
@@ -194,14 +200,15 @@ private[sql] class FixedDecimalColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class IntColumnStats extends ColumnStats {
   protected var upper = Int.MinValue
   protected var lower = Int.MaxValue
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row.getInt(ordinal)
@@ -211,14 +218,15 @@ private[sql] class IntColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class StringColumnStats extends ColumnStats {
   protected var upper: UTF8String = null
   protected var lower: UTF8String = null
 
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       val value = row(ordinal).asInstanceOf[UTF8String]
@@ -228,7 +236,8 @@ private[sql] class StringColumnStats extends ColumnStats {
     }
   }
 
-  override def collectedStatistics: Row = Row(lower, upper, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class DateColumnStats extends IntColumnStats
@@ -236,23 +245,25 @@ private[sql] class DateColumnStats extends IntColumnStats
 private[sql] class TimestampColumnStats extends LongColumnStats
 
 private[sql] class BinaryColumnStats extends ColumnStats {
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       sizeInBytes += BINARY.actualSize(row, ordinal)
     }
   }
 
-  override def collectedStatistics: Row = Row(null, null, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(null, null, nullCount, count, sizeInBytes)
 }
 
 private[sql] class GenericColumnStats extends ColumnStats {
-  override def gatherStats(row: Row, ordinal: Int): Unit = {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       sizeInBytes += GENERIC.actualSize(row, ordinal)
     }
   }
 
-  override def collectedStatistics: Row = Row(null, null, nullCount, count, sizeInBytes)
+  override def collectedStatistics: InternalRow =
+    InternalRow(null, null, nullCount, count, sizeInBytes)
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
index 3db26fad2b92f..761f427b8cd0d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
@@ -19,21 +19,16 @@ package org.apache.spark.sql.columnar
 
 import java.nio.ByteBuffer
 
-import org.apache.spark.{Accumulable, Accumulator, Accumulators}
-import org.apache.spark.sql.catalyst.expressions
-
 import scala.collection.mutable.ArrayBuffer
-import scala.collection.mutable.HashMap
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.Row
-import org.apache.spark.SparkContext
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
 import org.apache.spark.sql.execution.{LeafNode, SparkPlan}
 import org.apache.spark.storage.StorageLevel
+import org.apache.spark.{Accumulable, Accumulator, Accumulators}
 
 private[sql] object InMemoryRelation {
   def apply(
@@ -45,7 +40,7 @@ private[sql] object InMemoryRelation {
     new InMemoryRelation(child.output, useCompression, batchSize, storageLevel, child, tableName)()
 }
 
-private[sql] case class CachedBatch(buffers: Array[Array[Byte]], stats: Row)
+private[sql] case class CachedBatch(buffers: Array[Array[Byte]], stats: InternalRow)
 
 private[sql] case class InMemoryRelation(
     output: Seq[Attribute],
@@ -56,12 +51,12 @@ private[sql] case class InMemoryRelation(
     tableName: Option[String])(
     private var _cachedColumnBuffers: RDD[CachedBatch] = null,
     private var _statistics: Statistics = null,
-    private var _batchStats: Accumulable[ArrayBuffer[Row], Row] = null)
+    private var _batchStats: Accumulable[ArrayBuffer[InternalRow], InternalRow] = null)
   extends LogicalPlan with MultiInstanceRelation {
 
-  private val batchStats: Accumulable[ArrayBuffer[Row], Row] =
+  private val batchStats: Accumulable[ArrayBuffer[InternalRow], InternalRow] =
     if (_batchStats == null) {
-      child.sqlContext.sparkContext.accumulableCollection(ArrayBuffer.empty[Row])
+      child.sqlContext.sparkContext.accumulableCollection(ArrayBuffer.empty[InternalRow])
     } else {
       _batchStats
     }
@@ -151,7 +146,7 @@ private[sql] case class InMemoryRelation(
             rowCount += 1
           }
 
-          val stats = Row.merge(columnBuilders.map(_.columnStats.collectedStatistics) : _*)
+          val stats = InternalRow.merge(columnBuilders.map(_.columnStats.collectedStatistics) : _*)
 
           batchStats += stats
           CachedBatch(columnBuilders.map(_.build().array()), stats)
@@ -267,7 +262,7 @@ private[sql] case class InMemoryColumnarTableScan(
 
   private val inMemoryPartitionPruningEnabled = sqlContext.conf.inMemoryPartitionPruning
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     if (enableAccumulators) {
       readPartitions.setValue(0)
       readBatches.setValue(0)
@@ -296,7 +291,7 @@ private[sql] case class InMemoryColumnarTableScan(
 
       val nextRow = new SpecificMutableRow(requestedColumnDataTypes)
 
-      def cachedBatchesToRows(cacheBatches: Iterator[CachedBatch]): Iterator[Row] = {
+      def cachedBatchesToRows(cacheBatches: Iterator[CachedBatch]): Iterator[InternalRow] = {
         val rows = cacheBatches.flatMap { cachedBatch =>
           // Build column accessors
           val columnAccessors = requestedColumnIndices.map { batchColumnIndex =>
@@ -306,15 +301,15 @@ private[sql] case class InMemoryColumnarTableScan(
           }
 
           // Extract rows via column accessors
-          new Iterator[Row] {
+          new Iterator[InternalRow] {
             private[this] val rowLen = nextRow.length
-            override def next(): Row = {
+            override def next(): InternalRow = {
               var i = 0
               while (i < rowLen) {
                 columnAccessors(i).extractTo(nextRow, i)
                 i += 1
               }
-              if (attributes.isEmpty) Row.empty else nextRow
+              if (attributes.isEmpty) InternalRow.empty else nextRow
             }
 
             override def hasNext: Boolean = columnAccessors(0).hasNext
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/NullableColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/NullableColumnBuilder.scala
index f1f494ac26d0c..ba47bc783f31e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/NullableColumnBuilder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/NullableColumnBuilder.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.columnar
 
 import java.nio.{ByteBuffer, ByteOrder}
 
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 
 /**
  * A stackable trait used for building byte buffer for a column containing null values.  Memory
@@ -52,7 +52,7 @@ private[sql] trait NullableColumnBuilder extends ColumnBuilder {
     super.initialize(initialSize, columnName, useCompression)
   }
 
-  abstract override def appendFrom(row: Row, ordinal: Int): Unit = {
+  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
     columnStats.gatherStats(row, ordinal)
     if (row.isNullAt(ordinal)) {
       nulls = ColumnBuilder.ensureFreeSpace(nulls, 4)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressibleColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressibleColumnBuilder.scala
index 8e2a1af6dae78..39b21ddb47ba4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressibleColumnBuilder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressibleColumnBuilder.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.columnar.compression
 import java.nio.{ByteBuffer, ByteOrder}
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.columnar.{ColumnBuilder, NativeColumnBuilder}
 import org.apache.spark.sql.types.AtomicType
 
@@ -66,7 +66,7 @@ private[sql] trait CompressibleColumnBuilder[T <: AtomicType]
     encoder.compressionRatio < 0.8
   }
 
-  private def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {
+  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
     var i = 0
     while (i < compressionEncoders.length) {
       compressionEncoders(i).gatherCompressibilityStats(row, ordinal)
@@ -74,7 +74,7 @@ private[sql] trait CompressibleColumnBuilder[T <: AtomicType]
     }
   }
 
-  abstract override def appendFrom(row: Row, ordinal: Int): Unit = {
+  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
     super.appendFrom(row, ordinal)
     if (!row.isNullAt(ordinal)) {
       gatherCompressibilityStats(row, ordinal)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressionScheme.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressionScheme.scala
index 17c2d9b111188..4eaec6d853d4d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressionScheme.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressionScheme.scala
@@ -18,14 +18,13 @@
 package org.apache.spark.sql.columnar.compression
 
 import java.nio.{ByteBuffer, ByteOrder}
-
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.MutableRow
 import org.apache.spark.sql.columnar.{ColumnType, NativeColumnType}
 import org.apache.spark.sql.types.AtomicType
 
 private[sql] trait Encoder[T <: AtomicType] {
-  def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {}
+  def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {}
 
   def compressedSize: Int
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
index 534ae90ddbc8b..5abc1259a19ab 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
@@ -22,8 +22,7 @@ import java.nio.ByteBuffer
 import scala.collection.mutable
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.runtimeMirror
-
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{MutableRow, SpecificMutableRow}
 import org.apache.spark.sql.columnar._
 import org.apache.spark.sql.types._
@@ -96,7 +95,7 @@ private[sql] case object RunLengthEncoding extends CompressionScheme {
 
     override def compressedSize: Int = _compressedSize
 
-    override def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {
+    override def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
       val value = columnType.getField(row, ordinal)
       val actualSize = columnType.actualSize(row, ordinal)
       _uncompressedSize += actualSize
@@ -217,7 +216,7 @@ private[sql] case object DictionaryEncoding extends CompressionScheme {
     // to store dictionary element count.
     private var dictionarySize = 4
 
-    override def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {
+    override def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
       val value = columnType.getField(row, ordinal)
 
       if (!overflow) {
@@ -310,7 +309,7 @@ private[sql] case object BooleanBitSet extends CompressionScheme {
   class Encoder extends compression.Encoder[BooleanType.type] {
     private var _uncompressedSize = 0
 
-    override def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {
+    override def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
       _uncompressedSize += BOOLEAN.defaultSize
     }
 
@@ -404,7 +403,7 @@ private[sql] case object IntDelta extends CompressionScheme {
 
     private var prevValue: Int = _
 
-    override def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {
+    override def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
       val value = row.getInt(ordinal)
       val delta = value - prevValue
 
@@ -484,7 +483,7 @@ private[sql] case object LongDelta extends CompressionScheme {
 
     private var prevValue: Long = _
 
-    override def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {
+    override def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
       val value = row.getLong(ordinal)
       val delta = value - prevValue
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
index 8d16749697aa2..6e8a5ef18ab62 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
@@ -20,12 +20,10 @@ package org.apache.spark.sql.execution
 import java.util.HashMap
 
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
-import org.apache.spark.sql.SQLContext
 
 /**
  * :: DeveloperApi ::
@@ -121,11 +119,11 @@ case class Aggregate(
     }
   }
 
-  protected override def doExecute(): RDD[Row] = attachTree(this, "execute") {
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
     if (groupingExpressions.isEmpty) {
       child.execute().mapPartitions { iter =>
         val buffer = newAggregateBuffer()
-        var currentRow: Row = null
+        var currentRow: InternalRow = null
         while (iter.hasNext) {
           currentRow = iter.next()
           var i = 0
@@ -147,10 +145,10 @@ case class Aggregate(
       }
     } else {
       child.execute().mapPartitions { iter =>
-        val hashTable = new HashMap[Row, Array[AggregateFunction]]
+        val hashTable = new HashMap[InternalRow, Array[AggregateFunction]]
         val groupingProjection = new InterpretedMutableProjection(groupingExpressions, child.output)
 
-        var currentRow: Row = null
+        var currentRow: InternalRow = null
         while (iter.hasNext) {
           currentRow = iter.next()
           val currentGroup = groupingProjection(currentRow)
@@ -167,7 +165,7 @@ case class Aggregate(
           }
         }
 
-        new Iterator[Row] {
+        new Iterator[InternalRow] {
           private[this] val hashTableIter = hashTable.entrySet().iterator()
           private[this] val aggregateResults = new GenericMutableRow(computedAggregates.length)
           private[this] val resultProjection =
@@ -177,7 +175,7 @@ case class Aggregate(
 
           override final def hasNext: Boolean = hashTableIter.hasNext
 
-          override final def next(): Row = {
+          override final def next(): InternalRow = {
             val currentEntry = hashTableIter.next()
             val currentGroup = currentEntry.getKey
             val currentBuffer = currentEntry.getValue
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index 6fa7ccc6cc89b..c9a188309a4d8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -17,19 +17,19 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkEnv}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.{RDD, ShuffledRDD}
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.shuffle.unsafe.UnsafeShuffleManager
+import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.catalyst.errors.attachTree
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types.DataType
-import org.apache.spark.sql.{SQLContext, Row}
 import org.apache.spark.util.MutablePair
+import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkEnv}
 
 /**
  * :: DeveloperApi ::
@@ -157,7 +157,7 @@ case class Exchange(
     serializer
   }
 
-  protected override def doExecute(): RDD[Row] = attachTree(this , "execute") {
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this , "execute") {
     newPartitioning match {
       case HashPartitioning(expressions, numPartitions) =>
         val keySchema = expressions.map(_.dataType).toArray
@@ -173,11 +173,11 @@ case class Exchange(
         } else {
           child.execute().mapPartitions { iter =>
             val hashExpressions = newMutableProjection(expressions, child.output)()
-            val mutablePair = new MutablePair[Row, Row]()
+            val mutablePair = new MutablePair[InternalRow, InternalRow]()
             iter.map(r => mutablePair.update(hashExpressions(r), r))
           }
         }
-        val shuffled = new ShuffledRDD[Row, Row, Row](rdd, part)
+        val shuffled = new ShuffledRDD[InternalRow, InternalRow, InternalRow](rdd, part)
         shuffled.setSerializer(serializer)
         shuffled.map(_._2)
 
@@ -190,7 +190,7 @@ case class Exchange(
           // Internally, RangePartitioner runs a job on the RDD that samples keys to compute
           // partition bounds. To get accurate samples, we need to copy the mutable keys.
           val rddForSampling = childRdd.mapPartitions { iter =>
-            val mutablePair = new MutablePair[Row, Null]()
+            val mutablePair = new MutablePair[InternalRow, Null]()
             iter.map(row => mutablePair.update(row.copy(), null))
           }
           // TODO: RangePartitioner should take an Ordering.
@@ -202,12 +202,12 @@ case class Exchange(
           childRdd.mapPartitions { iter => iter.map(row => (row.copy(), null))}
         } else {
           childRdd.mapPartitions { iter =>
-            val mutablePair = new MutablePair[Row, Null]()
+            val mutablePair = new MutablePair[InternalRow, Null]()
             iter.map(row => mutablePair.update(row, null))
           }
         }
 
-        val shuffled = new ShuffledRDD[Row, Null, Null](rdd, part)
+        val shuffled = new ShuffledRDD[InternalRow, Null, Null](rdd, part)
         shuffled.setSerializer(serializer)
         shuffled.map(_._1)
 
@@ -217,14 +217,16 @@ case class Exchange(
         val partitioner = new HashPartitioner(1)
 
         val rdd = if (needToCopyObjectsBeforeShuffle(partitioner, serializer)) {
-          child.execute().mapPartitions { iter => iter.map(r => (null, r.copy())) }
+          child.execute().mapPartitions {
+            iter => iter.map(r => (null, r.copy()))
+          }
         } else {
           child.execute().mapPartitions { iter =>
-            val mutablePair = new MutablePair[Null, Row]()
+            val mutablePair = new MutablePair[Null, InternalRow]()
             iter.map(r => mutablePair.update(null, r))
           }
         }
-        val shuffled = new ShuffledRDD[Null, Row, Row](rdd, partitioner)
+        val shuffled = new ShuffledRDD[Null, InternalRow, InternalRow](rdd, partitioner)
         shuffled.setSerializer(serializer)
         shuffled.map(_._2)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
index f931dc95ef575..da27a753a710f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
 import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
@@ -31,7 +31,7 @@ import org.apache.spark.sql.{Row, SQLContext}
  */
 @DeveloperApi
 object RDDConversions {
-  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[Row] = {
+  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
     data.mapPartitions { iterator =>
       val numColumns = outputTypes.length
       val mutableRow = new GenericMutableRow(numColumns)
@@ -51,7 +51,7 @@ object RDDConversions {
   /**
    * Convert the objects inside Row into the types Catalyst expected.
    */
-  def rowToRowRdd(data: RDD[Row], outputTypes: Seq[DataType]): RDD[Row] = {
+  def rowToRowRdd(data: RDD[Row], outputTypes: Seq[DataType]): RDD[InternalRow] = {
     data.mapPartitions { iterator =>
       val numColumns = outputTypes.length
       val mutableRow = new GenericMutableRow(numColumns)
@@ -70,7 +70,9 @@ object RDDConversions {
 }
 
 /** Logical plan node for scanning data from an RDD. */
-private[sql] case class LogicalRDD(output: Seq[Attribute], rdd: RDD[Row])(sqlContext: SQLContext)
+private[sql] case class LogicalRDD(
+    output: Seq[Attribute],
+    rdd: RDD[InternalRow])(sqlContext: SQLContext)
   extends LogicalPlan with MultiInstanceRelation {
 
   override def children: Seq[LogicalPlan] = Nil
@@ -91,13 +93,15 @@ private[sql] case class LogicalRDD(output: Seq[Attribute], rdd: RDD[Row])(sqlCon
 }
 
 /** Physical plan node for scanning data from an RDD. */
-private[sql] case class PhysicalRDD(output: Seq[Attribute], rdd: RDD[Row]) extends LeafNode {
-  protected override def doExecute(): RDD[Row] = rdd
+private[sql] case class PhysicalRDD(
+    output: Seq[Attribute],
+    rdd: RDD[InternalRow]) extends LeafNode {
+  protected override def doExecute(): RDD[InternalRow] = rdd
 }
 
 /** Logical plan node for scanning data from a local collection. */
 private[sql]
-case class LogicalLocalTable(output: Seq[Attribute], rows: Seq[Row])(sqlContext: SQLContext)
+case class LogicalLocalTable(output: Seq[Attribute], rows: Seq[InternalRow])(sqlContext: SQLContext)
    extends LogicalPlan with MultiInstanceRelation {
 
   override def children: Seq[LogicalPlan] = Nil
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
index 4b601c11924b9..42a0c1be4f694 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
@@ -19,10 +19,9 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.physical.{UnknownPartitioning, Partitioning}
+import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
 
 /**
  * Apply the all of the GroupExpressions to every input row, hence we will get
@@ -43,7 +42,7 @@ case class Expand(
   // as UNKNOWN partitioning
   override def outputPartitioning: Partitioning = UnknownPartitioning(0)
 
-  protected override def doExecute(): RDD[Row] = attachTree(this, "execute") {
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
     child.execute().mapPartitions { iter =>
       // TODO Move out projection objects creation and transfer to
       // workers via closure. However we can't assume the Projection
@@ -51,14 +50,14 @@ case class Expand(
       // create the projections within each of the partition processing.
       val groups = projections.map(ee => newProjection(ee, child.output)).toArray
 
-      new Iterator[Row] {
-        private[this] var result: Row = _
+      new Iterator[InternalRow] {
+        private[this] var result: InternalRow = _
         private[this] var idx = -1  // -1 means the initial state
-        private[this] var input: Row = _
+        private[this] var input: InternalRow = _
 
         override final def hasNext: Boolean = (-1 < idx && idx < groups.length) || iter.hasNext
 
-        override final def next(): Row = {
+        override final def next(): InternalRow = {
           if (idx <= 0) {
             // in the initial (-1) or beginning(0) of a new input row, fetch the next input tuple
             input = iter.next()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala
index dd02c1f4573bb..c1665f78a960e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala
@@ -25,12 +25,12 @@ import org.apache.spark.sql.catalyst.expressions._
  * For lazy computing, be sure the generator.terminate() called in the very last
  * TODO reusing the CompletionIterator?
  */
-private[execution] sealed case class LazyIterator(func: () => TraversableOnce[Row])
-  extends Iterator[Row] {
+private[execution] sealed case class LazyIterator(func: () => TraversableOnce[InternalRow])
+  extends Iterator[InternalRow] {
 
   lazy val results = func().toIterator
   override def hasNext: Boolean = results.hasNext
-  override def next(): Row = results.next()
+  override def next(): InternalRow = results.next()
 }
 
 /**
@@ -58,11 +58,11 @@ case class Generate(
 
   val boundGenerator = BindReferences.bindReference(generator, child.output)
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     // boundGenerator.terminate() should be triggered after all of the rows in the partition
     if (join) {
       child.execute().mapPartitions { iter =>
-        val generatorNullRow = Row.fromSeq(Seq.fill[Any](generator.elementTypes.size)(null))
+        val generatorNullRow = InternalRow.fromSeq(Seq.fill[Any](generator.elementTypes.size)(null))
         val joinedRow = new JoinedRow
 
         iter.flatMap { row =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 1c40a9209f6d5..ba2c8f53d702d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -66,7 +66,7 @@ case class GeneratedAggregate(
 
   override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute)
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val aggregatesToCompute = aggregateExpressions.flatMap { a =>
       a.collect { case agg: AggregateExpression => agg}
     }
@@ -273,7 +273,7 @@ case class GeneratedAggregate(
       if (groupingExpressions.isEmpty) {
         // TODO: Codegening anything other than the updateProjection is probably over kill.
         val buffer = newAggregationBuffer(EmptyRow).asInstanceOf[MutableRow]
-        var currentRow: Row = null
+        var currentRow: InternalRow = null
         updateProjection.target(buffer)
 
         while (iter.hasNext) {
@@ -295,19 +295,19 @@ case class GeneratedAggregate(
         )
 
         while (iter.hasNext) {
-          val currentRow: Row = iter.next()
-          val groupKey: Row = groupProjection(currentRow)
+          val currentRow: InternalRow = iter.next()
+          val groupKey: InternalRow = groupProjection(currentRow)
           val aggregationBuffer = aggregationMap.getAggregationBuffer(groupKey)
           updateProjection.target(aggregationBuffer)(joinedRow(aggregationBuffer, currentRow))
         }
 
-        new Iterator[Row] {
+        new Iterator[InternalRow] {
           private[this] val mapIterator = aggregationMap.iterator()
           private[this] val resultProjection = resultProjectionBuilder()
 
           def hasNext: Boolean = mapIterator.hasNext
 
-          def next(): Row = {
+          def next(): InternalRow = {
             val entry = mapIterator.next()
             val result = resultProjection(joinedRow(entry.key, entry.value))
             if (hasNext) {
@@ -326,9 +326,9 @@ case class GeneratedAggregate(
         if (unsafeEnabled) {
           log.info("Not using Unsafe-based aggregator because it is not supported for this schema")
         }
-        val buffers = new java.util.HashMap[Row, MutableRow]()
+        val buffers = new java.util.HashMap[InternalRow, MutableRow]()
 
-        var currentRow: Row = null
+        var currentRow: InternalRow = null
         while (iter.hasNext) {
           currentRow = iter.next()
           val currentGroup = groupProjection(currentRow)
@@ -342,13 +342,13 @@ case class GeneratedAggregate(
           updateProjection.target(currentBuffer)(joinedRow(currentBuffer, currentRow))
         }
 
-        new Iterator[Row] {
+        new Iterator[InternalRow] {
           private[this] val resultIterator = buffers.entrySet.iterator()
           private[this] val resultProjection = resultProjectionBuilder()
 
           def hasNext: Boolean = resultIterator.hasNext
 
-          def next(): Row = {
+          def next(): InternalRow = {
             val currentGroup = resultIterator.next()
             resultProjection(joinedRow(currentGroup.getKey, currentGroup.getValue))
           }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala
index 03bee80ad7f38..cd341180b6100 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala
@@ -19,18 +19,20 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
 import org.apache.spark.sql.catalyst.expressions.Attribute
 
 
 /**
  * Physical plan node for scanning data from a local collection.
  */
-private[sql] case class LocalTableScan(output: Seq[Attribute], rows: Seq[Row]) extends LeafNode {
+private[sql] case class LocalTableScan(
+    output: Seq[Attribute],
+    rows: Seq[InternalRow]) extends LeafNode {
 
   private lazy val rdd = sqlContext.sparkContext.parallelize(rows)
 
-  protected override def doExecute(): RDD[Row] = rdd
+  protected override def doExecute(): RDD[InternalRow] = rdd
 
 
   override def executeCollect(): Array[Row] = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 435ac011178de..7739a9f949c77 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -23,6 +23,7 @@ import org.apache.spark.rdd.{RDD, RDDOperationScope}
 import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, trees}
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.physical._
@@ -79,11 +80,11 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
   def requiredChildOrdering: Seq[Seq[SortOrder]] = Seq.fill(children.size)(Nil)
 
   /**
-   * Returns the result of this query as an RDD[Row] by delegating to doExecute
+   * Returns the result of this query as an RDD[InternalRow] by delegating to doExecute
    * after adding query plan information to created RDDs for visualization.
    * Concrete implementations of SparkPlan should override doExecute instead.
    */
-  final def execute(): RDD[Row] = {
+  final def execute(): RDD[InternalRow] = {
     RDDOperationScope.withScope(sparkContext, nodeName, false, true) {
       doExecute()
     }
@@ -91,9 +92,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
 
   /**
    * Overridden by concrete implementations of SparkPlan.
-   * Produces the result of the query as an RDD[Row]
+   * Produces the result of the query as an RDD[InternalRow]
    */
-  protected def doExecute(): RDD[Row]
+  protected def doExecute(): RDD[InternalRow]
 
   /**
    * Runs this query returning the result as an array.
@@ -117,7 +118,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
 
     val childRDD = execute().map(_.copy())
 
-    val buf = new ArrayBuffer[Row]
+    val buf = new ArrayBuffer[InternalRow]
     val totalParts = childRDD.partitions.length
     var partsScanned = 0
     while (buf.size < n && partsScanned < totalParts) {
@@ -140,7 +141,8 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
       val p = partsScanned until math.min(partsScanned + numPartsToTry, totalParts)
       val sc = sqlContext.sparkContext
       val res =
-        sc.runJob(childRDD, (it: Iterator[Row]) => it.take(left).toArray, p, allowLocal = false)
+        sc.runJob(childRDD, (it: Iterator[InternalRow]) => it.take(left).toArray, p,
+          allowLocal = false)
 
       res.foreach(buf ++= _.take(n - buf.size))
       partsScanned += numPartsToTry
@@ -175,7 +177,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
 
 
   protected def newPredicate(
-      expression: Expression, inputSchema: Seq[Attribute]): (Row) => Boolean = {
+      expression: Expression, inputSchema: Seq[Attribute]): (InternalRow) => Boolean = {
     if (codegenEnabled) {
       GeneratePredicate.generate(expression, inputSchema)
     } else {
@@ -183,7 +185,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
     }
   }
 
-  protected def newOrdering(order: Seq[SortOrder], inputSchema: Seq[Attribute]): Ordering[Row] = {
+  protected def newOrdering(
+      order: Seq[SortOrder],
+      inputSchema: Seq[Attribute]): Ordering[InternalRow] = {
     if (codegenEnabled) {
       GenerateOrdering.generate(order, inputSchema)
     } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 7a1331a39151a..422992d019c7b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -203,7 +203,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   }
 
   protected lazy val singleRowRdd =
-    sparkContext.parallelize(Seq(new GenericRow(Array[Any]()): Row), 1)
+    sparkContext.parallelize(Seq(new GenericRow(Array[Any]()): InternalRow), 1)
 
   object TakeOrdered extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
index c4327ce262ac5..fd6f1d7ae1255 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
@@ -20,9 +20,8 @@ package org.apache.spark.sql.execution
 import java.util
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, ClusteredDistribution, Partitioning}
+import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning}
 import org.apache.spark.util.collection.CompactBuffer
 
 /**
@@ -112,16 +111,16 @@ case class Window(
     }
   }
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     child.execute().mapPartitions { iter =>
-      new Iterator[Row] {
+      new Iterator[InternalRow] {
 
         // Although input rows are grouped based on windowSpec.partitionSpec, we need to
         // know when we have a new partition.
         // This is to manually construct an ordering that can be used to compare rows.
         // TODO: We may want to have a newOrdering that takes BoundReferences.
         // So, we can take advantave of code gen.
-        private val partitionOrdering: Ordering[Row] =
+        private val partitionOrdering: Ordering[InternalRow] =
           RowOrdering.forSchema(windowSpec.partitionSpec.map(_.dataType))
 
         // This is used to project expressions for the partition specification.
@@ -137,13 +136,13 @@ case class Window(
         // The number of buffered rows in the inputRowBuffer (the size of the current partition).
         var partitionSize: Int = 0
         // The buffer used to buffer rows in a partition.
-        var inputRowBuffer: CompactBuffer[Row] = _
+        var inputRowBuffer: CompactBuffer[InternalRow] = _
         // The partition key of the current partition.
-        var currentPartitionKey: Row = _
+        var currentPartitionKey: InternalRow = _
         // The partition key of next partition.
-        var nextPartitionKey: Row = _
+        var nextPartitionKey: InternalRow = _
         // The first row of next partition.
-        var firstRowInNextPartition: Row = _
+        var firstRowInNextPartition: InternalRow = _
         // Indicates if this partition is the last one in the iter.
         var lastPartition: Boolean = false
 
@@ -316,7 +315,7 @@ case class Window(
           !lastPartition || (rowPosition < partitionSize)
         }
 
-        override final def next(): Row = {
+        override final def next(): InternalRow = {
           if (hasNext) {
             if (rowPosition == partitionSize) {
               // All rows of this buffer have been consumed.
@@ -353,7 +352,7 @@ case class Window(
         // Fetch the next partition.
         private def fetchNextPartition(): Unit = {
           // Create a new buffer for input rows.
-          inputRowBuffer = new CompactBuffer[Row]()
+          inputRowBuffer = new CompactBuffer[InternalRow]()
           // We already have the first row for this partition
           // (recorded in firstRowInNextPartition). Add it back.
           inputRowBuffer += firstRowInNextPartition
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index fb42072f9d5a7..7aedd630e3871 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -17,16 +17,17 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.{SparkEnv, HashPartitioner, SparkConf}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.{RDD, ShuffledRDD}
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.plans.physical._
-import org.apache.spark.util.{CompletionIterator, MutablePair}
 import org.apache.spark.util.collection.ExternalSorter
+import org.apache.spark.util.{CompletionIterator, MutablePair}
+import org.apache.spark.{HashPartitioner, SparkEnv}
 
 /**
  * :: DeveloperApi ::
@@ -37,7 +38,7 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends
 
   @transient lazy val buildProjection = newMutableProjection(projectList, child.output)
 
-  protected override def doExecute(): RDD[Row] = child.execute().mapPartitions { iter =>
+  protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter =>
     val resuableProjection = buildProjection()
     iter.map(resuableProjection)
   }
@@ -52,9 +53,10 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends
 case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
 
-  @transient lazy val conditionEvaluator: (Row) => Boolean = newPredicate(condition, child.output)
+  @transient lazy val conditionEvaluator: (InternalRow) => Boolean =
+    newPredicate(condition, child.output)
 
-  protected override def doExecute(): RDD[Row] = child.execute().mapPartitions { iter =>
+  protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter =>
     iter.filter(conditionEvaluator)
   }
 
@@ -83,7 +85,7 @@ case class Sample(
   override def output: Seq[Attribute] = child.output
 
   // TODO: How to pick seed?
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     if (withReplacement) {
       child.execute().map(_.copy()).sample(withReplacement, upperBound - lowerBound, seed)
     } else {
@@ -99,7 +101,8 @@ case class Sample(
 case class Union(children: Seq[SparkPlan]) extends SparkPlan {
   // TODO: attributes output by union should be distinct for nullability purposes
   override def output: Seq[Attribute] = children.head.output
-  protected override def doExecute(): RDD[Row] = sparkContext.union(children.map(_.execute()))
+  protected override def doExecute(): RDD[InternalRow] =
+    sparkContext.union(children.map(_.execute()))
 }
 
 /**
@@ -124,19 +127,19 @@ case class Limit(limit: Int, child: SparkPlan)
 
   override def executeCollect(): Array[Row] = child.executeTake(limit)
 
-  protected override def doExecute(): RDD[Row] = {
-    val rdd: RDD[_ <: Product2[Boolean, Row]] = if (sortBasedShuffleOn) {
+  protected override def doExecute(): RDD[InternalRow] = {
+    val rdd: RDD[_ <: Product2[Boolean, InternalRow]] = if (sortBasedShuffleOn) {
       child.execute().mapPartitions { iter =>
         iter.take(limit).map(row => (false, row.copy()))
       }
     } else {
       child.execute().mapPartitions { iter =>
-        val mutablePair = new MutablePair[Boolean, Row]()
+        val mutablePair = new MutablePair[Boolean, InternalRow]()
         iter.take(limit).map(row => mutablePair.update(false, row))
       }
     }
     val part = new HashPartitioner(1)
-    val shuffled = new ShuffledRDD[Boolean, Row, Row](rdd, part)
+    val shuffled = new ShuffledRDD[Boolean, InternalRow, InternalRow](rdd, part)
     shuffled.setSerializer(new SparkSqlSerializer(child.sqlContext.sparkContext.getConf))
     shuffled.mapPartitions(_.take(limit).map(_._2))
   }
@@ -157,7 +160,8 @@ case class TakeOrdered(limit: Int, sortOrder: Seq[SortOrder], child: SparkPlan)
 
   private val ord: RowOrdering = new RowOrdering(sortOrder, child.output)
 
-  private def collectData(): Array[Row] = child.execute().map(_.copy()).takeOrdered(limit)(ord)
+  private def collectData(): Array[InternalRow] =
+    child.execute().map(_.copy()).takeOrdered(limit)(ord)
 
   override def executeCollect(): Array[Row] = {
     val converter = CatalystTypeConverters.createToScalaConverter(schema)
@@ -166,7 +170,7 @@ case class TakeOrdered(limit: Int, sortOrder: Seq[SortOrder], child: SparkPlan)
 
   // TODO: Terminal split should be implemented differently from non-terminal split.
   // TODO: Pick num splits based on |limit|.
-  protected override def doExecute(): RDD[Row] = sparkContext.makeRDD(collectData(), 1)
+  protected override def doExecute(): RDD[InternalRow] = sparkContext.makeRDD(collectData(), 1)
 
   override def outputOrdering: Seq[SortOrder] = sortOrder
 }
@@ -186,7 +190,7 @@ case class Sort(
   override def requiredChildDistribution: Seq[Distribution] =
     if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
 
-  protected override def doExecute(): RDD[Row] = attachTree(this, "sort") {
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
     child.execute().mapPartitions( { iterator =>
       val ordering = newOrdering(sortOrder, child.output)
       iterator.map(_.copy()).toArray.sorted(ordering).iterator
@@ -214,14 +218,14 @@ case class ExternalSort(
   override def requiredChildDistribution: Seq[Distribution] =
     if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
 
-  protected override def doExecute(): RDD[Row] = attachTree(this, "sort") {
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
     child.execute().mapPartitions( { iterator =>
       val ordering = newOrdering(sortOrder, child.output)
-      val sorter = new ExternalSorter[Row, Null, Row](ordering = Some(ordering))
+      val sorter = new ExternalSorter[InternalRow, Null, InternalRow](ordering = Some(ordering))
       sorter.insertAll(iterator.map(r => (r.copy, null)))
       val baseIterator = sorter.iterator.map(_._1)
       // TODO(marmbrus): The complex type signature below thwarts inference for no reason.
-      CompletionIterator[Row, Iterator[Row]](baseIterator, sorter.stop())
+      CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop())
     }, preservesPartitioning = true)
   }
 
@@ -239,7 +243,7 @@ case class Repartition(numPartitions: Int, shuffle: Boolean, child: SparkPlan)
   extends UnaryNode {
   override def output: Seq[Attribute] = child.output
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     child.execute().map(_.copy()).coalesce(numPartitions, shuffle)
   }
 }
@@ -254,7 +258,7 @@ case class Repartition(numPartitions: Int, shuffle: Boolean, child: SparkPlan)
 case class Except(left: SparkPlan, right: SparkPlan) extends BinaryNode {
   override def output: Seq[Attribute] = left.output
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     left.execute().map(_.copy()).subtract(right.execute().map(_.copy()))
   }
 }
@@ -268,7 +272,7 @@ case class Except(left: SparkPlan, right: SparkPlan) extends BinaryNode {
 case class Intersect(left: SparkPlan, right: SparkPlan) extends BinaryNode {
   override def output: Seq[Attribute] = children.head.output
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     left.execute().map(_.copy()).intersection(right.execute().map(_.copy()))
   }
 }
@@ -283,5 +287,5 @@ case class Intersect(left: SparkPlan, right: SparkPlan) extends BinaryNode {
 case class OutputFaker(output: Seq[Attribute], child: SparkPlan) extends SparkPlan {
   def children: Seq[SparkPlan] = child :: Nil
 
-  protected override def doExecute(): RDD[Row] = child.execute()
+  protected override def doExecute(): RDD[InternalRow] = child.execute()
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
index 49b361e96b2d6..653792ea2e537 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
@@ -20,13 +20,13 @@ package org.apache.spark.sql.execution
 import org.apache.spark.Logging
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Row}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext}
+import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext}
 
 /**
  * A logical command that is executed for its side-effects.  `RunnableCommand`s are
@@ -64,9 +64,9 @@ private[sql] case class ExecutedCommand(cmd: RunnableCommand) extends SparkPlan
 
   override def executeTake(limit: Int): Array[Row] = sideEffectResult.take(limit).toArray
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val converted = sideEffectResult.map(r =>
-      CatalystTypeConverters.convertToCatalyst(r, schema).asInstanceOf[Row])
+      CatalystTypeConverters.convertToCatalyst(r, schema).asInstanceOf[InternalRow])
     sqlContext.sparkContext.parallelize(converted, 1)
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index 83c1f65d5c96f..3ee4033baee2e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -25,7 +26,7 @@ import scala.collection.mutable.HashSet
 
 import org.apache.spark.{AccumulatorParam, Accumulator}
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.sql.{SQLConf, SQLContext, DataFrame, Row}
+import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.trees.TreeNodeRef
 import org.apache.spark.sql.types._
 
@@ -126,11 +127,11 @@ package object debug {
       }
     }
 
-    protected override def doExecute(): RDD[Row] = {
+    protected override def doExecute(): RDD[InternalRow] = {
       child.execute().mapPartitions { iter =>
-        new Iterator[Row] {
+        new Iterator[InternalRow] {
           def hasNext: Boolean = iter.hasNext
-          def next(): Row = {
+          def next(): InternalRow = {
             val currentRow = iter.next()
             tupleCount += 1
             var i = 0
@@ -155,7 +156,7 @@ package object debug {
     def typeCheck(data: Any, schema: DataType): Unit = (data, schema) match {
       case (null, _) =>
 
-      case (row: Row, StructType(fields)) =>
+      case (row: InternalRow, StructType(fields)) =>
         row.toSeq.zip(fields.map(_.dataType)).foreach { case(d, t) => typeCheck(d, t) }
       case (s: Seq[_], ArrayType(elemType, _)) =>
         s.foreach(typeCheck(_, elemType))
@@ -196,7 +197,7 @@ package object debug {
 
     def children: List[SparkPlan] = child :: Nil
 
-    protected override def doExecute(): RDD[Row] = {
+    protected override def doExecute(): RDD[InternalRow] = {
       child.execute().map { row =>
         try typeCheck(row, child.schema) catch {
           case e: Exception =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index e228a60c9029f..68914cf85cb50 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.expressions
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.expressions.{Row, LeafExpression}
+import org.apache.spark.sql.catalyst.expressions.{InternalRow, LeafExpression}
 import org.apache.spark.sql.types.{LongType, DataType}
 
 /**
@@ -43,7 +43,7 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
 
   override def dataType: DataType = LongType
 
-  override def eval(input: Row): Long = {
+  override def eval(input: InternalRow): Long = {
     val currentCount = count
     count += 1
     (TaskContext.get().partitionId().toLong << 33) + currentCount
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index 1272793f88cd0..12c2eed0d6b7c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.expressions
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.expressions.{LeafExpression, Row}
+import org.apache.spark.sql.catalyst.expressions.{LeafExpression, InternalRow}
 import org.apache.spark.sql.types.{IntegerType, DataType}
 
 
@@ -31,5 +31,5 @@ private[sql] case object SparkPartitionID extends LeafExpression {
 
   override def dataType: DataType = IntegerType
 
-  override def eval(input: Row): Int = TaskContext.get().partitionId()
+  override def eval(input: InternalRow): Int = TaskContext.get().partitionId()
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
index b8b12be8756f9..2d2e1b92b86be 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
@@ -17,16 +17,15 @@
 
 package org.apache.spark.sql.execution.joins
 
-import org.apache.spark.rdd.RDD
-import org.apache.spark.util.ThreadUtils
-
 import scala.concurrent._
 import scala.concurrent.duration._
 
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.sql.catalyst.expressions.{Row, Expression}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.expressions.{Expression, InternalRow}
 import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution}
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
+import org.apache.spark.util.ThreadUtils
 
 /**
  * :: DeveloperApi ::
@@ -61,12 +60,12 @@ case class BroadcastHashJoin(
   @transient
   private val broadcastFuture = future {
     // Note that we use .execute().collect() because we don't want to convert data to Scala types
-    val input: Array[Row] = buildPlan.execute().map(_.copy()).collect()
+    val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
     val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length)
     sparkContext.broadcast(hashed)
   }(BroadcastHashJoin.broadcastHashJoinExecutionContext)
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val broadcastRelation = Await.result(broadcastFuture, timeout)
 
     streamedPlan.execute().mapPartitions { streamedIter =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
index a32e5fc4f7ea4..044964f3a355b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 
 /**
@@ -38,10 +38,10 @@ case class BroadcastLeftSemiJoinHash(
 
   override def output: Seq[Attribute] = left.output
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator
-    val hashSet = new java.util.HashSet[Row]()
-    var currentRow: Row = null
+    val hashSet = new java.util.HashSet[InternalRow]()
+    var currentRow: InternalRow = null
 
     // Create a Hash set of buildKeys
     while (buildIter.hasNext) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
index caad3dfbe1c5e..0b2cf8e12a6c4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
@@ -61,13 +61,14 @@ case class BroadcastNestedLoopJoin(
   @transient private lazy val boundCondition =
     newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output)
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val broadcastedRelation =
-      sparkContext.broadcast(broadcast.execute().map(_.copy()).collect().toIndexedSeq)
+      sparkContext.broadcast(broadcast.execute().map(_.copy())
+        .collect().toIndexedSeq)
 
     /** All rows that either match both-way, or rows from streamed joined with nulls. */
     val matchesOrStreamedRowsWithNulls = streamed.execute().mapPartitions { streamedIter =>
-      val matchedRows = new CompactBuffer[Row]
+      val matchedRows = new CompactBuffer[InternalRow]
       // TODO: Use Spark's BitSet.
       val includedBroadcastTuples =
         new scala.collection.mutable.BitSet(broadcastedRelation.value.size)
@@ -118,8 +119,8 @@ case class BroadcastNestedLoopJoin(
     val leftNulls = new GenericMutableRow(left.output.size)
     val rightNulls = new GenericMutableRow(right.output.size)
     /** Rows from broadcasted joined with nulls. */
-    val broadcastRowsWithNulls: Seq[Row] = {
-      val buf: CompactBuffer[Row] = new CompactBuffer()
+    val broadcastRowsWithNulls: Seq[InternalRow] = {
+      val buf: CompactBuffer[InternalRow] = new CompactBuffer()
       var i = 0
       val rel = broadcastedRelation.value
       while (i < rel.length) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala
index 191c00cb55da2..261b4724159fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 
@@ -30,7 +30,7 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode {
   override def output: Seq[Attribute] = left.output ++ right.output
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val leftResults = left.execute().map(_.copy())
     val rightResults = right.execute().map(_.copy())
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
index 851de1685509a..3a4196a90d14a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
@@ -49,11 +49,13 @@ trait HashJoin {
   @transient protected lazy val streamSideKeyGenerator: () => MutableProjection =
     newMutableProjection(streamedKeys, streamedPlan.output)
 
-  protected def hashJoin(streamIter: Iterator[Row], hashedRelation: HashedRelation): Iterator[Row] =
+  protected def hashJoin(
+      streamIter: Iterator[InternalRow],
+      hashedRelation: HashedRelation): Iterator[InternalRow] =
   {
-    new Iterator[Row] {
-      private[this] var currentStreamedRow: Row = _
-      private[this] var currentHashMatches: CompactBuffer[Row] = _
+    new Iterator[InternalRow] {
+      private[this] var currentStreamedRow: InternalRow = _
+      private[this] var currentHashMatches: CompactBuffer[InternalRow] = _
       private[this] var currentMatchPosition: Int = -1
 
       // Mutable per row objects.
@@ -65,7 +67,7 @@ trait HashJoin {
         (currentMatchPosition != -1 && currentMatchPosition < currentHashMatches.size) ||
           (streamIter.hasNext && fetchNext())
 
-      override final def next(): Row = {
+      override final def next(): InternalRow = {
         val ret = buildSide match {
           case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition))
           case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index c21a453115292..19aef9978e732 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -68,26 +68,29 @@ case class HashOuterJoin(
     }
   }
 
-  @transient private[this] lazy val DUMMY_LIST = Seq[Row](null)
-  @transient private[this] lazy val EMPTY_LIST = Seq.empty[Row]
+  @transient private[this] lazy val DUMMY_LIST = Seq[InternalRow](null)
+  @transient private[this] lazy val EMPTY_LIST = Seq.empty[InternalRow]
 
   @transient private[this] lazy val leftNullRow = new GenericRow(left.output.length)
   @transient private[this] lazy val rightNullRow = new GenericRow(right.output.length)
   @transient private[this] lazy val boundCondition =
-    condition.map(newPredicate(_, left.output ++ right.output)).getOrElse((row: Row) => true)
+    condition.map(
+      newPredicate(_, left.output ++ right.output)).getOrElse((row: InternalRow) => true)
 
   // TODO we need to rewrite all of the iterators with our own implementation instead of the Scala
   // iterator for performance purpose.
 
   private[this] def leftOuterIterator(
-      key: Row, joinedRow: JoinedRow, rightIter: Iterable[Row]): Iterator[Row] = {
-    val ret: Iterable[Row] = {
+      key: InternalRow,
+      joinedRow: JoinedRow,
+      rightIter: Iterable[InternalRow]): Iterator[InternalRow] = {
+    val ret: Iterable[InternalRow] = {
       if (!key.anyNull) {
         val temp = rightIter.collect {
           case r if boundCondition(joinedRow.withRight(r)) => joinedRow.copy()
         }
         if (temp.size == 0) {
-          joinedRow.withRight(rightNullRow).copy :: Nil
+          joinedRow.withRight(rightNullRow).copy.asInstanceOf[InternalRow] :: Nil
         } else {
           temp
         }
@@ -99,12 +102,15 @@ case class HashOuterJoin(
   }
 
   private[this] def rightOuterIterator(
-      key: Row, leftIter: Iterable[Row], joinedRow: JoinedRow): Iterator[Row] = {
+      key: InternalRow,
+      leftIter: Iterable[InternalRow],
+      joinedRow: JoinedRow): Iterator[InternalRow] = {
 
-    val ret: Iterable[Row] = {
+    val ret: Iterable[InternalRow] = {
       if (!key.anyNull) {
         val temp = leftIter.collect {
-          case l if boundCondition(joinedRow.withLeft(l)) => joinedRow.copy
+          case l if boundCondition(joinedRow.withLeft(l)) =>
+            joinedRow.copy
         }
         if (temp.size == 0) {
           joinedRow.withLeft(leftNullRow).copy :: Nil
@@ -119,14 +125,14 @@ case class HashOuterJoin(
   }
 
   private[this] def fullOuterIterator(
-      key: Row, leftIter: Iterable[Row], rightIter: Iterable[Row],
-      joinedRow: JoinedRow): Iterator[Row] = {
+      key: InternalRow, leftIter: Iterable[InternalRow], rightIter: Iterable[InternalRow],
+      joinedRow: JoinedRow): Iterator[InternalRow] = {
 
     if (!key.anyNull) {
       // Store the positions of records in right, if one of its associated row satisfy
       // the join condition.
       val rightMatchedSet = scala.collection.mutable.Set[Int]()
-      leftIter.iterator.flatMap[Row] { l =>
+      leftIter.iterator.flatMap[InternalRow] { l =>
         joinedRow.withLeft(l)
         var matched = false
         rightIter.zipWithIndex.collect {
@@ -157,24 +163,25 @@ case class HashOuterJoin(
           joinedRow(leftNullRow, r).copy()
       }
     } else {
-      leftIter.iterator.map[Row] { l =>
+      leftIter.iterator.map[InternalRow] { l =>
         joinedRow(l, rightNullRow).copy()
-      } ++ rightIter.iterator.map[Row] { r =>
+      } ++ rightIter.iterator.map[InternalRow] { r =>
         joinedRow(leftNullRow, r).copy()
       }
     }
   }
 
   private[this] def buildHashTable(
-      iter: Iterator[Row], keyGenerator: Projection): JavaHashMap[Row, CompactBuffer[Row]] = {
-    val hashTable = new JavaHashMap[Row, CompactBuffer[Row]]()
+      iter: Iterator[InternalRow],
+      keyGenerator: Projection): JavaHashMap[InternalRow, CompactBuffer[InternalRow]] = {
+    val hashTable = new JavaHashMap[InternalRow, CompactBuffer[InternalRow]]()
     while (iter.hasNext) {
       val currentRow = iter.next()
       val rowKey = keyGenerator(currentRow)
 
       var existingMatchList = hashTable.get(rowKey)
       if (existingMatchList == null) {
-        existingMatchList = new CompactBuffer[Row]()
+        existingMatchList = new CompactBuffer[InternalRow]()
         hashTable.put(rowKey, existingMatchList)
       }
 
@@ -184,7 +191,7 @@ case class HashOuterJoin(
     hashTable
   }
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val joinedRow = new JoinedRow()
     left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) =>
       // TODO this probably can be replaced by external sort (sort merged join?)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index ab84c123e0c0b..e18c817975134 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.joins
 import java.io.{ObjectInput, ObjectOutput, Externalizable}
 import java.util.{HashMap => JavaHashMap}
 
-import org.apache.spark.sql.catalyst.expressions.{Projection, Row}
+import org.apache.spark.sql.catalyst.expressions.{Projection, InternalRow}
 import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.util.collection.CompactBuffer
 
@@ -30,7 +30,7 @@ import org.apache.spark.util.collection.CompactBuffer
  * object.
  */
 private[joins] sealed trait HashedRelation {
-  def get(key: Row): CompactBuffer[Row]
+  def get(key: InternalRow): CompactBuffer[InternalRow]
 
   // This is a helper method to implement Externalizable, and is used by
   // GeneralHashedRelation and UniqueKeyHashedRelation
@@ -54,12 +54,12 @@ private[joins] sealed trait HashedRelation {
  * A general [[HashedRelation]] backed by a hash map that maps the key into a sequence of values.
  */
 private[joins] final class GeneralHashedRelation(
-    private var hashTable: JavaHashMap[Row, CompactBuffer[Row]])
+    private var hashTable: JavaHashMap[InternalRow, CompactBuffer[InternalRow]])
   extends HashedRelation with Externalizable {
 
   def this() = this(null) // Needed for serialization
 
-  override def get(key: Row): CompactBuffer[Row] = hashTable.get(key)
+  override def get(key: InternalRow): CompactBuffer[InternalRow] = hashTable.get(key)
 
   override def writeExternal(out: ObjectOutput): Unit = {
     writeBytes(out, SparkSqlSerializer.serialize(hashTable))
@@ -75,17 +75,18 @@ private[joins] final class GeneralHashedRelation(
  * A specialized [[HashedRelation]] that maps key into a single value. This implementation
  * assumes the key is unique.
  */
-private[joins] final class UniqueKeyHashedRelation(private var hashTable: JavaHashMap[Row, Row])
+private[joins]
+final class UniqueKeyHashedRelation(private var hashTable: JavaHashMap[InternalRow, InternalRow])
   extends HashedRelation with Externalizable {
 
   def this() = this(null) // Needed for serialization
 
-  override def get(key: Row): CompactBuffer[Row] = {
+  override def get(key: InternalRow): CompactBuffer[InternalRow] = {
     val v = hashTable.get(key)
     if (v eq null) null else CompactBuffer(v)
   }
 
-  def getValue(key: Row): Row = hashTable.get(key)
+  def getValue(key: InternalRow): InternalRow = hashTable.get(key)
 
   override def writeExternal(out: ObjectOutput): Unit = {
     writeBytes(out, SparkSqlSerializer.serialize(hashTable))
@@ -103,13 +104,13 @@ private[joins] final class UniqueKeyHashedRelation(private var hashTable: JavaHa
 private[joins] object HashedRelation {
 
   def apply(
-      input: Iterator[Row],
+      input: Iterator[InternalRow],
       keyGenerator: Projection,
       sizeEstimate: Int = 64): HashedRelation = {
 
     // TODO: Use Spark's HashMap implementation.
-    val hashTable = new JavaHashMap[Row, CompactBuffer[Row]](sizeEstimate)
-    var currentRow: Row = null
+    val hashTable = new JavaHashMap[InternalRow, CompactBuffer[InternalRow]](sizeEstimate)
+    var currentRow: InternalRow = null
 
     // Whether the join key is unique. If the key is unique, we can convert the underlying
     // hash map into one specialized for this.
@@ -122,7 +123,7 @@ private[joins] object HashedRelation {
       if (!rowKey.anyNull) {
         val existingMatchList = hashTable.get(rowKey)
         val matchList = if (existingMatchList == null) {
-          val newMatchList = new CompactBuffer[Row]()
+          val newMatchList = new CompactBuffer[InternalRow]()
           hashTable.put(rowKey, newMatchList)
           newMatchList
         } else {
@@ -134,7 +135,7 @@ private[joins] object HashedRelation {
     }
 
     if (keyIsUnique) {
-      val uniqHashTable = new JavaHashMap[Row, Row](hashTable.size)
+      val uniqHashTable = new JavaHashMap[InternalRow, InternalRow](hashTable.size)
       val iter = hashTable.entrySet().iterator()
       while (iter.hasNext) {
         val entry = iter.next()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
index 036423e6faea4..2a6d4d1ab08bb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
@@ -47,7 +47,7 @@ case class LeftSemiJoinBNL(
   @transient private lazy val boundCondition =
     newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output)
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val broadcastedRelation =
       sparkContext.broadcast(broadcast.execute().map(_.copy()).collect().toIndexedSeq)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
index 8ad27eae80ffb..20d74270afb48 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, InternalRow}
 import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 
@@ -42,10 +42,10 @@ case class LeftSemiJoinHash(
 
   override def output: Seq[Attribute] = left.output
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) =>
-      val hashSet = new java.util.HashSet[Row]()
-      var currentRow: Row = null
+      val hashSet = new java.util.HashSet[InternalRow]()
+      var currentRow: InternalRow = null
 
       // Create a Hash set of buildKeys
       while (buildIter.hasNext) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
index 219525d9d85f3..5439e10a60b2a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning}
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
@@ -43,7 +43,7 @@ case class ShuffledHashJoin(
   override def requiredChildDistribution: Seq[ClusteredDistribution] =
     ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) =>
       val hashed = HashedRelation(buildIter, buildSideKeyGenerator)
       hashJoin(streamIter, hashed)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
index 1a39fb4b96608..2abe65a71813d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
@@ -21,9 +21,7 @@ import java.util.NoSuchElementException
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 import org.apache.spark.util.collection.CompactBuffer
@@ -60,29 +58,29 @@ case class SortMergeJoin(
   private def requiredOrders(keys: Seq[Expression]): Seq[SortOrder] =
     keys.map(SortOrder(_, Ascending))
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val leftResults = left.execute().map(_.copy())
     val rightResults = right.execute().map(_.copy())
 
     leftResults.zipPartitions(rightResults) { (leftIter, rightIter) =>
-      new Iterator[Row] {
+      new Iterator[InternalRow] {
         // Mutable per row objects.
         private[this] val joinRow = new JoinedRow5
-        private[this] var leftElement: Row = _
-        private[this] var rightElement: Row = _
-        private[this] var leftKey: Row = _
-        private[this] var rightKey: Row = _
-        private[this] var rightMatches: CompactBuffer[Row] = _
+        private[this] var leftElement: InternalRow = _
+        private[this] var rightElement: InternalRow = _
+        private[this] var leftKey: InternalRow = _
+        private[this] var rightKey: InternalRow = _
+        private[this] var rightMatches: CompactBuffer[InternalRow] = _
         private[this] var rightPosition: Int = -1
         private[this] var stop: Boolean = false
-        private[this] var matchKey: Row = _
+        private[this] var matchKey: InternalRow = _
 
         // initialize iterator
         initialize()
 
         override final def hasNext: Boolean = nextMatchingPair()
 
-        override final def next(): Row = {
+        override final def next(): InternalRow = {
           if (hasNext) {
             // we are using the buffered right rows and run down left iterator
             val joinedRow = joinRow(leftElement, rightMatches(rightPosition))
@@ -145,7 +143,7 @@ case class SortMergeJoin(
                 fetchLeft()
               }
             }
-            rightMatches = new CompactBuffer[Row]()
+            rightMatches = new CompactBuffer[InternalRow]()
             if (stop) {
               stop = false
               // iterate the right side to buffer all rows that matches
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
index 2b45a83d145f5..1ce150ceaf5f9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala
@@ -29,7 +29,8 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.api.python.{PythonBroadcast, PythonRDD}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{Row, _}
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
@@ -56,7 +57,7 @@ private[spark] case class PythonUDF(
 
   def nullable: Boolean = true
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     throw new UnsupportedOperationException("PythonUDFs can not be directly evaluated.")
   }
 }
@@ -241,7 +242,7 @@ case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child:
 
   def children: Seq[SparkPlan] = child :: Nil
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     val childResults = child.execute().map(_.copy())
 
     val parent = childResults.mapPartitions { iter =>
@@ -276,7 +277,7 @@ case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child:
       val row = new GenericMutableRow(1)
       iter.map { result =>
         row(0) = EvaluatePython.fromJava(result, udf.dataType)
-        row: Row
+        row: InternalRow
       }
     }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
index c41c21c0eeb50..8df1da037c434 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
@@ -20,9 +20,10 @@ package org.apache.spark.sql.execution.stat
 import scala.collection.mutable.{Map => MutableMap}
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.{Column, DataFrame, Row}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
 import org.apache.spark.sql.types.{ArrayType, StructField, StructType}
+import org.apache.spark.sql.{Column, DataFrame}
 
 private[sql] object FrequentItems extends Logging {
 
@@ -110,7 +111,7 @@ private[sql] object FrequentItems extends Logging {
       }
     )
     val justItems = freqItems.map(m => m.baseMap.keys.toSeq)
-    val resultRow = Row(justItems : _*)
+    val resultRow = InternalRow(justItems : _*)
     // append frequent Items to the column name for easy debugging
     val outputCols = colInfo.map { v =>
       StructField(v._1 + "_freqItems", ArrayType(v._2, false))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index e75e6681c5ff3..667fc70cff956 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -24,7 +24,7 @@ import org.apache.commons.lang3.StringUtils
 
 import org.apache.spark.{Logging, Partition, SparkContext, TaskContext}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{Row, SpecificMutableRow}
+import org.apache.spark.sql.catalyst.expressions.{InternalRow, SpecificMutableRow}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.sources._
@@ -211,7 +211,7 @@ private[sql] object JDBCRDD extends Logging {
       fqTable: String,
       requiredColumns: Array[String],
       filters: Array[Filter],
-      parts: Array[Partition]): RDD[Row] = {
+      parts: Array[Partition]): RDD[InternalRow] = {
     val dialect = JdbcDialects.get(url)
     val quotedColumns = requiredColumns.map(colName => dialect.quoteIdentifier(colName))
     new JDBCRDD(
@@ -240,7 +240,7 @@ private[sql] class JDBCRDD(
     filters: Array[Filter],
     partitions: Array[Partition],
     properties: Properties)
-  extends RDD[Row](sc, Nil) {
+  extends RDD[InternalRow](sc, Nil) {
 
   /**
    * Retrieve the list of partitions corresponding to this RDD.
@@ -348,12 +348,12 @@ private[sql] class JDBCRDD(
   /**
    * Runs the SQL query against the JDBC driver.
    */
-  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = new Iterator[Row]
-  {
+  override def compute(thePart: Partition, context: TaskContext): Iterator[InternalRow] =
+    new Iterator[InternalRow] {
     var closed = false
     var finished = false
     var gotNext = false
-    var nextValue: Row = null
+    var nextValue: InternalRow = null
 
     context.addTaskCompletionListener{ context => close() }
     val part = thePart.asInstanceOf[JDBCPartition]
@@ -375,7 +375,7 @@ private[sql] class JDBCRDD(
     val conversions = getConversions(schema)
     val mutableRow = new SpecificMutableRow(schema.fields.map(x => x.dataType))
 
-    def getNext(): Row = {
+    def getNext(): InternalRow = {
       if (rs.next()) {
         var i = 0
         while (i < conversions.length) {
@@ -443,7 +443,7 @@ private[sql] class JDBCRDD(
         mutableRow
       } else {
         finished = true
-        null.asInstanceOf[Row]
+        null.asInstanceOf[InternalRow]
       }
     }
 
@@ -486,7 +486,7 @@ private[sql] class JDBCRDD(
       !finished
     }
 
-    override def next(): Row = {
+    override def next(): InternalRow = {
       if (!hasNext) {
         throw new NoSuchElementException("End of stream")
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
index 30f9190d45bf8..4d3aac464c538 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
@@ -23,10 +23,9 @@ import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.Partition
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext}
-import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
 
 /**
  * Instructions on how to partition the table among workers.
@@ -138,7 +137,7 @@ private[sql] case class JDBCRelation(
       table,
       requiredColumns,
       filters,
-      parts)
+      parts).map(_.asInstanceOf[Row])
   }
 
   override def insert(data: DataFrame, overwrite: Boolean): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
index c772cd1f53e53..69bf13e1e5a6a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
@@ -22,10 +22,10 @@ import java.io.IOException
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{Expression, Attribute, Row}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
 import org.apache.spark.sql.sources._
-import org.apache.spark.sql.types.{StructField, StructType}
-import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
 
 
 private[sql] class DefaultSource
@@ -154,12 +154,12 @@ private[sql] class JSONRelation(
       JacksonParser(
         baseRDD(),
         schema,
-        sqlContext.conf.columnNameOfCorruptRecord)
+        sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
     } else {
       JsonRDD.jsonStringToRow(
         baseRDD(),
         schema,
-        sqlContext.conf.columnNameOfCorruptRecord)
+        sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
     }
   }
 
@@ -168,12 +168,12 @@ private[sql] class JSONRelation(
       JacksonParser(
         baseRDD(),
         StructType.fromAttributes(requiredColumns),
-        sqlContext.conf.columnNameOfCorruptRecord)
+        sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
     } else {
       JsonRDD.jsonStringToRow(
         baseRDD(),
         StructType.fromAttributes(requiredColumns),
-        sqlContext.conf.columnNameOfCorruptRecord)
+        sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala
index 325f54b6808a8..1e6b1198d245b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala
@@ -21,7 +21,7 @@ import scala.collection.Map
 
 import com.fasterxml.jackson.core._
 
-import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.types._
 
 private[sql] object JacksonGenerator {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
index f16075ce58ffa..817e8a20b34de 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
@@ -35,7 +35,7 @@ private[sql] object JacksonParser {
   def apply(
       json: RDD[String],
       schema: StructType,
-      columnNameOfCorruptRecords: String): RDD[Row] = {
+      columnNameOfCorruptRecords: String): RDD[InternalRow] = {
     parseJson(json, schema, columnNameOfCorruptRecords)
   }
 
@@ -130,7 +130,10 @@ private[sql] object JacksonParser {
    *
    * Fields in the json that are not defined in the requested schema will be dropped.
    */
-  private def convertObject(factory: JsonFactory, parser: JsonParser, schema: StructType): Row = {
+  private def convertObject(
+      factory: JsonFactory,
+      parser: JsonParser,
+      schema: StructType): InternalRow = {
     val row = new GenericMutableRow(schema.length)
     while (nextUntil(parser, JsonToken.END_OBJECT)) {
       schema.getFieldIndex(parser.getCurrentName) match {
@@ -176,9 +179,9 @@ private[sql] object JacksonParser {
   private def parseJson(
       json: RDD[String],
       schema: StructType,
-      columnNameOfCorruptRecords: String): RDD[Row] = {
+      columnNameOfCorruptRecords: String): RDD[InternalRow] = {
 
-    def failedRecord(record: String): Seq[Row] = {
+    def failedRecord(record: String): Seq[InternalRow] = {
       // create a row even if no corrupt record column is present
       val row = new GenericMutableRow(schema.length)
       for (corruptIndex <- schema.getFieldIndex(columnNameOfCorruptRecords)) {
@@ -202,7 +205,7 @@ private[sql] object JacksonParser {
           // convertField wrap an object into a single value array when necessary.
           convertField(factory, parser, ArrayType(schema)) match {
             case null => failedRecord(record)
-            case list: Seq[Row @unchecked] => list
+            case list: Seq[InternalRow @unchecked] => list
             case _ =>
               sys.error(
                 s"Failed to parse record $record. Please make sure that each line of the file " +
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index e4acf1ddaf173..44594c5080ff4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -38,7 +38,7 @@ private[sql] object JsonRDD extends Logging {
   private[sql] def jsonStringToRow(
       json: RDD[String],
       schema: StructType,
-      columnNameOfCorruptRecords: String): RDD[Row] = {
+      columnNameOfCorruptRecords: String): RDD[InternalRow] = {
     parseJson(json, columnNameOfCorruptRecords).map(parsed => asRow(parsed, schema))
   }
 
@@ -434,7 +434,7 @@ private[sql] object JsonRDD extends Logging {
     }
   }
 
-  private def asRow(json: Map[String, Any], schema: StructType): Row = {
+  private def asRow(json: Map[String, Any], schema: StructType): InternalRow = {
     // TODO: Reuse the row instead of creating a new one for every record.
     val row = new GenericMutableRow(schema.fields.length)
     schema.fields.zipWithIndex.foreach {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index ab9f878d1e936..4da5e96b82e3d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -79,7 +79,7 @@ private[sql] object CatalystConverter {
 
   // TODO: consider using Array[T] for arrays to avoid boxing of primitive types
   type ArrayScalaType[T] = Seq[T]
-  type StructScalaType[T] = Row
+  type StructScalaType[T] = InternalRow
   type MapScalaType[K, V] = Map[K, V]
 
   protected[parquet] def createConverter(
@@ -240,7 +240,7 @@ private[parquet] abstract class CatalystConverter extends GroupConverter {
    *
    * @return
    */
-  def getCurrentRecord: Row = throw new UnsupportedOperationException
+  def getCurrentRecord: InternalRow = throw new UnsupportedOperationException
 
   /**
    * Read a decimal value from a Parquet Binary into "dest". Only supports decimals that fit in
@@ -275,7 +275,7 @@ private[parquet] abstract class CatalystConverter extends GroupConverter {
 
 /**
  * A `parquet.io.api.GroupConverter` that is able to convert a Parquet record
- * to a [[org.apache.spark.sql.catalyst.expressions.Row]] object.
+ * to a [[org.apache.spark.sql.catalyst.expressions.InternalRow]] object.
  *
  * @param schema The corresponding Catalyst schema in the form of a list of attributes.
  */
@@ -284,7 +284,7 @@ private[parquet] class CatalystGroupConverter(
     protected[parquet] val index: Int,
     protected[parquet] val parent: CatalystConverter,
     protected[parquet] var current: ArrayBuffer[Any],
-    protected[parquet] var buffer: ArrayBuffer[Row])
+    protected[parquet] var buffer: ArrayBuffer[InternalRow])
   extends CatalystConverter {
 
   def this(schema: Array[FieldType], index: Int, parent: CatalystConverter) =
@@ -293,7 +293,7 @@ private[parquet] class CatalystGroupConverter(
       index,
       parent,
       current = null,
-      buffer = new ArrayBuffer[Row](
+      buffer = new ArrayBuffer[InternalRow](
         CatalystArrayConverter.INITIAL_ARRAY_SIZE))
 
   /**
@@ -309,7 +309,7 @@ private[parquet] class CatalystGroupConverter(
 
   override val size = schema.size
 
-  override def getCurrentRecord: Row = {
+  override def getCurrentRecord: InternalRow = {
     assert(isRootConverter, "getCurrentRecord should only be called in root group converter!")
     // TODO: use iterators if possible
     // Note: this will ever only be called in the root converter when the record has been
@@ -347,7 +347,7 @@ private[parquet] class CatalystGroupConverter(
 
 /**
  * A `parquet.io.api.GroupConverter` that is able to convert a Parquet record
- * to a [[org.apache.spark.sql.catalyst.expressions.Row]] object. Note that his
+ * to a [[org.apache.spark.sql.catalyst.expressions.InternalRow]] object. Note that his
  * converter is optimized for rows of primitive types (non-nested records).
  */
 private[parquet] class CatalystPrimitiveRowConverter(
@@ -373,7 +373,7 @@ private[parquet] class CatalystPrimitiveRowConverter(
   override val parent = null
 
   // Should be only called in root group converter!
-  override def getCurrentRecord: Row = current
+  override def getCurrentRecord: InternalRow = current
 
   override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 272608d4e2a09..39360e13313a3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -46,7 +46,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SQLConf
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row, _}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, InternalRow, _}
 import org.apache.spark.sql.execution.{LeafNode, SparkPlan, UnaryNode}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.{Logging, SerializableWritable, TaskContext}
@@ -54,7 +54,7 @@ import org.apache.spark.{Logging, SerializableWritable, TaskContext}
 /**
  * :: DeveloperApi ::
  * Parquet table scan operator. Imports the file that backs the given
- * [[org.apache.spark.sql.parquet.ParquetRelation]] as a ``RDD[Row]``.
+ * [[org.apache.spark.sql.parquet.ParquetRelation]] as a ``RDD[InternalRow]``.
  */
 private[sql] case class ParquetTableScan(
     attributes: Seq[Attribute],
@@ -77,7 +77,7 @@ private[sql] case class ParquetTableScan(
     }
   }.toArray
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     import org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat
 
     val sc = sqlContext.sparkContext
@@ -125,7 +125,7 @@ private[sql] case class ParquetTableScan(
         sc,
         classOf[FilteringParquetRowInputFormat],
         classOf[Void],
-        classOf[Row],
+        classOf[InternalRow],
         conf)
 
     if (requestedPartitionOrdinals.nonEmpty) {
@@ -154,9 +154,9 @@ private[sql] case class ParquetTableScan(
             .map(a => Cast(Literal(partValues(a.name)), a.dataType).eval(EmptyRow))
 
         if (primitiveRow) {
-          new Iterator[Row] {
+          new Iterator[InternalRow] {
             def hasNext: Boolean = iter.hasNext
-            def next(): Row = {
+            def next(): InternalRow = {
               // We are using CatalystPrimitiveRowConverter and it returns a SpecificMutableRow.
               val row = iter.next()._2.asInstanceOf[SpecificMutableRow]
 
@@ -173,12 +173,12 @@ private[sql] case class ParquetTableScan(
         } else {
           // Create a mutable row since we need to fill in values from partition columns.
           val mutableRow = new GenericMutableRow(outputSize)
-          new Iterator[Row] {
+          new Iterator[InternalRow] {
             def hasNext: Boolean = iter.hasNext
-            def next(): Row = {
+            def next(): InternalRow = {
               // We are using CatalystGroupConverter and it returns a GenericRow.
               // Since GenericRow is not mutable, we just cast it to a Row.
-              val row = iter.next()._2.asInstanceOf[Row]
+              val row = iter.next()._2.asInstanceOf[InternalRow]
 
               var i = 0
               while (i < row.size) {
@@ -258,7 +258,7 @@ private[sql] case class InsertIntoParquetTable(
   /**
    * Inserts all rows into the Parquet file.
    */
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     // TODO: currently we do not check whether the "schema"s are compatible
     // That means if one first creates a table and then INSERTs data with
     // and incompatible schema the execution will fail. It would be nice
@@ -321,13 +321,13 @@ private[sql] case class InsertIntoParquetTable(
    * @param conf A [[org.apache.hadoop.conf.Configuration]].
    */
   private def saveAsHadoopFile(
-      rdd: RDD[Row],
+      rdd: RDD[InternalRow],
       path: String,
       conf: Configuration) {
     val job = new Job(conf)
     val keyType = classOf[Void]
     job.setOutputKeyClass(keyType)
-    job.setOutputValueClass(classOf[Row])
+    job.setOutputValueClass(classOf[InternalRow])
     NewFileOutputFormat.setOutputPath(job, new Path(path))
     val wrappedConf = new SerializableWritable(job.getConfiguration)
     val formatter = new SimpleDateFormat("yyyyMMddHHmm")
@@ -342,7 +342,7 @@ private[sql] case class InsertIntoParquetTable(
           .findMaxTaskId(NewFileOutputFormat.getOutputPath(job).toString, job.getConfiguration) + 1
       }
 
-    def writeShard(context: TaskContext, iter: Iterator[Row]): Int = {
+    def writeShard(context: TaskContext, iter: Iterator[InternalRow]): Int = {
       /* "reduce task" <split #> <attempt # = spark task #> */
       val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId,
         context.attemptNumber)
@@ -381,7 +381,7 @@ private[sql] case class InsertIntoParquetTable(
  * to imported ones.
  */
 private[parquet] class AppendingParquetOutputFormat(offset: Int)
-  extends org.apache.parquet.hadoop.ParquetOutputFormat[Row] {
+  extends org.apache.parquet.hadoop.ParquetOutputFormat[InternalRow] {
   // override to accept existing directories as valid output directory
   override def checkOutputSpecs(job: JobContext): Unit = {}
   var committer: OutputCommitter = null
@@ -434,25 +434,25 @@ private[parquet] class AppendingParquetOutputFormat(offset: Int)
  * RecordFilter we want to use.
  */
 private[parquet] class FilteringParquetRowInputFormat
-  extends org.apache.parquet.hadoop.ParquetInputFormat[Row] with Logging {
+  extends org.apache.parquet.hadoop.ParquetInputFormat[InternalRow] with Logging {
 
   private var fileStatuses = Map.empty[Path, FileStatus]
 
   override def createRecordReader(
       inputSplit: InputSplit,
-      taskAttemptContext: TaskAttemptContext): RecordReader[Void, Row] = {
+      taskAttemptContext: TaskAttemptContext): RecordReader[Void, InternalRow] = {
 
     import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter
 
-    val readSupport: ReadSupport[Row] = new RowReadSupport()
+    val readSupport: ReadSupport[InternalRow] = new RowReadSupport()
 
     val filter = ParquetInputFormat.getFilter(ContextUtil.getConfiguration(taskAttemptContext))
     if (!filter.isInstanceOf[NoOpFilter]) {
-      new ParquetRecordReader[Row](
+      new ParquetRecordReader[InternalRow](
         readSupport,
         filter)
     } else {
-      new ParquetRecordReader[Row](readSupport)
+      new ParquetRecordReader[InternalRow](readSupport)
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index c62c592b3f3e4..a8775a2a8fd83 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -28,7 +28,7 @@ import org.apache.parquet.io.api._
 import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, InternalRow}
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -39,12 +39,12 @@ import org.apache.spark.unsafe.types.UTF8String
  *@param root The root group converter for the record.
  */
 private[parquet] class RowRecordMaterializer(root: CatalystConverter)
-  extends RecordMaterializer[Row] {
+  extends RecordMaterializer[InternalRow] {
 
   def this(parquetSchema: MessageType, attributes: Seq[Attribute]) =
     this(CatalystConverter.createRootConverter(parquetSchema, attributes))
 
-  override def getCurrentRecord: Row = root.getCurrentRecord
+  override def getCurrentRecord: InternalRow = root.getCurrentRecord
 
   override def getRootConverter: GroupConverter = root.asInstanceOf[GroupConverter]
 }
@@ -52,13 +52,13 @@ private[parquet] class RowRecordMaterializer(root: CatalystConverter)
 /**
  * A `parquet.hadoop.api.ReadSupport` for Row objects.
  */
-private[parquet] class RowReadSupport extends ReadSupport[Row] with Logging {
+private[parquet] class RowReadSupport extends ReadSupport[InternalRow] with Logging {
 
   override def prepareForRead(
       conf: Configuration,
       stringMap: java.util.Map[String, String],
       fileSchema: MessageType,
-      readContext: ReadContext): RecordMaterializer[Row] = {
+      readContext: ReadContext): RecordMaterializer[InternalRow] = {
     log.debug(s"preparing for read with Parquet file schema $fileSchema")
     // Note: this very much imitates AvroParquet
     val parquetSchema = readContext.getRequestedSchema
@@ -133,7 +133,7 @@ private[parquet] object RowReadSupport {
 /**
  * A `parquet.hadoop.api.WriteSupport` for Row objects.
  */
-private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
+private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Logging {
 
   private[parquet] var writer: RecordConsumer = null
   private[parquet] var attributes: Array[Attribute] = null
@@ -157,7 +157,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
     log.debug(s"preparing for write with schema $attributes")
   }
 
-  override def write(record: Row): Unit = {
+  override def write(record: InternalRow): Unit = {
     val attributesSize = attributes.size
     if (attributesSize > record.size) {
       throw new IndexOutOfBoundsException(
@@ -322,7 +322,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
 
 // Optimized for non-nested rows
 private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
-  override def write(record: Row): Unit = {
+  override def write(record: InternalRow): Unit = {
     val attributesSize = attributes.size
     if (attributesSize > record.size) {
       throw new IndexOutOfBoundsException(
@@ -345,7 +345,7 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
 
   private def consumeType(
       ctype: DataType,
-      record: Row,
+      record: InternalRow,
       index: Int): Unit = {
     ctype match {
       case StringType => writer.addBinary(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 7af4eb1ca4716..bc27a9b67a6d6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -34,15 +34,15 @@ import org.apache.parquet.hadoop._
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.apache.parquet.hadoop.util.ContextUtil
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.{Partition => SparkPartition, SerializableWritable, Logging, SparkException}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD._
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql._
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{DataType, StructType}
-import org.apache.spark.sql.{Row, SQLConf, SQLContext}
 import org.apache.spark.util.Utils
 
 private[sql] class DefaultSource extends HadoopFsRelationProvider {
@@ -60,7 +60,7 @@ private[sql] class DefaultSource extends HadoopFsRelationProvider {
 private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext)
   extends OutputWriter {
 
-  private val recordWriter: RecordWriter[Void, Row] = {
+  private val recordWriter: RecordWriter[Void, InternalRow] = {
     val conf = context.getConfiguration
     val outputFormat = {
       // When appending new Parquet files to an existing Parquet file directory, to avoid
@@ -93,7 +93,7 @@ private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext
         }
       }
 
-      new ParquetOutputFormat[Row]() {
+      new ParquetOutputFormat[InternalRow]() {
         // Here we override `getDefaultWorkFile` for two reasons:
         //
         //  1. To allow appending.  We need to generate output file name based on the max available
@@ -112,7 +112,7 @@ private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext
     outputFormat.getRecordWriter(context)
   }
 
-  override def write(row: Row): Unit = recordWriter.write(null, row)
+  override def write(row: Row): Unit = recordWriter.write(null, row.asInstanceOf[InternalRow])
 
   override def close(): Unit = recordWriter.close(context)
 }
@@ -286,7 +286,7 @@ private[sql] class ParquetRelation2(
         initLocalJobFuncOpt = Some(initLocalJobFuncOpt),
         inputFormatClass = classOf[FilteringParquetRowInputFormat],
         keyClass = classOf[Void],
-        valueClass = classOf[Row]) {
+        valueClass = classOf[InternalRow]) {
 
         val cacheMetadata = useMetadataCache
 
@@ -331,7 +331,7 @@ private[sql] class ParquetRelation2(
             new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
           }
         }
-      }.values
+      }.values.map(_.asInstanceOf[Row])
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index edda3f2017fe8..4cf67439b9b8d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -17,48 +17,48 @@
 
 package org.apache.spark.sql.sources
 
-import org.apache.spark.{Logging, SerializableWritable, TaskContext}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
+import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.types.{StringType, StructType}
 import org.apache.spark.sql.{SaveMode, Strategy, execution, sources}
 import org.apache.spark.util.Utils
 import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.{Logging, SerializableWritable, TaskContext}
 
 /**
  * A Strategy for planning scans over data sources defined using the sources API.
  */
 private[sql] object DataSourceStrategy extends Strategy with Logging {
   def apply(plan: LogicalPlan): Seq[execution.SparkPlan] = plan match {
-    case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: CatalystScan)) =>
+    case PhysicalOperation(projects, filters, l @ LogicalRelation(t: CatalystScan)) =>
       pruneFilterProjectRaw(
         l,
-        projectList,
+        projects,
         filters,
-        (a, f) => t.buildScan(a, f)) :: Nil
+        (a, f) => toCatalystRDD(l, a, t.buildScan(a, f))) :: Nil
 
-    case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: PrunedFilteredScan)) =>
+    case PhysicalOperation(projects, filters, l @ LogicalRelation(t: PrunedFilteredScan)) =>
       pruneFilterProject(
         l,
-        projectList,
+        projects,
         filters,
-        (a, f) => t.buildScan(a, f)) :: Nil
+        (a, f) => toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray, f))) :: Nil
 
-    case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: PrunedScan)) =>
+    case PhysicalOperation(projects, filters, l @ LogicalRelation(t: PrunedScan)) =>
       pruneFilterProject(
         l,
-        projectList,
+        projects,
         filters,
-        (a, _) => t.buildScan(a)) :: Nil
+        (a, _) => toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray))) :: Nil
 
     // Scanning partitioned HadoopFsRelation
-    case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: HadoopFsRelation))
+    case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation))
         if t.partitionSpec.partitionColumns.nonEmpty =>
       val selectedPartitions = prunePartitions(filters, t.partitionSpec).toArray
 
@@ -80,13 +80,13 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
 
       buildPartitionedTableScan(
         l,
-        projectList,
+        projects,
         pushedFilters,
         t.partitionSpec.partitionColumns,
         selectedPartitions) :: Nil
 
     // Scanning non-partitioned HadoopFsRelation
-    case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: HadoopFsRelation)) =>
+    case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation)) =>
       // See buildPartitionedTableScan for the reason that we need to create a shard
       // broadcast HadoopConf.
       val sharedHadoopConf = SparkHadoopUtil.get.conf
@@ -94,12 +94,13 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         t.sqlContext.sparkContext.broadcast(new SerializableWritable(sharedHadoopConf))
       pruneFilterProject(
         l,
-        projectList,
+        projects,
         filters,
-        (a, f) => t.buildScan(a, f, t.paths, confBroadcast)) :: Nil
+        (a, f) =>
+          toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray, f, t.paths, confBroadcast))) :: Nil
 
     case l @ LogicalRelation(t: TableScan) =>
-      createPhysicalRDD(l.relation, l.output, t.buildScan()) :: Nil
+      execution.PhysicalRDD(l.output, toCatalystRDD(l, t.buildScan())) :: Nil
 
     case i @ logical.InsertIntoTable(
       l @ LogicalRelation(t: InsertableRelation), part, query, overwrite, false) if part.isEmpty =>
@@ -119,7 +120,6 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       filters: Seq[Expression],
       partitionColumns: StructType,
       partitions: Array[Partition]) = {
-    val output = projections.map(_.toAttribute)
     val relation = logicalRelation.relation.asInstanceOf[HadoopFsRelation]
 
     // Because we are creating one RDD per partition, we need to have a shared HadoopConf.
@@ -138,23 +138,23 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
           logicalRelation,
           projections,
           filters,
-          (requiredColumns, filters) => {
+          (columns: Seq[Attribute], filters) => {
             val partitionColNames = partitionColumns.fieldNames
 
             // Don't scan any partition columns to save I/O.  Here we are being optimistic and
             // assuming partition columns data stored in data files are always consistent with those
             // partition values encoded in partition directory paths.
-            val nonPartitionColumns = requiredColumns.filterNot(partitionColNames.contains)
+            val needed = columns.filterNot(a => partitionColNames.contains(a.name))
             val dataRows =
-              relation.buildScan(nonPartitionColumns, filters, Array(dir), confBroadcast)
+              relation.buildScan(needed.map(_.name).toArray, filters, Array(dir), confBroadcast)
 
             // Merges data values with partition values.
             mergeWithPartitionValues(
               relation.schema,
-              requiredColumns,
+              columns.map(_.name).toArray,
               partitionColNames,
               partitionValues,
-              dataRows)
+              toCatalystRDD(logicalRelation, needed, dataRows))
           })
 
       scan.execute()
@@ -167,15 +167,15 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows)
       }
 
-    createPhysicalRDD(logicalRelation.relation, output, unionedRows)
+    execution.PhysicalRDD(projections.map(_.toAttribute), unionedRows)
   }
 
   private def mergeWithPartitionValues(
       schema: StructType,
       requiredColumns: Array[String],
       partitionColumns: Array[String],
-      partitionValues: Row,
-      dataRows: RDD[Row]): RDD[Row] = {
+      partitionValues: InternalRow,
+      dataRows: RDD[InternalRow]): RDD[InternalRow] = {
     val nonPartitionColumns = requiredColumns.filterNot(partitionColumns.contains)
 
     // If output columns contain any partition column(s), we need to merge scanned data
@@ -186,13 +186,13 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         val i = partitionColumns.indexOf(name)
         if (i != -1) {
           // If yes, gets column value from partition values.
-          (mutableRow: MutableRow, dataRow: expressions.Row, ordinal: Int) => {
+          (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => {
             mutableRow(ordinal) = partitionValues(i)
           }
         } else {
           // Otherwise, inherits the value from scanned data.
           val i = nonPartitionColumns.indexOf(name)
-          (mutableRow: MutableRow, dataRow: expressions.Row, ordinal: Int) => {
+          (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => {
             mutableRow(ordinal) = dataRow(i)
           }
         }
@@ -201,7 +201,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       // Since we know for sure that this closure is serializable, we can avoid the overhead
       // of cleaning a closure for each RDD by creating our own MapPartitionsRDD. Functionally
       // this is equivalent to calling `dataRows.mapPartitions(mapPartitionsFunc)` (SPARK-7718).
-      val mapPartitionsFunc = (_: TaskContext, _: Int, iterator: Iterator[Row]) => {
+      val mapPartitionsFunc = (_: TaskContext, _: Int, iterator: Iterator[InternalRow]) => {
         val dataTypes = requiredColumns.map(schema(_).dataType)
         val mutableRow = new SpecificMutableRow(dataTypes)
         iterator.map { dataRow =>
@@ -210,7 +210,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
             mergers(i)(mutableRow, dataRow, i)
             i += 1
           }
-          mutableRow.asInstanceOf[expressions.Row]
+          mutableRow.asInstanceOf[InternalRow]
         }
       }
 
@@ -256,26 +256,26 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
   // Based on Public API.
   protected def pruneFilterProject(
       relation: LogicalRelation,
-      projectList: Seq[NamedExpression],
+      projects: Seq[NamedExpression],
       filterPredicates: Seq[Expression],
-      scanBuilder: (Array[String], Array[Filter]) => RDD[Row]) = {
+      scanBuilder: (Seq[Attribute], Array[Filter]) => RDD[InternalRow]) = {
     pruneFilterProjectRaw(
       relation,
-      projectList,
+      projects,
       filterPredicates,
       (requestedColumns, pushedFilters) => {
-        scanBuilder(requestedColumns.map(_.name).toArray, selectFilters(pushedFilters).toArray)
+        scanBuilder(requestedColumns, selectFilters(pushedFilters).toArray)
       })
   }
 
   // Based on Catalyst expressions.
   protected def pruneFilterProjectRaw(
       relation: LogicalRelation,
-      projectList: Seq[NamedExpression],
+      projects: Seq[NamedExpression],
       filterPredicates: Seq[Expression],
-      scanBuilder: (Seq[Attribute], Seq[Expression]) => RDD[Row]) = {
+      scanBuilder: (Seq[Attribute], Seq[Expression]) => RDD[InternalRow]) = {
 
-    val projectSet = AttributeSet(projectList.flatMap(_.references))
+    val projectSet = AttributeSet(projects.flatMap(_.references))
     val filterSet = AttributeSet(filterPredicates.flatMap(_.references))
     val filterCondition = filterPredicates.reduceLeftOption(expressions.And)
 
@@ -283,38 +283,47 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       case a: AttributeReference => relation.attributeMap(a) // Match original case of attributes.
     }}
 
-    if (projectList.map(_.toAttribute) == projectList &&
-        projectSet.size == projectList.size &&
+    if (projects.map(_.toAttribute) == projects &&
+        projectSet.size == projects.size &&
         filterSet.subsetOf(projectSet)) {
       // When it is possible to just use column pruning to get the right projection and
       // when the columns of this projection are enough to evaluate all filter conditions,
       // just do a scan followed by a filter, with no extra project.
       val requestedColumns =
-        projectList.asInstanceOf[Seq[Attribute]] // Safe due to if above.
+        projects.asInstanceOf[Seq[Attribute]] // Safe due to if above.
           .map(relation.attributeMap)            // Match original case of attributes.
 
-      val scan = createPhysicalRDD(relation.relation, projectList.map(_.toAttribute),
-          scanBuilder(requestedColumns, pushedFilters))
+      val scan = execution.PhysicalRDD(projects.map(_.toAttribute),
+        scanBuilder(requestedColumns, pushedFilters))
       filterCondition.map(execution.Filter(_, scan)).getOrElse(scan)
     } else {
       val requestedColumns = (projectSet ++ filterSet).map(relation.attributeMap).toSeq
 
-      val scan = createPhysicalRDD(relation.relation, requestedColumns,
+      val scan = execution.PhysicalRDD(requestedColumns,
         scanBuilder(requestedColumns, pushedFilters))
-      execution.Project(projectList, filterCondition.map(execution.Filter(_, scan)).getOrElse(scan))
+      execution.Project(projects, filterCondition.map(execution.Filter(_, scan)).getOrElse(scan))
     }
   }
 
-  private[this] def createPhysicalRDD(
-      relation: BaseRelation,
+  /**
+   * Convert RDD of Row into RDD of InternalRow with objects in catalyst types
+   */
+  private[this] def toCatalystRDD(
+      relation: LogicalRelation,
       output: Seq[Attribute],
-      rdd: RDD[Row]): SparkPlan = {
-    val converted = if (relation.needConversion) {
-      execution.RDDConversions.rowToRowRdd(rdd, output.map(_.dataType))
+      rdd: RDD[Row]): RDD[InternalRow] = {
+    if (relation.relation.needConversion) {
+      execution.RDDConversions.rowToRowRdd(rdd.asInstanceOf[RDD[Row]], output.map(_.dataType))
     } else {
-      rdd
+      rdd.map(_.asInstanceOf[InternalRow])
     }
-    execution.PhysicalRDD(output, converted)
+  }
+
+  /**
+   * Convert RDD of Row into RDD of InternalRow with objects in catalyst types
+   */
+  private[this] def toCatalystRDD(relation: LogicalRelation, rdd: RDD[Row]): RDD[InternalRow] = {
+    toCatalystRDD(relation, relation.output, rdd)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
index 7a2b5b949dd4e..c6f535dde7676 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
@@ -25,12 +25,11 @@ import scala.util.Try
 
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.util.Shell
-
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
 import org.apache.spark.sql.types._
 
-private[sql] case class Partition(values: Row, path: String)
+private[sql] case class Partition(values: InternalRow, path: String)
 
 private[sql] case class PartitionSpec(partitionColumns: StructType, partitions: Seq[Partition])
 
@@ -100,7 +99,7 @@ private[sql] object PartitioningUtils {
       // Finally, we create `Partition`s based on paths and resolved partition values.
       val partitions = resolvedPartitionValues.zip(pathsWithPartitionValues).map {
         case (PartitionValues(_, literals), (path, _)) =>
-          Partition(Row.fromSeq(literals.map(_.value)), path.toString)
+          Partition(InternalRow.fromSeq(literals.map(_.value)), path.toString)
       }
 
       PartitionSpec(StructType(fields), partitions)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index c94199bfcd233..1763cee419572 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext, SaveMode}
+import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext, SaveMode}
 
 private[sql] case class InsertIntoDataSource(
     logicalRelation: LogicalRelation,
@@ -44,18 +44,17 @@ private[sql] case class InsertIntoDataSource(
     overwrite: Boolean)
   extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val relation = logicalRelation.relation.asInstanceOf[InsertableRelation]
     val data = DataFrame(sqlContext, query)
     // Apply the schema of the existing table to the new data.
-    val df = sqlContext.createDataFrame(
-      data.queryExecution.toRdd, logicalRelation.schema, needsConversion = false)
+    val df = sqlContext.internalCreateDataFrame(data.queryExecution.toRdd, logicalRelation.schema)
     relation.insert(df, overwrite)
 
     // Invalidate the cache.
     sqlContext.cacheManager.invalidateCache(logicalRelation)
 
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 }
 
@@ -65,7 +64,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
     mode: SaveMode)
   extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     require(
       relation.paths.length == 1,
       s"Cannot write to multiple destinations: ${relation.paths.mkString(",")}")
@@ -90,7 +89,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
     if (doInsertion) {
       val job = new Job(hadoopConf)
       job.setOutputKeyClass(classOf[Void])
-      job.setOutputValueClass(classOf[Row])
+      job.setOutputValueClass(classOf[InternalRow])
       FileOutputFormat.setOutputPath(job, qualifiedOutputPath)
 
       // We create a DataFrame by applying the schema of relation to the data to make sure.
@@ -103,10 +102,8 @@ private[sql] case class InsertIntoHadoopFsRelation(
         val project = Project(
           relation.schema.map(field => new UnresolvedAttribute(Seq(field.name))), query)
 
-        sqlContext.createDataFrame(
-          DataFrame(sqlContext, project).queryExecution.toRdd,
-          relation.schema,
-          needsConversion = false)
+        sqlContext.internalCreateDataFrame(
+          DataFrame(sqlContext, project).queryExecution.toRdd, relation.schema)
       }
 
       val partitionColumns = relation.partitionColumns.fieldNames
@@ -119,7 +116,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
       }
     }
 
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 
   private def insert(writerContainer: BaseWriterContainer, df: DataFrame): Unit = {
@@ -141,22 +138,19 @@ private[sql] case class InsertIntoHadoopFsRelation(
       throw new SparkException("Job aborted.", cause)
     }
 
-    def writeRows(taskContext: TaskContext, iterator: Iterator[Row]): Unit = {
+    def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = {
       // If anything below fails, we should abort the task.
       try {
         writerContainer.executorSideSetup(taskContext)
 
-        if (needsConversion) {
-          val converter = CatalystTypeConverters.createToScalaConverter(dataSchema)
-          while (iterator.hasNext) {
-            val row = converter(iterator.next()).asInstanceOf[Row]
-            writerContainer.outputWriterForRow(row).write(row)
-          }
+        val converter = if (needsConversion) {
+          CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row]
         } else {
-          while (iterator.hasNext) {
-            val row = iterator.next()
-            writerContainer.outputWriterForRow(row).write(row)
-          }
+          r: InternalRow => r.asInstanceOf[Row]
+        }
+        while (iterator.hasNext) {
+          val row = converter(iterator.next())
+          writerContainer.outputWriterForRow(row).write(row)
         }
 
         writerContainer.commitTask()
@@ -210,32 +204,28 @@ private[sql] case class InsertIntoHadoopFsRelation(
       throw new SparkException("Job aborted.", cause)
     }
 
-    def writeRows(taskContext: TaskContext, iterator: Iterator[Row]): Unit = {
+    def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = {
       // If anything below fails, we should abort the task.
       try {
         writerContainer.executorSideSetup(taskContext)
 
         val partitionProj = newProjection(codegenEnabled, partitionOutput, output)
         val dataProj = newProjection(codegenEnabled, dataOutput, output)
-
-        if (needsConversion) {
-          val converter = CatalystTypeConverters.createToScalaConverter(dataSchema)
-          while (iterator.hasNext) {
-            val row = iterator.next()
-            val partitionPart = partitionProj(row)
-            val dataPart = dataProj(row)
-            val convertedDataPart = converter(dataPart).asInstanceOf[Row]
-            writerContainer.outputWriterForRow(partitionPart).write(convertedDataPart)
-          }
+        val dataConverter: InternalRow => Row = if (needsConversion) {
+          CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row]
         } else {
-          val partitionSchema = StructType.fromAttributes(partitionOutput)
-          val converter = CatalystTypeConverters.createToScalaConverter(partitionSchema)
-          while (iterator.hasNext) {
-            val row = iterator.next()
-            val partitionPart = converter(partitionProj(row)).asInstanceOf[Row]
-            val dataPart = dataProj(row)
-            writerContainer.outputWriterForRow(partitionPart).write(dataPart)
-          }
+          r: InternalRow => r.asInstanceOf[Row]
+        }
+        val partitionSchema = StructType.fromAttributes(partitionOutput)
+        val partConverter: InternalRow => Row =
+          CatalystTypeConverters.createToScalaConverter(partitionSchema)
+            .asInstanceOf[InternalRow => Row]
+
+        while (iterator.hasNext) {
+          val row = iterator.next()
+          val partitionPart = partConverter(partitionProj(row))
+          val dataPart = dataConverter(dataProj(row))
+          writerContainer.outputWriterForRow(partitionPart).write(dataPart)
         }
 
         writerContainer.commitTask()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index 20afd60cb7767..01c67db232569 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -26,7 +26,7 @@ import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Row}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InternalRow}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.types._
@@ -404,7 +404,7 @@ private[sql] case class CreateTempTableUsing(
     provider: String,
     options: Map[String, String]) extends RunnableCommand {
 
-  def run(sqlContext: SQLContext): Seq[Row] = {
+  def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val resolved = ResolvedDataSource(
       sqlContext, userSpecifiedSchema, Array.empty[String], provider, options)
     sqlContext.registerDataFrameAsTable(
@@ -421,7 +421,7 @@ private[sql] case class CreateTempTableUsingAsSelect(
     options: Map[String, String],
     query: LogicalPlan) extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val df = DataFrame(sqlContext, query)
     val resolved = ResolvedDataSource(sqlContext, provider, partitionColumns, mode, options, df)
     sqlContext.registerDataFrameAsTable(
@@ -434,7 +434,7 @@ private[sql] case class CreateTempTableUsingAsSelect(
 private[sql] case class RefreshTable(databaseName: String, tableName: String)
   extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     // Refresh the given table's metadata first.
     sqlContext.catalog.refreshTable(databaseName, tableName)
 
@@ -453,7 +453,7 @@ private[sql] case class RefreshTable(databaseName: String, tableName: String)
       sqlContext.cacheManager.cacheQuery(df, Some(tableName))
     }
 
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index d1547fb1e4abb..27534a1f48ce2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -28,7 +28,8 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SerializableWritable
-import org.apache.spark.sql.{Row, _}
+import org.apache.spark.sql.execution.RDDConversions
+import org.apache.spark.sql.{DataFrame, Row, SaveMode, SQLContext}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
 import org.apache.spark.sql.types.StructType
@@ -195,6 +196,8 @@ abstract class BaseRelation {
    *  java.lang.String -> UTF8String
    *  java.lang.Decimal -> Decimal
    *
+   * If `needConversion` is `false`, buildScan() should return an [[RDD]] of [[InternalRow]]
+   *
    * Note: The internal representation is not stable across releases and thus data sources outside
    * of Spark SQL should leave this as true.
    *
@@ -443,7 +446,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
               val castedValues = partitionSchema.zip(literals).map { case (field, literal) =>
                 Cast(literal, field.dataType).eval()
               }
-              p.copy(values = Row.fromSeq(castedValues))
+              p.copy(values = InternalRow.fromSeq(castedValues))
             }
             PartitionSpec(partitionSchema, castedPartitions)
           }
@@ -579,15 +582,21 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
       BoundReference(dataSchema.fieldIndex(col), field.dataType, field.nullable)
     }.toSeq
 
-    buildScan(inputFiles).mapPartitions { rows =>
+    val rdd = buildScan(inputFiles)
+    val converted =
+      if (needConversion) {
+        RDDConversions.rowToRowRdd(rdd, dataSchema.fields.map(_.dataType))
+      } else {
+        rdd.map(_.asInstanceOf[InternalRow])
+      }
+    converted.mapPartitions { rows =>
       val buildProjection = if (codegenEnabled) {
         GenerateMutableProjection.generate(requiredOutput, dataSchema.toAttributes)
       } else {
         () => new InterpretedMutableProjection(requiredOutput, dataSchema.toAttributes)
       }
-
       val mutableProjection = buildProjection()
-      rows.map(mutableProjection)
+      rows.map(r => mutableProjection(r).asInstanceOf[Row])
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
index 17a3cec48b856..eb3e913322062 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
@@ -94,7 +94,7 @@ class CachedTableSuite extends QueryTest {
   }
 
   test("too big for memory") {
-    val data = "*" * 10000
+    val data = "*" * 1000
     ctx.sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).toDF()
       .registerTempTable("bigData")
     ctx.table("bigData").persist(StorageLevel.MEMORY_AND_DISK)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
index 16836628cb73a..1f37455dd0bc4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
@@ -18,25 +18,28 @@
 package org.apache.spark.sql.columnar
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.catalyst.expressions.InternalRow
 import org.apache.spark.sql.types._
 
 class ColumnStatsSuite extends SparkFunSuite {
-  testColumnStats(classOf[ByteColumnStats], BYTE, Row(Byte.MaxValue, Byte.MinValue, 0))
-  testColumnStats(classOf[ShortColumnStats], SHORT, Row(Short.MaxValue, Short.MinValue, 0))
-  testColumnStats(classOf[IntColumnStats], INT, Row(Int.MaxValue, Int.MinValue, 0))
-  testColumnStats(classOf[LongColumnStats], LONG, Row(Long.MaxValue, Long.MinValue, 0))
-  testColumnStats(classOf[FloatColumnStats], FLOAT, Row(Float.MaxValue, Float.MinValue, 0))
-  testColumnStats(classOf[DoubleColumnStats], DOUBLE, Row(Double.MaxValue, Double.MinValue, 0))
-  testColumnStats(classOf[FixedDecimalColumnStats], FIXED_DECIMAL(15, 10), Row(null, null, 0))
-  testColumnStats(classOf[StringColumnStats], STRING, Row(null, null, 0))
-  testColumnStats(classOf[DateColumnStats], DATE, Row(Int.MaxValue, Int.MinValue, 0))
-  testColumnStats(classOf[TimestampColumnStats], TIMESTAMP, Row(Long.MaxValue, Long.MinValue, 0))
+  testColumnStats(classOf[ByteColumnStats], BYTE, InternalRow(Byte.MaxValue, Byte.MinValue, 0))
+  testColumnStats(classOf[ShortColumnStats], SHORT, InternalRow(Short.MaxValue, Short.MinValue, 0))
+  testColumnStats(classOf[IntColumnStats], INT, InternalRow(Int.MaxValue, Int.MinValue, 0))
+  testColumnStats(classOf[LongColumnStats], LONG, InternalRow(Long.MaxValue, Long.MinValue, 0))
+  testColumnStats(classOf[FloatColumnStats], FLOAT, InternalRow(Float.MaxValue, Float.MinValue, 0))
+  testColumnStats(classOf[DoubleColumnStats], DOUBLE,
+    InternalRow(Double.MaxValue, Double.MinValue, 0))
+  testColumnStats(classOf[FixedDecimalColumnStats],
+    FIXED_DECIMAL(15, 10), InternalRow(null, null, 0))
+  testColumnStats(classOf[StringColumnStats], STRING, InternalRow(null, null, 0))
+  testColumnStats(classOf[DateColumnStats], DATE, InternalRow(Int.MaxValue, Int.MinValue, 0))
+  testColumnStats(classOf[TimestampColumnStats], TIMESTAMP,
+    InternalRow(Long.MaxValue, Long.MinValue, 0))
 
   def testColumnStats[T <: AtomicType, U <: ColumnStats](
       columnStatsClass: Class[U],
       columnType: NativeColumnType[T],
-      initialStatistics: Row): Unit = {
+      initialStatistics: InternalRow): Unit = {
 
     val columnStatsName = columnStatsClass.getSimpleName
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
index 1bc7eb36311bb..7c86eae3f77fd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
@@ -19,14 +19,11 @@ package org.apache.spark.sql.columnar
 
 import scala.collection.immutable.HashSet
 import scala.util.Random
-
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
-import org.apache.spark.sql.types.{AtomicType, DataType, Decimal}
 import org.apache.spark.sql.types.{DataType, Decimal, AtomicType}
 import org.apache.spark.unsafe.types.UTF8String
 
-
 object ColumnarTestUtils {
   def makeNullRow(length: Int): GenericMutableRow = {
     val row = new GenericMutableRow(length)
@@ -79,9 +76,9 @@ object ColumnarTestUtils {
 
   def makeRandomRow(
       head: ColumnType[_ <: DataType, _],
-      tail: ColumnType[_ <: DataType, _]*): Row = makeRandomRow(Seq(head) ++ tail)
+      tail: ColumnType[_ <: DataType, _]*): InternalRow = makeRandomRow(Seq(head) ++ tail)
 
-  def makeRandomRow(columnTypes: Seq[ColumnType[_ <: DataType, _]]): Row = {
+  def makeRandomRow(columnTypes: Seq[ColumnType[_ <: DataType, _]]): InternalRow = {
     val row = new GenericMutableRow(columnTypes.length)
     makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
       row(index) = value
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
index fa3b8144c086e..12f95eb557c04 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
@@ -20,9 +20,8 @@ package org.apache.spark.sql.columnar
 import java.sql.{Date, Timestamp}
 
 import org.apache.spark.sql.TestData._
-import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{QueryTest, TestData}
+import org.apache.spark.sql.{QueryTest, Row, TestData}
 import org.apache.spark.storage.StorageLevel.MEMORY_ONLY
 
 class InMemoryColumnarQuerySuite extends QueryTest {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
index 20d65a74e3b7a..f606e2133bedc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
@@ -18,10 +18,10 @@
 package org.apache.spark.sql.columnar.compression
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
-import org.apache.spark.sql.columnar.{NoopColumnStats, BOOLEAN}
 import org.apache.spark.sql.columnar.ColumnarTestUtils._
+import org.apache.spark.sql.columnar.{BOOLEAN, NoopColumnStats}
 
 class BooleanBitSetSuite extends SparkFunSuite {
   import BooleanBitSet._
@@ -32,7 +32,7 @@ class BooleanBitSetSuite extends SparkFunSuite {
     // -------------
 
     val builder = TestCompressibleColumnBuilder(new NoopColumnStats, BOOLEAN, BooleanBitSet)
-    val rows = Seq.fill[Row](count)(makeRandomRow(BOOLEAN))
+    val rows = Seq.fill[InternalRow](count)(makeRandomRow(BOOLEAN))
     val values = rows.map(_(0))
 
     rows.foreach(builder.appendFrom(_, 0))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 45a7e8fe68f72..3e27f58a92d01 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -18,16 +18,15 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.{SQLConf, execution}
-import org.apache.spark.sql.functions._
 import org.apache.spark.sql.TestData._
-import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, ShuffledHashJoin}
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.test.TestSQLContext._
 import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.test.TestSQLContext.planner._
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.{Row, SQLConf, execution}
 
 
 class PlannerSuite extends SparkFunSuite {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
index 5290c28cfca02..71db6a2159857 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions.{Projection, Row}
+import org.apache.spark.sql.catalyst.expressions.{Projection, InternalRow}
 import org.apache.spark.util.collection.CompactBuffer
 
 
@@ -26,37 +26,37 @@ class HashedRelationSuite extends SparkFunSuite {
 
   // Key is simply the record itself
   private val keyProjection = new Projection {
-    override def apply(row: Row): Row = row
+    override def apply(row: InternalRow): InternalRow = row
   }
 
   test("GeneralHashedRelation") {
-    val data = Array(Row(0), Row(1), Row(2), Row(2))
+    val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2))
     val hashed = HashedRelation(data.iterator, keyProjection)
     assert(hashed.isInstanceOf[GeneralHashedRelation])
 
-    assert(hashed.get(data(0)) == CompactBuffer[Row](data(0)))
-    assert(hashed.get(data(1)) == CompactBuffer[Row](data(1)))
-    assert(hashed.get(Row(10)) === null)
+    assert(hashed.get(data(0)) == CompactBuffer[InternalRow](data(0)))
+    assert(hashed.get(data(1)) == CompactBuffer[InternalRow](data(1)))
+    assert(hashed.get(InternalRow(10)) === null)
 
-    val data2 = CompactBuffer[Row](data(2))
+    val data2 = CompactBuffer[InternalRow](data(2))
     data2 += data(2)
     assert(hashed.get(data(2)) == data2)
   }
 
   test("UniqueKeyHashedRelation") {
-    val data = Array(Row(0), Row(1), Row(2))
+    val data = Array(InternalRow(0), InternalRow(1), InternalRow(2))
     val hashed = HashedRelation(data.iterator, keyProjection)
     assert(hashed.isInstanceOf[UniqueKeyHashedRelation])
 
-    assert(hashed.get(data(0)) == CompactBuffer[Row](data(0)))
-    assert(hashed.get(data(1)) == CompactBuffer[Row](data(1)))
-    assert(hashed.get(data(2)) == CompactBuffer[Row](data(2)))
-    assert(hashed.get(Row(10)) === null)
+    assert(hashed.get(data(0)) == CompactBuffer[InternalRow](data(0)))
+    assert(hashed.get(data(1)) == CompactBuffer[InternalRow](data(1)))
+    assert(hashed.get(data(2)) == CompactBuffer[InternalRow](data(2)))
+    assert(hashed.get(InternalRow(10)) === null)
 
     val uniqHashed = hashed.asInstanceOf[UniqueKeyHashedRelation]
     assert(uniqHashed.getValue(data(0)) == data(0))
     assert(uniqHashed.getValue(data(1)) == data(1))
     assert(uniqHashed.getValue(data(2)) == data(2))
-    assert(uniqHashed.getValue(Row(10)) == null)
+    assert(uniqHashed.getValue(InternalRow(10)) == null)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
index 17f5f9a491e6b..fa5d4eca05d9f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{Column, DataFrame, QueryTest, SQLConf}
+import org.apache.spark.sql.{Column, DataFrame, QueryTest, Row, SQLConf}
 
 /**
  * A test suite that tests Parquet filter2 API based filter pushdown optimization.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index 46b25859d9a68..fc827bc4ca11b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -35,11 +35,10 @@ import org.apache.parquet.schema.{MessageType, MessageTypeParser}
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.SparkException
+import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.ScalaReflection
-import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf, SaveMode}
 
 // Write support class for nested groups: ParquetWriter initializes GroupWriteSupport
 // with an empty configuration (it is after all not intended to be used in this way?)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 3240079483545..01df189d1f3be 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -26,11 +26,13 @@ import scala.collection.mutable.ArrayBuffer
 import com.google.common.io.Files
 import org.apache.hadoop.fs.Path
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Literal
 import org.apache.spark.sql.sources.PartitioningUtils._
 import org.apache.spark.sql.sources.{LogicalRelation, Partition, PartitionSpec}
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{Column, QueryTest, Row, SQLContext}
+import org.apache.spark.sql._
+import org.apache.spark.unsafe.types.UTF8String
 
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
@@ -114,7 +116,8 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         StructType(Seq(
           StructField("a", IntegerType),
           StructField("b", StringType))),
-        Seq(Partition(Row(10, "hello"), "hdfs://host:9000/path/a=10/b=hello"))))
+        Seq(Partition(InternalRow(10, UTF8String.fromString("hello")),
+          "hdfs://host:9000/path/a=10/b=hello"))))
 
     check(Seq(
       "hdfs://host:9000/path/a=10/b=20",
@@ -124,8 +127,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           StructField("a", DoubleType),
           StructField("b", StringType))),
         Seq(
-          Partition(Row(10, "20"), "hdfs://host:9000/path/a=10/b=20"),
-          Partition(Row(10.5, "hello"), "hdfs://host:9000/path/a=10.5/b=hello"))))
+          Partition(InternalRow(10, UTF8String.fromString("20")),
+            "hdfs://host:9000/path/a=10/b=20"),
+          Partition(InternalRow(10.5, UTF8String.fromString("hello")),
+            "hdfs://host:9000/path/a=10.5/b=hello"))))
 
     check(Seq(
       "hdfs://host:9000/path/_temporary",
@@ -143,8 +148,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           StructField("a", DoubleType),
           StructField("b", StringType))),
         Seq(
-          Partition(Row(10, "20"), "hdfs://host:9000/path/a=10/b=20"),
-          Partition(Row(10.5, "hello"), "hdfs://host:9000/path/a=10.5/b=hello"))))
+          Partition(InternalRow(10, UTF8String.fromString("20")),
+            "hdfs://host:9000/path/a=10/b=20"),
+          Partition(InternalRow(10.5, UTF8String.fromString("hello")),
+            "hdfs://host:9000/path/a=10.5/b=hello"))))
 
     check(Seq(
       s"hdfs://host:9000/path/a=10/b=20",
@@ -154,8 +161,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           StructField("a", IntegerType),
           StructField("b", StringType))),
         Seq(
-          Partition(Row(10, "20"), s"hdfs://host:9000/path/a=10/b=20"),
-          Partition(Row(null, "hello"), s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello"))))
+          Partition(InternalRow(10, UTF8String.fromString("20")),
+            s"hdfs://host:9000/path/a=10/b=20"),
+          Partition(InternalRow(null, UTF8String.fromString("hello")),
+            s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello"))))
 
     check(Seq(
       s"hdfs://host:9000/path/a=10/b=$defaultPartitionName",
@@ -165,8 +174,9 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           StructField("a", DoubleType),
           StructField("b", StringType))),
         Seq(
-          Partition(Row(10, null), s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"),
-          Partition(Row(10.5, null), s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"))))
+          Partition(InternalRow(10, null), s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"),
+          Partition(InternalRow(10.5, null),
+            s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"))))
 
     check(Seq(
       s"hdfs://host:9000/path1",
@@ -185,7 +195,8 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         StructType(Seq(
           StructField("a", StringType),
           StructField("b", StringType))),
-        Seq(Partition(Row("10", "hello"), "hdfs://host:9000/path/a=10/b=hello"))))
+        Seq(Partition(InternalRow(UTF8String.fromString("10"), UTF8String.fromString("hello")),
+          "hdfs://host:9000/path/a=10/b=hello"))))
 
     check(Seq(
       "hdfs://host:9000/path/a=10/b=20",
@@ -195,8 +206,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           StructField("a", StringType),
           StructField("b", StringType))),
         Seq(
-          Partition(Row("10", "20"), "hdfs://host:9000/path/a=10/b=20"),
-          Partition(Row("10.5", "hello"), "hdfs://host:9000/path/a=10.5/b=hello"))))
+          Partition(InternalRow(UTF8String.fromString("10"), UTF8String.fromString("20")),
+            "hdfs://host:9000/path/a=10/b=20"),
+          Partition(InternalRow(UTF8String.fromString("10.5"), UTF8String.fromString("hello")),
+            "hdfs://host:9000/path/a=10.5/b=hello"))))
 
     check(Seq(
       "hdfs://host:9000/path/_temporary",
@@ -214,8 +227,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           StructField("a", StringType),
           StructField("b", StringType))),
         Seq(
-          Partition(Row("10", "20"), "hdfs://host:9000/path/a=10/b=20"),
-          Partition(Row("10.5", "hello"), "hdfs://host:9000/path/a=10.5/b=hello"))))
+          Partition(InternalRow(UTF8String.fromString("10"), UTF8String.fromString("20")),
+            "hdfs://host:9000/path/a=10/b=20"),
+          Partition(InternalRow(UTF8String.fromString("10.5"), UTF8String.fromString("hello")),
+            "hdfs://host:9000/path/a=10.5/b=hello"))))
 
     check(Seq(
       s"hdfs://host:9000/path/a=10/b=20",
@@ -225,8 +240,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           StructField("a", StringType),
           StructField("b", StringType))),
         Seq(
-          Partition(Row("10", "20"), s"hdfs://host:9000/path/a=10/b=20"),
-          Partition(Row(null, "hello"), s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello"))))
+          Partition(InternalRow(UTF8String.fromString("10"), UTF8String.fromString("20")),
+            s"hdfs://host:9000/path/a=10/b=20"),
+          Partition(InternalRow(null, UTF8String.fromString("hello")),
+            s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello"))))
 
     check(Seq(
       s"hdfs://host:9000/path/a=10/b=$defaultPartitionName",
@@ -236,8 +253,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           StructField("a", StringType),
           StructField("b", StringType))),
         Seq(
-          Partition(Row("10", null), s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"),
-          Partition(Row("10.5", null), s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"))))
+          Partition(InternalRow(UTF8String.fromString("10"), null),
+            s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"),
+          Partition(InternalRow(UTF8String.fromString("10.5"), null),
+            s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"))))
 
     check(Seq(
       s"hdfs://host:9000/path1",
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
index de0107a361815..be3b34d5b9b70 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -20,15 +20,13 @@ package org.apache.spark.sql.parquet
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{SQLConf, QueryTest}
-import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.{QueryTest, Row, SQLConf}
 
 /**
  * A test suite that tests various Parquet queries.
  */
 class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
   lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
-  import sqlContext.implicits._
   import sqlContext.sql
 
   test("simple select queries") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
index 51d22b6a1378a..5fc53f7012994 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
@@ -19,7 +19,9 @@ package org.apache.spark.sql.sources
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 class DDLScanSource extends RelationProvider {
   override def createRelation(
@@ -56,9 +58,12 @@ case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlCo
       )
     ))
 
+  override def needConversion: Boolean = false
 
   override def buildScan(): RDD[Row] = {
-    sqlContext.sparkContext.parallelize(from to to).map(e => Row(s"people$e", e * 2))
+    sqlContext.sparkContext.parallelize(from to to).map { e =>
+      InternalRow(UTF8String.fromString(s"people$e"), e * 2)
+    }
   }
 }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
index 5d4ecd810862c..4a4d8d2548514 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
@@ -19,9 +19,13 @@ package org.apache.spark.sql.sources
 
 import java.sql.{Timestamp, Date}
 
+
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.util.DateUtils
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 class DefaultSource extends SimpleScanSource
 
@@ -60,10 +64,12 @@ case class AllDataTypesScan(
 
   override def schema: StructType = userSpecifiedSchema
 
+  override def needConversion: Boolean = false
+
   override def buildScan(): RDD[Row] = {
     sqlContext.sparkContext.parallelize(from to to).map { i =>
-      Row(
-        s"str_$i",
+      InternalRow(
+        UTF8String.fromString(s"str_$i"),
         s"str_$i".getBytes(),
         i % 2 == 0,
         i.toByte,
@@ -72,17 +78,18 @@ case class AllDataTypesScan(
         i.toLong,
         i.toFloat,
         i.toDouble,
-        new java.math.BigDecimal(i),
-        new java.math.BigDecimal(i),
-        new Date(1970, 1, 1),
-        new Timestamp(20000 + i),
-        s"varchar_$i",
+        Decimal(new java.math.BigDecimal(i)),
+        Decimal(new java.math.BigDecimal(i)),
+        DateUtils.fromJavaDate(new Date(1970, 1, 1)),
+        DateUtils.fromJavaTimestamp(new Timestamp(20000 + i)),
+        UTF8String.fromString(s"varchar_$i"),
         Seq(i, i + 1),
-        Seq(Map(s"str_$i" -> Row(i.toLong))),
+        Seq(Map(UTF8String.fromString(s"str_$i") -> InternalRow(i.toLong))),
         Map(i -> i.toString),
-        Map(Map(s"str_$i" -> i.toFloat) -> Row(i.toLong)),
+        Map(Map(UTF8String.fromString(s"str_$i") -> i.toFloat) -> InternalRow(i.toLong)),
         Row(i, i.toString),
-        Row(Seq(s"str_$i", s"str_${i + 1}"), Row(Seq(new Date(1970, 1, i + 1)))))
+        Row(Seq(UTF8String.fromString(s"str_$i"), UTF8String.fromString(s"str_${i + 1}")),
+          InternalRow(Seq(DateUtils.fromJavaDate(new Date(1970, 1, i + 1))))))
     }
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index fd01a8722bce6..d4f1ae8ee01d9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -366,7 +366,7 @@ private[hive] trait HiveInspectors {
       (o: Any) => {
         if (o != null) {
           val struct = soi.create()
-          (soi.getAllStructFieldRefs, wrappers, o.asInstanceOf[Row].toSeq).zipped.foreach {
+          (soi.getAllStructFieldRefs, wrappers, o.asInstanceOf[InternalRow].toSeq).zipped.foreach {
             (field, wrapper, data) => soi.setStructFieldData(struct, field, wrapper(data))
           }
           struct
@@ -474,7 +474,7 @@ private[hive] trait HiveInspectors {
     }
     case x: SettableStructObjectInspector =>
       val fieldRefs = x.getAllStructFieldRefs
-      val row = a.asInstanceOf[Row]
+      val row = a.asInstanceOf[InternalRow]
       // 1. create the pojo (most likely) object
       val result = x.create()
       var i = 0
@@ -490,7 +490,7 @@ private[hive] trait HiveInspectors {
       result
     case x: StructObjectInspector =>
       val fieldRefs = x.getAllStructFieldRefs
-      val row = a.asInstanceOf[Row]
+      val row = a.asInstanceOf[InternalRow]
       val result = new java.util.ArrayList[AnyRef](fieldRefs.length)
       var i = 0
       while (i < fieldRefs.length) {
@@ -517,7 +517,7 @@ private[hive] trait HiveInspectors {
   }
 
   def wrap(
-      row: Row,
+      row: InternalRow,
       inspectors: Seq[ObjectInspector],
       cache: Array[AnyRef]): Array[AnyRef] = {
     var i = 0
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 5a4651a887b7c..619ef63223241 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -302,7 +302,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
       val partitionColumnDataTypes = partitionSchema.map(_.dataType)
       val partitions = metastoreRelation.hiveQlPartitions.map { p =>
         val location = p.getLocation
-        val values = Row.fromSeq(p.getValues.zip(partitionColumnDataTypes).map {
+        val values = InternalRow.fromSeq(p.getValues.zip(partitionColumnDataTypes).map {
           case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null)
         })
         ParquetPartition(values, location)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index c6b65106452bf..452b7f0bcc749 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate
-import org.apache.spark.sql.catalyst.expressions.{Row, _}
+import org.apache.spark.sql.catalyst.expressions.{InternalRow, _}
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
@@ -137,7 +137,7 @@ private[hive] trait HiveStrategies {
             val partitionLocations = partitions.map(_.getLocation)
 
             if (partitionLocations.isEmpty) {
-              PhysicalRDD(plan.output, sparkContext.emptyRDD[Row]) :: Nil
+              PhysicalRDD(plan.output, sparkContext.emptyRDD[InternalRow]) :: Nil
             } else {
               hiveContext
                 .read.parquet(partitionLocations: _*)
@@ -165,7 +165,7 @@ private[hive] trait HiveStrategies {
           // TODO: Remove this hack for Spark 1.3.
           case iae: java.lang.IllegalArgumentException
               if iae.getMessage.contains("Can not create a Path from an empty string") =>
-            PhysicalRDD(plan.output, sparkContext.emptyRDD[Row]) :: Nil
+            PhysicalRDD(plan.output, sparkContext.emptyRDD[InternalRow]) :: Nil
         }
       case _ => Nil
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index d3c82d8c2e326..485810320f3c1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -41,9 +41,9 @@ import org.apache.spark.util.Utils
  * A trait for subclasses that handle table scans.
  */
 private[hive] sealed trait TableReader {
-  def makeRDDForTable(hiveTable: HiveTable): RDD[Row]
+  def makeRDDForTable(hiveTable: HiveTable): RDD[InternalRow]
 
-  def makeRDDForPartitionedTable(partitions: Seq[HivePartition]): RDD[Row]
+  def makeRDDForPartitionedTable(partitions: Seq[HivePartition]): RDD[InternalRow]
 }
 
 
@@ -74,7 +74,7 @@ class HadoopTableReader(
   private val _broadcastedHiveConf =
     sc.sparkContext.broadcast(new SerializableWritable(hiveExtraConf))
 
-  override def makeRDDForTable(hiveTable: HiveTable): RDD[Row] =
+  override def makeRDDForTable(hiveTable: HiveTable): RDD[InternalRow] =
     makeRDDForTable(
       hiveTable,
       Class.forName(
@@ -94,7 +94,7 @@ class HadoopTableReader(
   def makeRDDForTable(
       hiveTable: HiveTable,
       deserializerClass: Class[_ <: Deserializer],
-      filterOpt: Option[PathFilter]): RDD[Row] = {
+      filterOpt: Option[PathFilter]): RDD[InternalRow] = {
 
     assert(!hiveTable.isPartitioned, """makeRDDForTable() cannot be called on a partitioned table,
       since input formats may differ across partitions. Use makeRDDForTablePartitions() instead.""")
@@ -125,7 +125,7 @@ class HadoopTableReader(
     deserializedHadoopRDD
   }
 
-  override def makeRDDForPartitionedTable(partitions: Seq[HivePartition]): RDD[Row] = {
+  override def makeRDDForPartitionedTable(partitions: Seq[HivePartition]): RDD[InternalRow] = {
     val partitionToDeserializer = partitions.map(part =>
       (part, part.getDeserializer.getClass.asInstanceOf[Class[Deserializer]])).toMap
     makeRDDForPartitionedTable(partitionToDeserializer, filterOpt = None)
@@ -144,7 +144,7 @@ class HadoopTableReader(
   def makeRDDForPartitionedTable(
       partitionToDeserializer: Map[HivePartition,
       Class[_ <: Deserializer]],
-      filterOpt: Option[PathFilter]): RDD[Row] = {
+      filterOpt: Option[PathFilter]): RDD[InternalRow] = {
 
     // SPARK-5068:get FileStatus and do the filtering locally when the path is not exists
     def verifyPartitionPath(
@@ -243,7 +243,7 @@ class HadoopTableReader(
 
     // Even if we don't use any partitions, we still need an empty RDD
     if (hivePartitionRDDs.size == 0) {
-      new EmptyRDD[Row](sc.sparkContext)
+      new EmptyRDD[InternalRow](sc.sparkContext)
     } else {
       new UnionRDD(hivePartitionRDDs(0).context, hivePartitionRDDs)
     }
@@ -319,7 +319,7 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging {
       rawDeser: Deserializer,
       nonPartitionKeyAttrs: Seq[(Attribute, Int)],
       mutableRow: MutableRow,
-      tableDeser: Deserializer): Iterator[Row] = {
+      tableDeser: Deserializer): Iterator[InternalRow] = {
 
     val soi = if (rawDeser.getObjectInspector.equals(tableDeser.getObjectInspector)) {
       rawDeser.getObjectInspector.asInstanceOf[StructObjectInspector]
@@ -391,7 +391,7 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging {
         i += 1
       }
 
-      mutableRow: Row
+      mutableRow: InternalRow
     }
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala
index 87c36a8b618ce..0e4a2427a9c15 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.hive.execution
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.{AnalysisException, SQLContext}
-import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.catalyst.expressions.InternalRow
 import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.hive.client.{HiveTable, HiveColumn}
@@ -42,7 +42,7 @@ case class CreateTableAsSelect(
   def database: String = tableDesc.database
   def tableName: String = tableDesc.name
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val hiveContext = sqlContext.asInstanceOf[HiveContext]
     lazy val metastoreRelation: MetastoreRelation = {
       import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
@@ -89,7 +89,7 @@ case class CreateTableAsSelect(
       hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd
     }
 
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 
   override def argString: String = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala
index 6fce69b58b85e..a89381000ad5f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala
@@ -21,12 +21,10 @@ import scala.collection.JavaConversions._
 
 import org.apache.hadoop.hive.metastore.api.FieldSchema
 
-import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
-import org.apache.spark.sql.execution.{SparkPlan, RunnableCommand}
-import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation}
-import org.apache.spark.sql.hive.HiveShim
 import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.expressions.{Attribute, InternalRow}
+import org.apache.spark.sql.execution.RunnableCommand
+import org.apache.spark.sql.hive.MetastoreRelation
 
 /**
  * Implementation for "describe [extended] table".
@@ -37,7 +35,7 @@ case class DescribeHiveTableCommand(
     override val output: Seq[Attribute],
     isExtended: Boolean) extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     // Trying to mimic the format of Hive's output. But not exactly the same.
     var results: Seq[(String, String, String)] = Nil
 
@@ -59,7 +57,7 @@ case class DescribeHiveTableCommand(
     }
 
     results.map { case (name, dataType, comment) =>
-      Row(name, dataType, comment)
+      InternalRow(name, dataType, comment)
     }
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveNativeCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveNativeCommand.scala
index 60a9bb630d0d9..87f8e3f7fcfcc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveNativeCommand.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveNativeCommand.scala
@@ -1,34 +1,34 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive.execution
-
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Row}
-import org.apache.spark.sql.execution.RunnableCommand
-import org.apache.spark.sql.hive.HiveContext
-import org.apache.spark.sql.SQLContext
-import org.apache.spark.sql.types.StringType
-
-private[hive]
-case class HiveNativeCommand(sql: String) extends RunnableCommand {
-
-  override def output: Seq[AttributeReference] =
-    Seq(AttributeReference("result", StringType, nullable = false)())
-
-  override def run(sqlContext: SQLContext): Seq[Row] =
-    sqlContext.asInstanceOf[HiveContext].runSqlHive(sql).map(Row(_))
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution
+
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, InternalRow}
+import org.apache.spark.sql.execution.RunnableCommand
+import org.apache.spark.sql.hive.HiveContext
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.types.StringType
+
+private[hive]
+case class HiveNativeCommand(sql: String) extends RunnableCommand {
+
+  override def output: Seq[AttributeReference] =
+    Seq(AttributeReference("result", StringType, nullable = false)())
+
+  override def run(sqlContext: SQLContext): Seq[InternalRow] =
+    sqlContext.asInstanceOf[HiveContext].runSqlHive(sql).map(InternalRow(_))
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index 11ee5503146b9..1f5e4af2e4746 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -129,7 +129,7 @@ case class HiveTableScan(
     }
   }
 
-  protected override def doExecute(): RDD[Row] = if (!relation.hiveQlTable.isPartitioned) {
+  protected override def doExecute(): RDD[InternalRow] = if (!relation.hiveQlTable.isPartitioned) {
     hadoopReader.makeRDDForTable(relation.hiveQlTable)
   } else {
     hadoopReader.makeRDDForPartitionedTable(prunePartitions(relation.hiveQlPartitions))
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index eeb472602be3c..1d306c5d10af8 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -30,7 +30,8 @@ import org.apache.hadoop.hive.serde2.objectinspector._
 import org.apache.hadoop.mapred.{FileOutputFormat, JobConf}
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.expressions.{Attribute, InternalRow}
 import org.apache.spark.sql.execution.{UnaryNode, SparkPlan}
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive._
@@ -60,7 +61,7 @@ case class InsertIntoHiveTable(
   def output: Seq[Attribute] = child.output
 
   def saveAsHiveFile(
-      rdd: RDD[Row],
+      rdd: RDD[InternalRow],
       valueClass: Class[_],
       fileSinkConf: FileSinkDesc,
       conf: SerializableWritable[JobConf],
@@ -82,7 +83,7 @@ case class InsertIntoHiveTable(
     writerContainer.commitJob()
 
     // Note that this function is executed on executor side
-    def writeToFile(context: TaskContext, iterator: Iterator[Row]): Unit = {
+    def writeToFile(context: TaskContext, iterator: Iterator[InternalRow]): Unit = {
       val serializer = newSerializer(fileSinkConf.getTableInfo)
       val standardOI = ObjectInspectorUtils
         .getStandardObjectInspector(
@@ -119,7 +120,7 @@ case class InsertIntoHiveTable(
    *
    * Note: this is run once and then kept to avoid double insertions.
    */
-  protected[sql] lazy val sideEffectResult: Seq[Row] = {
+  protected[sql] lazy val sideEffectResult: Seq[InternalRow] = {
     // Have to pass the TableDesc object to RDD.mapPartitions and then instantiate new serializer
     // instances within the closure, since Serializer is not serializable while TableDesc is.
     val tableDesc = table.tableDesc
@@ -250,12 +251,13 @@ case class InsertIntoHiveTable(
     // however for now we return an empty list to simplify compatibility checks with hive, which
     // does not return anything for insert operations.
     // TODO: implement hive compatibility as rules.
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 
-  override def executeCollect(): Array[Row] = sideEffectResult.toArray
+  override def executeCollect(): Array[Row] =
+    sideEffectResult.toArray
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     sqlContext.sparkContext.parallelize(sideEffectResult, 1)
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index 28792db7686b5..9d8872aa47d1f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -55,7 +55,7 @@ case class ScriptTransformation(
 
   override def otherCopyArgs: Seq[HiveContext] = sc :: Nil
 
-  protected override def doExecute(): RDD[Row] = {
+  protected override def doExecute(): RDD[InternalRow] = {
     child.execute().mapPartitions { iter =>
       val cmd = List("/bin/bash", "-c", script)
       val builder = new ProcessBuilder(cmd)
@@ -72,8 +72,8 @@ case class ScriptTransformation(
 
       val (outputSerde, outputSoi) = ioschema.initOutputSerDe(output)
 
-      val iterator: Iterator[Row] = new Iterator[Row] with HiveInspectors {
-        var cacheRow: Row = null
+      val iterator: Iterator[InternalRow] = new Iterator[InternalRow] with HiveInspectors {
+        var cacheRow: InternalRow = null
         var curLine: String = null
         var eof: Boolean = false
 
@@ -90,7 +90,7 @@ case class ScriptTransformation(
           }
         }
 
-        def deserialize(): Row = {
+        def deserialize(): InternalRow = {
           if (cacheRow != null) return cacheRow
 
           val mutableRow = new SpecificMutableRow(output.map(_.dataType))
@@ -120,7 +120,7 @@ case class ScriptTransformation(
           }
         }
 
-        override def next(): Row = {
+        override def next(): InternalRow = {
           if (!hasNext) {
             throw new NoSuchElementException
           }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
index 0ba94d7b7c649..195e5752c3ec0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
@@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext}
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, InternalRow}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.hive.HiveContext
@@ -39,9 +39,9 @@ import org.apache.spark.util.Utils
 private[hive]
 case class AnalyzeTable(tableName: String) extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     sqlContext.asInstanceOf[HiveContext].analyze(tableName)
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 }
 
@@ -53,7 +53,7 @@ case class DropTable(
     tableName: String,
     ifExists: Boolean) extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val hiveContext = sqlContext.asInstanceOf[HiveContext]
     val ifExistsClause = if (ifExists) "IF EXISTS " else ""
     try {
@@ -70,7 +70,7 @@ case class DropTable(
     hiveContext.invalidateTable(tableName)
     hiveContext.runSqlHive(s"DROP TABLE $ifExistsClause$tableName")
     hiveContext.catalog.unregisterTable(Seq(tableName))
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 }
 
@@ -83,7 +83,7 @@ case class AddJar(path: String) extends RunnableCommand {
     schema.toAttributes
   }
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val hiveContext = sqlContext.asInstanceOf[HiveContext]
     val currentClassLoader = Utils.getContextOrSparkClassLoader
 
@@ -99,18 +99,18 @@ case class AddJar(path: String) extends RunnableCommand {
     // Add jar to executors
     hiveContext.sparkContext.addJar(path)
 
-    Seq(Row(0))
+    Seq(InternalRow(0))
   }
 }
 
 private[hive]
 case class AddFile(path: String) extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val hiveContext = sqlContext.asInstanceOf[HiveContext]
     hiveContext.runSqlHive(s"ADD FILE $path")
     hiveContext.sparkContext.addFile(path)
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 }
 
@@ -123,12 +123,12 @@ case class CreateMetastoreDataSource(
     allowExisting: Boolean,
     managedIfNoPath: Boolean) extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val hiveContext = sqlContext.asInstanceOf[HiveContext]
 
     if (hiveContext.catalog.tableExists(tableName :: Nil)) {
       if (allowExisting) {
-        return Seq.empty[Row]
+        return Seq.empty[InternalRow]
       } else {
         throw new AnalysisException(s"Table $tableName already exists.")
       }
@@ -151,7 +151,7 @@ case class CreateMetastoreDataSource(
       optionsWithPath,
       isExternal)
 
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 }
 
@@ -164,7 +164,7 @@ case class CreateMetastoreDataSourceAsSelect(
     options: Map[String, String],
     query: LogicalPlan) extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[Row] = {
+  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
     val hiveContext = sqlContext.asInstanceOf[HiveContext]
     var createMetastoreTable = false
     var isExternal = true
@@ -188,7 +188,7 @@ case class CreateMetastoreDataSourceAsSelect(
             s"Or, if you are using SQL CREATE TABLE, you need to drop $tableName first.")
         case SaveMode.Ignore =>
           // Since the table already exists and the save mode is Ignore, we will just return.
-          return Seq.empty[Row]
+          return Seq.empty[InternalRow]
         case SaveMode.Append =>
           // Check if the specified data source match the data source of the existing table.
           val resolved = ResolvedDataSource(
@@ -230,7 +230,7 @@ case class CreateMetastoreDataSourceAsSelect(
     val data = DataFrame(hiveContext, query)
     val df = existingSchema match {
       // If we are inserting into an existing table, just use the existing schema.
-      case Some(schema) => sqlContext.createDataFrame(data.queryExecution.toRdd, schema)
+      case Some(schema) => sqlContext.internalCreateDataFrame(data.queryExecution.toRdd, schema)
       case None => data
     }
 
@@ -253,6 +253,6 @@ case class CreateMetastoreDataSourceAsSelect(
 
     // Refresh the cache of the table in the catalog.
     hiveContext.refreshTable(tableName)
-    Seq.empty[Row]
+    Seq.empty[InternalRow]
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index a46ee9da9039c..c40dd4e4b94f8 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -121,7 +121,7 @@ private[hive] case class HiveSimpleUdf(funcWrapper: HiveFunctionWrapper, childre
   protected lazy val cached: Array[AnyRef] = new Array[AnyRef](children.length)
 
   // TODO: Finish input output types.
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     unwrap(
       FunctionRegistry.invoke(method, function, conversionHelper
         .convertIfNecessary(wrap(children.map(c => c.eval(input)), arguments, cached): _*): _*),
@@ -178,7 +178,7 @@ private[hive] case class HiveGenericUdf(funcWrapper: HiveFunctionWrapper, childr
 
   lazy val dataType: DataType = inspectorToDataType(returnInspector)
 
-  override def eval(input: Row): Any = {
+  override def eval(input: InternalRow): Any = {
     returnInspector // Make sure initialized.
 
     var i = 0
@@ -345,7 +345,7 @@ private[hive] case class HiveWindowFunction(
 
   def nullable: Boolean = true
 
-  override def eval(input: Row): Any =
+  override def eval(input: InternalRow): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   @transient
@@ -369,7 +369,7 @@ private[hive] case class HiveWindowFunction(
     evaluator.reset(hiveEvaluatorBuffer)
   }
 
-  override def prepareInputParameters(input: Row): AnyRef = {
+  override def prepareInputParameters(input: InternalRow): AnyRef = {
     wrap(inputProjection(input), inputInspectors, new Array[AnyRef](children.length))
   }
   // Add input parameters for a single row.
@@ -512,7 +512,7 @@ private[hive] case class HiveGenericUdtf(
     field => (inspectorToDataType(field.getFieldObjectInspector), true)
   }
 
-  override def eval(input: Row): TraversableOnce[Row] = {
+  override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
     outputInspector // Make sure initialized.
 
     val inputProjection = new InterpretedProjection(children)
@@ -522,23 +522,23 @@ private[hive] case class HiveGenericUdtf(
   }
 
   protected class UDTFCollector extends Collector {
-    var collected = new ArrayBuffer[Row]
+    var collected = new ArrayBuffer[InternalRow]
 
     override def collect(input: java.lang.Object) {
       // We need to clone the input here because implementations of
       // GenericUDTF reuse the same object. Luckily they are always an array, so
       // it is easy to clone.
-      collected += unwrap(input, outputInspector).asInstanceOf[Row]
+      collected += unwrap(input, outputInspector).asInstanceOf[InternalRow]
     }
 
-    def collectRows(): Seq[Row] = {
+    def collectRows(): Seq[InternalRow] = {
       val toCollect = collected
-      collected = new ArrayBuffer[Row]
+      collected = new ArrayBuffer[InternalRow]
       toCollect
     }
   }
 
-  override def terminate(): TraversableOnce[Row] = {
+  override def terminate(): TraversableOnce[InternalRow] = {
     outputInspector // Make sure initialized.
     function.close()
     collector.collectRows()
@@ -578,7 +578,7 @@ private[hive] case class HiveUdafFunction(
   private val buffer =
     function.getNewAggregationBuffer
 
-  override def eval(input: Row): Any = unwrap(function.evaluate(buffer), returnInspector)
+  override def eval(input: InternalRow): Any = unwrap(function.evaluate(buffer), returnInspector)
 
   @transient
   val inputProjection = new InterpretedProjection(exprs)
@@ -586,7 +586,7 @@ private[hive] case class HiveUdafFunction(
   @transient
   protected lazy val cached = new Array[AnyRef](exprs.length)
 
-  def update(input: Row): Unit = {
+  def update(input: InternalRow): Unit = {
     val inputs = inputProjection(input)
     function.iterate(buffer, wrap(inputs, inspectors, cached))
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
index df137e7b2b333..aff0456b37ed5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
@@ -28,8 +28,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
 import org.apache.hadoop.io.LongWritable
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions.{Literal, Row}
+import org.apache.spark.sql.catalyst.expressions.{Literal, InternalRow}
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.Row
 
 class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
   test("Test wrap SettableStructObjectInspector") {
@@ -45,7 +46,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
       classOf[UDAFPercentile.State],
       ObjectInspectorOptions.JAVA).asInstanceOf[StructObjectInspector]
 
-    val a = unwrap(state, soi).asInstanceOf[Row]
+    val a = unwrap(state, soi).asInstanceOf[InternalRow]
     val b = wrap(a, soi).asInstanceOf[UDAFPercentile.State]
 
     val sfCounts = soi.getStructFieldRef("counts")
@@ -127,7 +128,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
     }
   }
 
-  def checkValues(row1: Seq[Any], row2: Row): Unit = {
+  def checkValues(row1: Seq[Any], row2: InternalRow): Unit = {
     row1.zip(row2.toSeq).foreach { case (r1, r2) =>
       checkValue(r1, r2)
     }
@@ -203,7 +204,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
     })
 
     checkValues(row,
-      unwrap(wrap(Row.fromSeq(row), toInspector(dt)), toInspector(dt)).asInstanceOf[Row])
+      unwrap(wrap(Row.fromSeq(row), toInspector(dt)), toInspector(dt)).asInstanceOf[InternalRow])
     checkValue(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt)))
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
index 5a5ea10e3c82e..a0d80dc39c108 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.sql.hive
 
-import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.parquet.ParquetTest
-import org.apache.spark.sql.{QueryTest, SQLConf}
+import org.apache.spark.sql.{QueryTest, Row, SQLConf}
 
 case class Cases(lower: String, UPPER: String)
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
index 0e63d84e9824a..8707f9f936be6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
@@ -21,7 +21,7 @@ import java.io.File
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.catalyst.expressions.InternalRow
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index b384fb39f3d66..267d22c6b5f1e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -25,7 +25,7 @@ import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.catalyst.expressions.InternalRow
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index e62ac909cbd0c..3864349cdbd89 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -21,8 +21,6 @@ import java.io.File
 
 import org.scalatest.BeforeAndAfterAll
 
-import org.apache.spark.sql.catalyst.expressions.Row
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
 import org.apache.spark.sql.hive.execution.HiveTableScan
 import org.apache.spark.sql.hive.test.TestHive._
@@ -30,7 +28,7 @@ import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.parquet.{ParquetRelation2, ParquetTableScan}
 import org.apache.spark.sql.sources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf, SaveMode}
+import org.apache.spark.sql.{DataFrame, QueryTest, Row, SQLConf, SaveMode}
 import org.apache.spark.util.Utils
 
 // The data where the partitioning key exists only in the directory structure.

From 4aed66f299a67f5a594da9316b6bf4c345838216 Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Fri, 12 Jun 2015 23:11:16 -0700
Subject: [PATCH 472/525] [SPARK-8329][SQL] Allow _ in DataSource options

Author: Michael Armbrust <michael@databricks.com>

Closes #6786 from marmbrus/optionsParser and squashes the following commits:

e7d18ef [Michael Armbrust] add dots
99a3452 [Michael Armbrust] [SPARK-8329][SQL] Allow _ in DataSource options
---
 .../scala/org/apache/spark/sql/sources/ddl.scala |  6 +++++-
 .../spark/sql/sources/TableScanSuite.scala       | 16 +++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index 01c67db232569..b7095c8ead797 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -166,10 +166,14 @@ private[sql] class DDLParser(
     }
   )
 
-  protected lazy val optionName: Parser[String] = "[_a-zA-Z][a-zA-Z0-9]*".r ^^ {
+  protected lazy val optionPart: Parser[String] = "[_a-zA-Z][_a-zA-Z0-9]*".r ^^ {
     case name => name
   }
 
+  protected lazy val optionName: Parser[String] = repsep(optionPart, ".") ^^ {
+    case parts => parts.mkString(".")
+  }
+
   protected lazy val pair: Parser[(String, String)] =
     optionName ~ stringLit ^^ { case k ~ v => (k, v) }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
index 4a4d8d2548514..48875773224c7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
@@ -51,6 +51,10 @@ class AllDataTypesScanSource extends SchemaRelationProvider {
       sqlContext: SQLContext,
       parameters: Map[String, String],
       schema: StructType): BaseRelation = {
+    // Check that weird parameters are passed correctly.
+    parameters("option_with_underscores")
+    parameters("option.with.dots")
+
     AllDataTypesScan(parameters("from").toInt, parameters("TO").toInt, schema)(sqlContext)
   }
 }
@@ -128,7 +132,9 @@ class TableScanSuite extends DataSourceTest {
         |USING org.apache.spark.sql.sources.SimpleScanSource
         |OPTIONS (
         |  From '1',
-        |  To '10'
+        |  To '10',
+        |  option_with_underscores 'someval',
+        |  option.with.dots 'someval'
         |)
       """.stripMargin)
 
@@ -159,7 +165,9 @@ class TableScanSuite extends DataSourceTest {
         |USING org.apache.spark.sql.sources.AllDataTypesScanSource
         |OPTIONS (
         |  From '1',
-        |  To '10'
+        |  To '10',
+        |  option_with_underscores 'someval',
+        |  option.with.dots 'someval'
         |)
       """.stripMargin)
   }
@@ -361,7 +369,9 @@ class TableScanSuite extends DataSourceTest {
        |USING org.apache.spark.sql.sources.AllDataTypesScanSource
        |OPTIONS (
        |  from '1',
-       |  to '10'
+       |  to '10',
+       |  option_with_underscores 'someval',
+       |  option.with.dots 'someval'
        |)
        """.stripMargin)
 

From d986fb9a378416248768828e6e6c7405697f9a5a Mon Sep 17 00:00:00 2001
From: Rene Treffer <treffer@measite.de>
Date: Sat, 13 Jun 2015 11:58:22 -0700
Subject: [PATCH 473/525] [SPARK-7897] Improbe type for jdbc/"unsigned bigint"

The original fix uses DecimalType.Unlimited, which is harder to
handle afterwards. There is no scale and most data should fit into
a long, thus DecimalType(20,0) should be better.

Author: Rene Treffer <treffer@measite.de>

Closes #6789 from rtreffer/spark-7897-unsigned-bigint-as-decimal and squashes the following commits:

2006613 [Rene Treffer] Fix type for "unsigned bigint" jdbc loading.
---
 sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 667fc70cff956..226b143923df6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -55,7 +55,7 @@ private[sql] object JDBCRDD extends Logging {
     val answer = sqlType match {
       // scalastyle:off
       case java.sql.Types.ARRAY         => null
-      case java.sql.Types.BIGINT        => if (signed) { LongType } else { DecimalType.Unlimited }
+      case java.sql.Types.BIGINT        => if (signed) { LongType } else { DecimalType(20,0) }
       case java.sql.Types.BINARY        => BinaryType
       case java.sql.Types.BIT           => BooleanType // @see JdbcDialect for quirks
       case java.sql.Types.BLOB          => BinaryType

From ce1041c38f92449ca14894551c358c875672afe6 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Sat, 13 Jun 2015 16:13:26 -0700
Subject: [PATCH 474/525] [SPARK-8346] [SQL] Use InternalRow instread of
 catalyst.InternalRow

cc rxin marmbrus

Author: Davies Liu <davies@databricks.com>

Closes #6802 from davies/cleanup_internalrow and squashes the following commits:

769d2aa [Davies Liu] remove not needed cast
4acbbe4 [Davies Liu] catalyst.Internal -> InternalRow
---
 .../sql/catalyst/analysis/unresolved.scala    |  10 +-
 .../spark/sql/catalyst/expressions/Cast.scala |   4 +-
 .../catalyst/expressions/ExtractValue.scala   |  10 +-
 .../sql/catalyst/expressions/Projection.scala | 105 +++++++++---------
 .../sql/catalyst/expressions/ScalaUdf.scala   |  48 ++++----
 .../sql/catalyst/expressions/aggregates.scala |  68 ++++++------
 .../sql/catalyst/expressions/arithmetic.scala |  12 +-
 .../expressions/codegen/CodeGenerator.scala   |   6 +-
 .../codegen/GenerateMutableProjection.scala   |   3 +-
 .../codegen/GenerateOrdering.scala            |  10 +-
 .../codegen/GeneratePredicate.scala           |   8 +-
 .../catalyst/expressions/complexTypes.scala   |   4 +-
 .../catalyst/expressions/conditionals.scala   |   6 +-
 .../expressions/decimalFunctions.scala        |   4 +-
 .../sql/catalyst/expressions/generators.scala |  14 +--
 .../sql/catalyst/expressions/literals.scala   |   6 +-
 .../spark/sql/catalyst/expressions/math.scala |   8 +-
 .../expressions/namedExpressions.scala        |   6 +-
 .../catalyst/expressions/nullFunctions.scala  |  13 +--
 .../sql/catalyst/expressions/package.scala    |   6 +-
 .../org/apache/spark/sql/DataFrame.scala      |   6 +-
 .../sql/execution/joins/HashOuterJoin.scala   |   2 +-
 22 files changed, 176 insertions(+), 183 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index 5de188d418924..c9d91425788a8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -68,7 +68,7 @@ case class UnresolvedAttribute(nameParts: Seq[String])
   override def withName(newName: String): UnresolvedAttribute = UnresolvedAttribute.quoted(newName)
 
   // Unresolved attributes are transient at compile time and don't get evaluated during execution.
-  override def eval(input: catalyst.InternalRow = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"'$name"
@@ -86,7 +86,7 @@ case class UnresolvedFunction(name: String, children: Seq[Expression]) extends E
   override lazy val resolved = false
 
   // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: catalyst.InternalRow = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"'$name(${children.mkString(",")})"
@@ -108,7 +108,7 @@ trait Star extends NamedExpression with trees.LeafNode[Expression] {
   override lazy val resolved = false
 
   // Star gets expanded at runtime so we never evaluate a Star.
-  override def eval(input: catalyst.InternalRow = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression]
@@ -167,7 +167,7 @@ case class MultiAlias(child: Expression, names: Seq[String])
 
   override lazy val resolved = false
 
-  override def eval(input: catalyst.InternalRow = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$child AS $names"
@@ -201,7 +201,7 @@ case class UnresolvedExtractValue(child: Expression, extraction: Expression)
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
 
-  override def eval(input: catalyst.InternalRow = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$child[$extraction]"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index afbf30af332d8..05a04bdff9b3e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -394,7 +394,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     }
     // TODO: Could be faster?
     val newRow = new GenericMutableRow(from.fields.size)
-    buildCast[catalyst.InternalRow](_, row => {
+    buildCast[InternalRow](_, row => {
       var i = 0
       while (i < row.length) {
         val v = row(i)
@@ -426,7 +426,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
   private[this] lazy val cast: Any => Any = cast(child.dataType, dataType)
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evaluated = child.eval(input)
     if (evaluated == null) null else cast(evaluated)
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
index 16f3ccc3d6b88..4aaabff15b6ee 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
@@ -105,8 +105,8 @@ case class GetStructField(child: Expression, field: StructField, ordinal: Int)
   override def foldable: Boolean = child.foldable
   override def toString: String = s"$child.${field.name}"
 
-  override def eval(input: catalyst.InternalRow): Any = {
-    val baseValue = child.eval(input).asInstanceOf[catalyst.InternalRow]
+  override def eval(input: InternalRow): Any = {
+    val baseValue = child.eval(input).asInstanceOf[InternalRow]
     if (baseValue == null) null else baseValue(ordinal)
   }
 }
@@ -125,8 +125,8 @@ case class GetArrayStructFields(
   override def foldable: Boolean = child.foldable
   override def toString: String = s"$child.${field.name}"
 
-  override def eval(input: catalyst.InternalRow): Any = {
-    val baseValue = child.eval(input).asInstanceOf[Seq[catalyst.InternalRow]]
+  override def eval(input: InternalRow): Any = {
+    val baseValue = child.eval(input).asInstanceOf[Seq[InternalRow]]
     if (baseValue == null) null else {
       baseValue.map { row =>
         if (row == null) null else row(ordinal)
@@ -146,7 +146,7 @@ abstract class ExtractValueWithOrdinal extends ExtractValue {
   override def toString: String = s"$child[$ordinal]"
   override def children: Seq[Expression] = child :: ordinal :: Nil
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val value = child.eval(input)
     if (value == null) {
       null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index d6806f78ab3fd..d5967438ccb5a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -17,9 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst
-
-
 /**
  * A [[Projection]] that is calculated by calling the `eval` of each of the specified expressions.
  * @param expressions a sequence of expressions that determine the value of each column of the
@@ -32,7 +29,7 @@ class InterpretedProjection(expressions: Seq[Expression]) extends Projection {
   // null check is required for when Kryo invokes the no-arg constructor.
   protected val exprArray = if (expressions != null) expressions.toArray else null
 
-  def apply(input: catalyst.InternalRow): catalyst.InternalRow = {
+  def apply(input: InternalRow): InternalRow = {
     val outputArray = new Array[Any](exprArray.length)
     var i = 0
     while (i < exprArray.length) {
@@ -57,14 +54,14 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu
 
   private[this] val exprArray = expressions.toArray
   private[this] var mutableRow: MutableRow = new GenericMutableRow(exprArray.size)
-  def currentValue: catalyst.InternalRow = mutableRow
+  def currentValue: InternalRow = mutableRow
 
   override def target(row: MutableRow): MutableProjection = {
     mutableRow = row
     this
   }
 
-  override def apply(input: catalyst.InternalRow): catalyst.InternalRow = {
+  override def apply(input: InternalRow): InternalRow = {
     var i = 0
     while (i < exprArray.length) {
       mutableRow(i) = exprArray(i).eval(input)
@@ -78,31 +75,31 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu
  * A mutable wrapper that makes two rows appear as a single concatenated row.  Designed to
  * be instantiated once per thread and reused.
  */
-class JoinedRow extends catalyst.InternalRow {
-  private[this] var row1: catalyst.InternalRow = _
-  private[this] var row2: catalyst.InternalRow = _
+class JoinedRow extends InternalRow {
+  private[this] var row1: InternalRow = _
+  private[this] var row2: InternalRow = _
 
-  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
+  def this(left: InternalRow, right: InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
+  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
+  def withLeft(newLeft: InternalRow): InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
+  def withRight(newRight: InternalRow): InternalRow = {
     row2 = newRight
     this
   }
@@ -144,7 +141,7 @@ class JoinedRow extends catalyst.InternalRow {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): catalyst.InternalRow = {
+  override def copy(): InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -178,31 +175,31 @@ class JoinedRow extends catalyst.InternalRow {
  * Row will be referenced, increasing the opportunity for the JIT to play tricks.  This sounds
  * crazy but in benchmarks it had noticeable effects.
  */
-class JoinedRow2 extends catalyst.InternalRow {
-  private[this] var row1: catalyst.InternalRow = _
-  private[this] var row2: catalyst.InternalRow = _
+class JoinedRow2 extends InternalRow {
+  private[this] var row1: InternalRow = _
+  private[this] var row2: InternalRow = _
 
-  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
+  def this(left: InternalRow, right: InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
+  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
+  def withLeft(newLeft: InternalRow): InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
+  def withRight(newRight: InternalRow): InternalRow = {
     row2 = newRight
     this
   }
@@ -244,7 +241,7 @@ class JoinedRow2 extends catalyst.InternalRow {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): catalyst.InternalRow = {
+  override def copy(): InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -272,31 +269,31 @@ class JoinedRow2 extends catalyst.InternalRow {
 /**
  * JIT HACK: Replace with macros
  */
-class JoinedRow3 extends catalyst.InternalRow {
-  private[this] var row1: catalyst.InternalRow = _
-  private[this] var row2: catalyst.InternalRow = _
+class JoinedRow3 extends InternalRow {
+  private[this] var row1: InternalRow = _
+  private[this] var row2: InternalRow = _
 
-  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
+  def this(left: InternalRow, right: InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
+  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
+  def withLeft(newLeft: InternalRow): InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
+  def withRight(newRight: InternalRow): InternalRow = {
     row2 = newRight
     this
   }
@@ -338,7 +335,7 @@ class JoinedRow3 extends catalyst.InternalRow {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): catalyst.InternalRow = {
+  override def copy(): InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -366,31 +363,31 @@ class JoinedRow3 extends catalyst.InternalRow {
 /**
  * JIT HACK: Replace with macros
  */
-class JoinedRow4 extends catalyst.InternalRow {
-  private[this] var row1: catalyst.InternalRow = _
-  private[this] var row2: catalyst.InternalRow = _
+class JoinedRow4 extends InternalRow {
+  private[this] var row1: InternalRow = _
+  private[this] var row2: InternalRow = _
 
-  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
+  def this(left: InternalRow, right: InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
+  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
+  def withLeft(newLeft: InternalRow): InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
+  def withRight(newRight: InternalRow): InternalRow = {
     row2 = newRight
     this
   }
@@ -432,7 +429,7 @@ class JoinedRow4 extends catalyst.InternalRow {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): catalyst.InternalRow = {
+  override def copy(): InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -460,31 +457,31 @@ class JoinedRow4 extends catalyst.InternalRow {
 /**
  * JIT HACK: Replace with macros
  */
-class JoinedRow5 extends catalyst.InternalRow {
-  private[this] var row1: catalyst.InternalRow = _
-  private[this] var row2: catalyst.InternalRow = _
+class JoinedRow5 extends InternalRow {
+  private[this] var row1: InternalRow = _
+  private[this] var row2: InternalRow = _
 
-  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
+  def this(left: InternalRow, right: InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
+  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
+  def withLeft(newLeft: InternalRow): InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
+  def withRight(newRight: InternalRow): InternalRow = {
     row2 = newRight
     this
   }
@@ -526,7 +523,7 @@ class JoinedRow5 extends catalyst.InternalRow {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): catalyst.InternalRow = {
+  override def copy(): InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
@@ -554,31 +551,31 @@ class JoinedRow5 extends catalyst.InternalRow {
 /**
  * JIT HACK: Replace with macros
  */
-class JoinedRow6 extends catalyst.InternalRow {
-  private[this] var row1: catalyst.InternalRow = _
-  private[this] var row2: catalyst.InternalRow = _
+class JoinedRow6 extends InternalRow {
+  private[this] var row1: InternalRow = _
+  private[this] var row2: InternalRow = _
 
-  def this(left: catalyst.InternalRow, right: catalyst.InternalRow) = {
+  def this(left: InternalRow, right: InternalRow) = {
     this()
     row1 = left
     row2 = right
   }
 
   /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: catalyst.InternalRow, r2: catalyst.InternalRow): catalyst.InternalRow = {
+  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
     row1 = r1
     row2 = r2
     this
   }
 
   /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: catalyst.InternalRow): catalyst.InternalRow = {
+  def withLeft(newLeft: InternalRow): InternalRow = {
     row1 = newLeft
     this
   }
 
   /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: catalyst.InternalRow): catalyst.InternalRow = {
+  def withRight(newRight: InternalRow): InternalRow = {
     row2 = newRight
     this
   }
@@ -620,7 +617,7 @@ class JoinedRow6 extends catalyst.InternalRow {
   override def getAs[T](i: Int): T =
     if (i < row1.length) row1.getAs[T](i) else row2.getAs[T](i - row1.length)
 
-  override def copy(): catalyst.InternalRow = {
+  override def copy(): InternalRow = {
     val totalSize = row1.length + row2.length
     val copiedValues = new Array[Any](totalSize)
     var i = 0
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
index 40f235fc19536..b3ce698c5552d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
@@ -58,7 +58,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
   private[this] val f = children.size match {
     case 0 =>
       val func = function.asInstanceOf[() => Any]
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func()
       }
 
@@ -66,7 +66,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       val func = function.asInstanceOf[(Any) => Any]
       val child0 = children(0)
       lazy val converter0 = CatalystTypeConverters.createToScalaConverter(child0.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)))
       }
@@ -77,7 +77,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       val child1 = children(1)
       lazy val converter0 = CatalystTypeConverters.createToScalaConverter(child0.dataType)
       lazy val converter1 = CatalystTypeConverters.createToScalaConverter(child1.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)))
@@ -91,7 +91,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter0 = CatalystTypeConverters.createToScalaConverter(child0.dataType)
       lazy val converter1 = CatalystTypeConverters.createToScalaConverter(child1.dataType)
       lazy val converter2 = CatalystTypeConverters.createToScalaConverter(child2.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -108,7 +108,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter1 = CatalystTypeConverters.createToScalaConverter(child1.dataType)
       lazy val converter2 = CatalystTypeConverters.createToScalaConverter(child2.dataType)
       lazy val converter3 = CatalystTypeConverters.createToScalaConverter(child3.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -128,7 +128,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter2 = CatalystTypeConverters.createToScalaConverter(child2.dataType)
       lazy val converter3 = CatalystTypeConverters.createToScalaConverter(child3.dataType)
       lazy val converter4 = CatalystTypeConverters.createToScalaConverter(child4.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -151,7 +151,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter3 = CatalystTypeConverters.createToScalaConverter(child3.dataType)
       lazy val converter4 = CatalystTypeConverters.createToScalaConverter(child4.dataType)
       lazy val converter5 = CatalystTypeConverters.createToScalaConverter(child5.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -177,7 +177,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter4 = CatalystTypeConverters.createToScalaConverter(child4.dataType)
       lazy val converter5 = CatalystTypeConverters.createToScalaConverter(child5.dataType)
       lazy val converter6 = CatalystTypeConverters.createToScalaConverter(child6.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -206,7 +206,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter5 = CatalystTypeConverters.createToScalaConverter(child5.dataType)
       lazy val converter6 = CatalystTypeConverters.createToScalaConverter(child6.dataType)
       lazy val converter7 = CatalystTypeConverters.createToScalaConverter(child7.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -238,7 +238,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter6 = CatalystTypeConverters.createToScalaConverter(child6.dataType)
       lazy val converter7 = CatalystTypeConverters.createToScalaConverter(child7.dataType)
       lazy val converter8 = CatalystTypeConverters.createToScalaConverter(child8.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -273,7 +273,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter7 = CatalystTypeConverters.createToScalaConverter(child7.dataType)
       lazy val converter8 = CatalystTypeConverters.createToScalaConverter(child8.dataType)
       lazy val converter9 = CatalystTypeConverters.createToScalaConverter(child9.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -311,7 +311,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter8 = CatalystTypeConverters.createToScalaConverter(child8.dataType)
       lazy val converter9 = CatalystTypeConverters.createToScalaConverter(child9.dataType)
       lazy val converter10 = CatalystTypeConverters.createToScalaConverter(child10.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -352,7 +352,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter9 = CatalystTypeConverters.createToScalaConverter(child9.dataType)
       lazy val converter10 = CatalystTypeConverters.createToScalaConverter(child10.dataType)
       lazy val converter11 = CatalystTypeConverters.createToScalaConverter(child11.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -396,7 +396,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter10 = CatalystTypeConverters.createToScalaConverter(child10.dataType)
       lazy val converter11 = CatalystTypeConverters.createToScalaConverter(child11.dataType)
       lazy val converter12 = CatalystTypeConverters.createToScalaConverter(child12.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -443,7 +443,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter11 = CatalystTypeConverters.createToScalaConverter(child11.dataType)
       lazy val converter12 = CatalystTypeConverters.createToScalaConverter(child12.dataType)
       lazy val converter13 = CatalystTypeConverters.createToScalaConverter(child13.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -493,7 +493,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter12 = CatalystTypeConverters.createToScalaConverter(child12.dataType)
       lazy val converter13 = CatalystTypeConverters.createToScalaConverter(child13.dataType)
       lazy val converter14 = CatalystTypeConverters.createToScalaConverter(child14.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -546,7 +546,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter13 = CatalystTypeConverters.createToScalaConverter(child13.dataType)
       lazy val converter14 = CatalystTypeConverters.createToScalaConverter(child14.dataType)
       lazy val converter15 = CatalystTypeConverters.createToScalaConverter(child15.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -602,7 +602,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter14 = CatalystTypeConverters.createToScalaConverter(child14.dataType)
       lazy val converter15 = CatalystTypeConverters.createToScalaConverter(child15.dataType)
       lazy val converter16 = CatalystTypeConverters.createToScalaConverter(child16.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -661,7 +661,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter15 = CatalystTypeConverters.createToScalaConverter(child15.dataType)
       lazy val converter16 = CatalystTypeConverters.createToScalaConverter(child16.dataType)
       lazy val converter17 = CatalystTypeConverters.createToScalaConverter(child17.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -723,7 +723,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter16 = CatalystTypeConverters.createToScalaConverter(child16.dataType)
       lazy val converter17 = CatalystTypeConverters.createToScalaConverter(child17.dataType)
       lazy val converter18 = CatalystTypeConverters.createToScalaConverter(child18.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -788,7 +788,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter17 = CatalystTypeConverters.createToScalaConverter(child17.dataType)
       lazy val converter18 = CatalystTypeConverters.createToScalaConverter(child18.dataType)
       lazy val converter19 = CatalystTypeConverters.createToScalaConverter(child19.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -856,7 +856,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter18 = CatalystTypeConverters.createToScalaConverter(child18.dataType)
       lazy val converter19 = CatalystTypeConverters.createToScalaConverter(child19.dataType)
       lazy val converter20 = CatalystTypeConverters.createToScalaConverter(child20.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -927,7 +927,7 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
       lazy val converter19 = CatalystTypeConverters.createToScalaConverter(child19.dataType)
       lazy val converter20 = CatalystTypeConverters.createToScalaConverter(child20.dataType)
       lazy val converter21 = CatalystTypeConverters.createToScalaConverter(child21.dataType)
-      (input: catalyst.InternalRow) => {
+      (input: InternalRow) => {
         func(
           converter0(child0.eval(input)),
           converter1(child1.eval(input)),
@@ -956,6 +956,6 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
 
   // scalastyle:on
   private[this] val converter = CatalystTypeConverters.createToCatalystConverter(dataType)
-  override def eval(input: catalyst.InternalRow): Any = converter(f(input))
+  override def eval(input: InternalRow): Any = converter(f(input))
 
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index f9e8150a689c1..00d2e499c5890 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -38,7 +38,7 @@ abstract class AggregateExpression extends Expression {
    * [[AggregateExpression.eval]] should never be invoked because [[AggregateExpression]]'s are
    * replaced with a physical aggregate operator at runtime.
    */
-  override def eval(input: catalyst.InternalRow = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
@@ -81,7 +81,7 @@ abstract class AggregateFunction
   override def nullable: Boolean = base.nullable
   override def dataType: DataType = base.dataType
 
-  def update(input: catalyst.InternalRow): Unit
+  def update(input: InternalRow): Unit
 
   // Do we really need this?
   override def newInstance(): AggregateFunction = {
@@ -109,7 +109,7 @@ case class MinFunction(expr: Expression, base: AggregateExpression) extends Aggr
   val currentMin: MutableLiteral = MutableLiteral(null, expr.dataType)
   val cmp = GreaterThan(currentMin, expr)
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     if (currentMin.value == null) {
       currentMin.value = expr.eval(input)
     } else if (cmp.eval(input) == true) {
@@ -117,7 +117,7 @@ case class MinFunction(expr: Expression, base: AggregateExpression) extends Aggr
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = currentMin.value
+  override def eval(input: InternalRow): Any = currentMin.value
 }
 
 case class Max(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
@@ -140,7 +140,7 @@ case class MaxFunction(expr: Expression, base: AggregateExpression) extends Aggr
   val currentMax: MutableLiteral = MutableLiteral(null, expr.dataType)
   val cmp = LessThan(currentMax, expr)
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     if (currentMax.value == null) {
       currentMax.value = expr.eval(input)
     } else if (cmp.eval(input) == true) {
@@ -148,7 +148,7 @@ case class MaxFunction(expr: Expression, base: AggregateExpression) extends Aggr
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = currentMax.value
+  override def eval(input: InternalRow): Any = currentMax.value
 }
 
 case class Count(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
@@ -206,14 +206,14 @@ case class CollectHashSetFunction(
   @transient
   val distinctValue = new InterpretedProjection(expr)
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val evaluatedExpr = distinctValue(input)
     if (!evaluatedExpr.anyNull) {
       seen.add(evaluatedExpr)
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     seen
   }
 }
@@ -239,7 +239,7 @@ case class CombineSetsAndCountFunction(
 
   val seen = new OpenHashSet[Any]()
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val inputSetEval = inputSet.eval(input).asInstanceOf[OpenHashSet[Any]]
     val inputIterator = inputSetEval.iterator
     while (inputIterator.hasNext) {
@@ -247,7 +247,7 @@ case class CombineSetsAndCountFunction(
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = seen.size.toLong
+  override def eval(input: InternalRow): Any = seen.size.toLong
 }
 
 /** The data type of ApproxCountDistinctPartition since its output is a HyperLogLog object. */
@@ -454,7 +454,7 @@ case class CombineSetsAndSumFunction(
 
   val seen = new OpenHashSet[Any]()
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val inputSetEval = inputSet.eval(input).asInstanceOf[OpenHashSet[Any]]
     val inputIterator = inputSetEval.iterator
     while (inputIterator.hasNext) {
@@ -462,8 +462,8 @@ case class CombineSetsAndSumFunction(
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
-    val casted = seen.asInstanceOf[OpenHashSet[catalyst.InternalRow]]
+  override def eval(input: InternalRow): Any = {
+    val casted = seen.asInstanceOf[OpenHashSet[InternalRow]]
     if (casted.size == 0) {
       null
     } else {
@@ -525,7 +525,7 @@ case class AverageFunction(expr: Expression, base: AggregateExpression)
   private def addFunction(value: Any) = Add(sum,
     Cast(Literal.create(value, expr.dataType), calcType))
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     if (count == 0L) {
       null
     } else {
@@ -542,7 +542,7 @@ case class AverageFunction(expr: Expression, base: AggregateExpression)
     }
   }
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     if (evaluatedExpr != null) {
       count += 1
@@ -556,14 +556,14 @@ case class CountFunction(expr: Expression, base: AggregateExpression) extends Ag
 
   var count: Long = _
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     if (evaluatedExpr != null) {
       count += 1L
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = count
+  override def eval(input: InternalRow): Any = count
 }
 
 case class ApproxCountDistinctPartitionFunction(
@@ -575,14 +575,14 @@ case class ApproxCountDistinctPartitionFunction(
 
   private val hyperLogLog = new HyperLogLog(relativeSD)
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     if (evaluatedExpr != null) {
       hyperLogLog.offer(evaluatedExpr)
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = hyperLogLog
+  override def eval(input: InternalRow): Any = hyperLogLog
 }
 
 case class ApproxCountDistinctMergeFunction(
@@ -594,12 +594,12 @@ case class ApproxCountDistinctMergeFunction(
 
   private val hyperLogLog = new HyperLogLog(relativeSD)
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     hyperLogLog.addAll(evaluatedExpr.asInstanceOf[HyperLogLog])
   }
 
-  override def eval(input: catalyst.InternalRow): Any = hyperLogLog.cardinality()
+  override def eval(input: InternalRow): Any = hyperLogLog.cardinality()
 }
 
 case class SumFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
@@ -620,11 +620,11 @@ case class SumFunction(expr: Expression, base: AggregateExpression) extends Aggr
   private val addFunction =
     Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum, zero))
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     sum.update(addFunction, input)
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     expr.dataType match {
       case DecimalType.Fixed(_, _) =>
         Cast(sum, dataType).eval(null)
@@ -653,7 +653,7 @@ case class CombineSumFunction(expr: Expression, base: AggregateExpression)
   private val addFunction =
     Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum, zero))
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val result = expr.eval(input)
     // partial sum result can be null only when no input rows present
     if(result != null) {
@@ -661,7 +661,7 @@ case class CombineSumFunction(expr: Expression, base: AggregateExpression)
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     expr.dataType match {
       case DecimalType.Fixed(_, _) =>
         Cast(sum, dataType).eval(null)
@@ -677,14 +677,14 @@ case class SumDistinctFunction(expr: Expression, base: AggregateExpression)
 
   private val seen = new scala.collection.mutable.HashSet[Any]()
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val evaluatedExpr = expr.eval(input)
     if (evaluatedExpr != null) {
       seen += evaluatedExpr
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     if (seen.size == 0) {
       null
     } else {
@@ -708,14 +708,14 @@ case class CountDistinctFunction(
   @transient
   val distinctValue = new InterpretedProjection(expr)
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     val evaluatedExpr = distinctValue(input)
     if (!evaluatedExpr.anyNull) {
       seen.add(evaluatedExpr)
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = seen.size.toLong
+  override def eval(input: InternalRow): Any = seen.size.toLong
 }
 
 case class FirstFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
@@ -723,13 +723,13 @@ case class FirstFunction(expr: Expression, base: AggregateExpression) extends Ag
 
   var result: Any = null
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     if (result == null) {
       result = expr.eval(input)
     }
   }
 
-  override def eval(input: catalyst.InternalRow): Any = result
+  override def eval(input: InternalRow): Any = result
 }
 
 case class LastFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
@@ -737,11 +737,11 @@ case class LastFunction(expr: Expression, base: AggregateExpression) extends Agg
 
   var result: Any = null
 
-  override def update(input: catalyst.InternalRow): Unit = {
+  override def update(input: InternalRow): Unit = {
     result = input
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
-    if (result != null) expr.eval(result.asInstanceOf[catalyst.InternalRow]) else null
+  override def eval(input: InternalRow): Any = {
+    if (result != null) expr.eval(result.asInstanceOf[InternalRow]) else null
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 0ba2ff75aac5c..18ddac1b598e6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -30,7 +30,7 @@ abstract class UnaryArithmetic extends UnaryExpression {
   override def nullable: Boolean = child.nullable
   override def dataType: DataType = child.dataType
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE = child.eval(input)
     if (evalE == null) {
       null
@@ -125,7 +125,7 @@ abstract class BinaryArithmetic extends BinaryExpression {
 
   protected def checkTypesInternal(t: DataType): TypeCheckResult
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE1 = left.eval(input)
     if(evalE1 == null) {
       null
@@ -220,7 +220,7 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
     case it: IntegralType => it.integral.asInstanceOf[Integral[Any]].quot
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE2 = right.eval(input)
     if (evalE2 == null || evalE2 == 0) {
       null
@@ -280,7 +280,7 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
     case i: FractionalType => i.asIntegral.asInstanceOf[Integral[Any]]
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE2 = right.eval(input)
     if (evalE2 == null || evalE2 == 0) {
       null
@@ -331,7 +331,7 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
 
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE1 = left.eval(input)
     val evalE2 = right.eval(input)
     if (evalE1 == null) {
@@ -385,7 +385,7 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
 
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE1 = left.eval(input)
     val evalE2 = right.eval(input)
     if (evalE1 == null) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 244a06638f61f..54f06aaa10484 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -35,7 +35,7 @@ class IntegerHashSet extends org.apache.spark.util.collection.OpenHashSet[Int]
 class LongHashSet extends org.apache.spark.util.collection.OpenHashSet[Long]
 
 /**
- * Java source for evaluating an [[Expression]] given a [[catalyst.InternalRow]] of input.
+ * Java source for evaluating an [[Expression]] given a [[InternalRow]] of input.
  *
  * @param code The sequence of statements required to evaluate the expression.
  * @param isNull A term that holds a boolean value representing whether the expression evaluated
@@ -184,13 +184,13 @@ class CodeGenContext {
   }
 
   /**
-   * List of data types that have special accessors and setters in [[catalyst.InternalRow]].
+   * List of data types that have special accessors and setters in [[InternalRow]].
    */
   val nativeTypes =
     Seq(IntegerType, BooleanType, LongType, DoubleType, FloatType, ShortType, ByteType)
 
   /**
-   * Returns true if the data type has a special accessor and setter in [[catalyst.InternalRow]].
+   * Returns true if the data type has a special accessor and setter in [[InternalRow]].
    */
   def isNativeType(dt: DataType): Boolean = nativeTypes.contains(dt)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index 35cb954c54308..573a9ea0a5471 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions.codegen
 
-import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.expressions._
 
 // MutableProjection is not accessible in Java
@@ -25,7 +24,7 @@ abstract class BaseMutableProjection extends MutableProjection {}
 
 /**
  * Generates byte code that produces a [[MutableRow]] object that can update itself based on a new
- * input [[catalyst.InternalRow]] for a fixed set of [[Expression Expressions]].
+ * input [[InternalRow]] for a fixed set of [[Expression Expressions]].
  */
 object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => MutableProjection] {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index db5d570aeb6d4..3e9ee60f33037 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -19,15 +19,15 @@ package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.Logging
 import org.apache.spark.annotation.Private
-import org.apache.spark.sql.{catalyst, Row}
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
 
 /**
  * Inherits some default implementation for Java from `Ordering[Row]`
  */
 @Private
-class BaseOrdering extends Ordering[catalyst.InternalRow] {
-  def compare(a: catalyst.InternalRow, b: catalyst.InternalRow): Int = {
+class BaseOrdering extends Ordering[InternalRow] {
+  def compare(a: InternalRow, b: InternalRow): Int = {
     throw new UnsupportedOperationException
   }
 }
@@ -37,7 +37,7 @@ class BaseOrdering extends Ordering[catalyst.InternalRow] {
  * [[Expression Expressions]].
  */
 object GenerateOrdering
-    extends CodeGenerator[Seq[SortOrder], Ordering[catalyst.InternalRow]] with Logging {
+    extends CodeGenerator[Seq[SortOrder], Ordering[InternalRow]] with Logging {
   import scala.reflect.runtime.universe._
 
   protected def canonicalize(in: Seq[SortOrder]): Seq[SortOrder] =
@@ -46,7 +46,7 @@ object GenerateOrdering
   protected def bind(in: Seq[SortOrder], inputSchema: Seq[Attribute]): Seq[SortOrder] =
     in.map(BindReferences.bindReference(_, inputSchema))
 
-  protected def create(ordering: Seq[SortOrder]): Ordering[catalyst.InternalRow] = {
+  protected def create(ordering: Seq[SortOrder]): Ordering[InternalRow] = {
     val a = newTermName("a")
     val b = newTermName("b")
     val ctx = newCodeGenContext()
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
index 9e191dc2e9422..dad4364bdd94a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
@@ -24,20 +24,20 @@ import org.apache.spark.sql.catalyst.expressions._
  * Interface for generated predicate
  */
 abstract class Predicate {
-  def eval(r: catalyst.InternalRow): Boolean
+  def eval(r: InternalRow): Boolean
 }
 
 /**
  * Generates bytecode that evaluates a boolean [[Expression]] on a given input [[InternalRow]].
  */
-object GeneratePredicate extends CodeGenerator[Expression, (catalyst.InternalRow) => Boolean] {
+object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Boolean] {
 
   protected def canonicalize(in: Expression): Expression = ExpressionCanonicalizer.execute(in)
 
   protected def bind(in: Expression, inputSchema: Seq[Attribute]): Expression =
     BindReferences.bindReference(in, inputSchema)
 
-  protected def create(predicate: Expression): ((catalyst.InternalRow) => Boolean) = {
+  protected def create(predicate: Expression): ((InternalRow) => Boolean) = {
     val ctx = newCodeGenContext()
     val eval = predicate.gen(ctx)
     val code = s"""
@@ -66,6 +66,6 @@ object GeneratePredicate extends CodeGenerator[Expression, (catalyst.InternalRow
     // fetch the only one method `generate(Expression[])`
     val m = c.getDeclaredMethods()(0)
     val p = m.invoke(c.newInstance(), ctx.references.toArray).asInstanceOf[Predicate]
-    (r: catalyst.InternalRow) => p.eval(r)
+    (r: InternalRow) => p.eval(r)
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
index a6913cc03ca20..1aaf9b309efc3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
@@ -42,7 +42,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
 
   override def nullable: Boolean = false
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     children.map(_.eval(input))
   }
 
@@ -70,7 +70,7 @@ case class CreateStruct(children: Seq[NamedExpression]) extends Expression {
 
   override def nullable: Boolean = false
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     InternalRow(children.map(_.eval(input)): _*)
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index a119c313007c8..1d7393d3d91f9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -43,7 +43,7 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
 
   override def dataType: DataType = trueValue.dataType
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     if (true == predicate.eval(input)) {
       trueValue.eval(input)
     } else {
@@ -138,7 +138,7 @@ case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike {
   }
 
   /** Written in imperative fashion for performance considerations. */
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val len = branchesArr.length
     var i = 0
     // If all branches fail and an elseVal is not provided, the whole statement
@@ -230,7 +230,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
   }
 
   /** Written in imperative fashion for performance considerations. */
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evaluatedKey = key.eval(input)
     val len = branchesArr.length
     var i = 0
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
index de8b66bc3bcbd..2bc893af02641 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
@@ -29,7 +29,7 @@ case class UnscaledValue(child: Expression) extends UnaryExpression {
   override def nullable: Boolean = child.nullable
   override def toString: String = s"UnscaledValue($child)"
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val childResult = child.eval(input)
     if (childResult == null) {
       null
@@ -51,7 +51,7 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un
   override def nullable: Boolean = child.nullable
   override def toString: String = s"MakeDecimal($child,$precision,$scale)"
 
-  override def eval(input: catalyst.InternalRow): Decimal = {
+  override def eval(input: InternalRow): Decimal = {
     val childResult = child.eval(input)
     if (childResult == null) {
       null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index a80c255a296af..f30cb42d12b83 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -54,13 +54,13 @@ abstract class Generator extends Expression {
   def elementTypes: Seq[(DataType, Boolean)]
 
   /** Should be implemented by child classes to perform specific Generators. */
-  override def eval(input: catalyst.InternalRow): TraversableOnce[catalyst.InternalRow]
+  override def eval(input: InternalRow): TraversableOnce[InternalRow]
 
   /**
    * Notifies that there are no more rows to process, clean up code, and additional
    * rows can be made here.
    */
-  def terminate(): TraversableOnce[catalyst.InternalRow] = Nil
+  def terminate(): TraversableOnce[InternalRow] = Nil
 }
 
 /**
@@ -68,22 +68,22 @@ abstract class Generator extends Expression {
  */
 case class UserDefinedGenerator(
     elementTypes: Seq[(DataType, Boolean)],
-    function: catalyst.InternalRow => TraversableOnce[catalyst.InternalRow],
+    function: InternalRow => TraversableOnce[InternalRow],
     children: Seq[Expression])
   extends Generator {
 
   @transient private[this] var inputRow: InterpretedProjection = _
-  @transient private[this] var convertToScala: (catalyst.InternalRow) => catalyst.InternalRow = _
+  @transient private[this] var convertToScala: (InternalRow) => InternalRow = _
 
   private def initializeConverters(): Unit = {
     inputRow = new InterpretedProjection(children)
     convertToScala = {
       val inputSchema = StructType(children.map(e => StructField(e.simpleString, e.dataType, true)))
       CatalystTypeConverters.createToScalaConverter(inputSchema)
-    }.asInstanceOf[(catalyst.InternalRow => catalyst.InternalRow)]
+    }.asInstanceOf[(InternalRow => InternalRow)]
   }
 
-  override def eval(input: catalyst.InternalRow): TraversableOnce[catalyst.InternalRow] = {
+  override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
     if (inputRow == null) {
       initializeConverters()
     }
@@ -109,7 +109,7 @@ case class Explode(child: Expression)
     case MapType(kt, vt, valueContainsNull) => (kt, false) :: (vt, valueContainsNull) :: Nil
   }
 
-  override def eval(input: catalyst.InternalRow): TraversableOnce[catalyst.InternalRow] = {
+  override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
     child.dataType match {
       case ArrayType(_, _) =>
         val inputArray = child.eval(input).asInstanceOf[Seq[Any]]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index d8fff2b84d585..6c86a47ba200c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -88,7 +88,7 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
     case _ => false
   }
 
-  override def eval(input: catalyst.InternalRow): Any = value
+  override def eval(input: InternalRow): Any = value
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     // change the isNull and primitive to consts, to inline them
@@ -143,9 +143,9 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
 case class MutableLiteral(var value: Any, dataType: DataType, nullable: Boolean = true)
     extends LeafExpression {
 
-  def update(expression: Expression, input: catalyst.InternalRow): Unit = {
+  def update(expression: Expression, input: InternalRow): Unit = {
     value = expression.eval(input)
   }
 
-  override def eval(input: catalyst.InternalRow): Any = value
+  override def eval(input: InternalRow): Any = value
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 6f90d607ddbcc..42c596b5b31ab 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -35,7 +35,7 @@ abstract class LeafMathExpression(c: Double, name: String)
   override def nullable: Boolean = false
   override def toString: String = s"$name()"
 
-  override def eval(input: catalyst.InternalRow): Any = c
+  override def eval(input: InternalRow): Any = c
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     s"""
@@ -61,7 +61,7 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
   override def nullable: Boolean = true
   override def toString: String = s"$name($child)"
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE = child.eval(input)
     if (evalE == null) {
       null
@@ -104,7 +104,7 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
 
   override def dataType: DataType = DoubleType
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE1 = left.eval(input)
     if (evalE1 == null) {
       null
@@ -216,7 +216,7 @@ case class ToRadians(child: Expression) extends UnaryMathExpression(math.toRadia
 case class Atan2(left: Expression, right: Expression)
   extends BinaryMathExpression(math.atan2, "ATAN2") {
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     val evalE1 = left.eval(input)
     if (evalE1 == null) {
       null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 20505129e96c3..f22c8a7f6a374 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -115,7 +115,7 @@ case class Alias(child: Expression, name: String)(
   // Alias(Generator, xx) need to be transformed into Generate(generator, ...)
   override lazy val resolved = childrenResolved && !child.isInstanceOf[Generator]
 
-  override def eval(input: catalyst.InternalRow): Any = child.eval(input)
+  override def eval(input: InternalRow): Any = child.eval(input)
 
   override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
 
@@ -231,7 +231,7 @@ case class AttributeReference(
   }
 
   // Unresolved attributes are transient at compile time and don't get evaluated during execution.
-  override def eval(input: catalyst.InternalRow = null): Any =
+  override def eval(input: InternalRow = null): Any =
     throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 
   override def toString: String = s"$name#${exprId.id}$typeSuffix"
@@ -253,7 +253,7 @@ case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[E
   override def withName(newName: String): Attribute = throw new UnsupportedOperationException
   override def qualifiers: Seq[String] = throw new UnsupportedOperationException
   override def exprId: ExprId = throw new UnsupportedOperationException
-  override def eval(input: catalyst.InternalRow): Any = throw new UnsupportedOperationException
+  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
   override def nullable: Boolean = throw new UnsupportedOperationException
   override def dataType: DataType = NullType
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index 292d626f019f4..0d06589a795b1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
-import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.types.DataType
 
 case class Coalesce(children: Seq[Expression]) extends Expression {
@@ -44,7 +43,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
       this, s"Coalesce cannot have children of different types. $childTypes")
   }
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     var i = 0
     var result: Any = null
     val childIterator = children.iterator
@@ -78,7 +77,7 @@ case class IsNull(child: Expression) extends Predicate with trees.UnaryNode[Expr
   override def foldable: Boolean = child.foldable
   override def nullable: Boolean = false
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     child.eval(input) == null
   }
 
@@ -97,7 +96,7 @@ case class IsNotNull(child: Expression) extends Predicate with trees.UnaryNode[E
   override def nullable: Boolean = false
   override def toString: String = s"IS NOT NULL $child"
 
-  override def eval(input: catalyst.InternalRow): Any = {
+  override def eval(input: InternalRow): Any = {
     child.eval(input) != null
   }
 
@@ -119,7 +118,7 @@ case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate
 
   private[this] val childrenArray = children.toArray
 
-  override def eval(input: catalyst.InternalRow): Boolean = {
+  override def eval(input: InternalRow): Boolean = {
     var numNonNulls = 0
     var i = 0
     while (i < childrenArray.length && numNonNulls < n) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
index c2e57b4715a79..d24d74e7b82ad 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.catalyst
 
-import org.apache.spark.sql.catalyst
-
 /**
  * A set of classes that can be used to represent trees of relational expressions.  A key goal of
  * the expression library is to hide the details of naming and scoping from developers who want to
@@ -51,9 +49,9 @@ import org.apache.spark.sql.catalyst
  */
 package object expressions  {
 
-  type InternalRow = catalyst.InternalRow
+  type InternalRow = org.apache.spark.sql.catalyst.InternalRow
 
-  val InternalRow = catalyst.InternalRow
+  val InternalRow = org.apache.spark.sql.catalyst.InternalRow
 
   /**
    * Converts a [[InternalRow]] to another Row given a sequence of expression that define each
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index f1acdfeea5793..9ca168881c5b6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1060,7 +1060,7 @@ class DataFrame private[sql](
 
     def rowFunction(row: Row): TraversableOnce[InternalRow] = {
       f(row(0).asInstanceOf[A]).map(o =>
-        catalyst.InternalRow(CatalystTypeConverters.convertToCatalyst(o, dataType)))
+        InternalRow(CatalystTypeConverters.convertToCatalyst(o, dataType)))
     }
     val generator = UserDefinedGenerator(elementTypes, rowFunction, apply(inputColumn).expr :: Nil)
 
@@ -1232,11 +1232,11 @@ class DataFrame private[sql](
       // Pivot the data so each summary is one row
       row.grouped(outputCols.size).toSeq.zip(statistics).map {
         case (aggregation, (statistic, _)) =>
-          catalyst.InternalRow(statistic :: aggregation.toList: _*)
+          InternalRow(statistic :: aggregation.toList: _*)
       }
     } else {
       // If there are no output columns, just output a single column that contains the stats.
-      statistics.map { case (name, _) => catalyst.InternalRow(name) }
+      statistics.map { case (name, _) => InternalRow(name) }
     }
 
     // All columns are string type
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index 19aef9978e732..bce0e8d70a57b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -90,7 +90,7 @@ case class HashOuterJoin(
           case r if boundCondition(joinedRow.withRight(r)) => joinedRow.copy()
         }
         if (temp.size == 0) {
-          joinedRow.withRight(rightNullRow).copy.asInstanceOf[InternalRow] :: Nil
+          joinedRow.withRight(rightNullRow).copy :: Nil
         } else {
           temp
         }

From af31335adce13e1452ce1990496c9bfac9778b5c Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sat, 13 Jun 2015 16:14:24 -0700
Subject: [PATCH 475/525] [SPARK-8319] [CORE] [SQL] Update logic related to key
 orderings in shuffle dependencies

This patch updates two pieces of logic that are related to handling of keyOrderings in ShuffleDependencies:

- The Tungsten ShuffleManager falls back to regular SortShuffleManager whenever the shuffle dependency specifies a key ordering, but technically we only need to fall back when an aggregator is also specified. This patch updates the fallback logic to reflect this so that the Tungsten optimizations can apply to more workloads.

- The SQL Exchange operator performs defensive copying of shuffle inputs when a key ordering is specified, but this is unnecessary. The copying was added to guard against cases where ExternalSorter would buffer non-serialized records in memory.  When ExternalSorter is configured without an aggregator, it uses the following logic to determine whether to buffer records in a serialized or deserialized format:

   ```scala
     private val useSerializedPairBuffer =
        ordering.isEmpty &&
        conf.getBoolean("spark.shuffle.sort.serializeMapOutputs", true) &&
        ser.supportsRelocationOfSerializedObjects
   ```

   The `newOrdering.isDefined` branch in `ExternalSorter.needToCopyObjectsBeforeShuffle`, removed by this patch, is not necessary:

   - It was checked even if we weren't using sort-based shuffle, but this was unnecessary because only SortShuffleManager performs map-side sorting.
   - Map-side sorting during shuffle writing is only performed for shuffles that perform map-side aggregation as part of the shuffle (to see this, look at how SortShuffleWriter constructs ExternalSorter).  Since SQL never pushes aggregation into Spark's shuffle, we can guarantee that both the aggregator and ordering will be empty and Spark SQL always uses serializers that support relocation, so sort-shuffle will use the serialized pair buffer unless the user has explicitly disabled it via the SparkConf feature-flag.  Therefore, I think my optimization in Exchange should be safe.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6773 from JoshRosen/SPARK-8319 and squashes the following commits:

7a14129 [Josh Rosen] Revise comments; add handler to guard against future ShuffleManager implementations
07bb2c9 [Josh Rosen] Update comment to clarify circumstances under which shuffle operates on serialized records
269089a [Josh Rosen] Avoid unnecessary copy in SQL Exchange
34e526e [Josh Rosen] Enable Tungsten shuffle for non-agg shuffles w/ key orderings
---
 .../shuffle/unsafe/UnsafeShuffleManager.scala |  3 ---
 .../unsafe/UnsafeShuffleManagerSuite.scala    | 19 ++++++++++---------
 .../apache/spark/sql/execution/Exchange.scala | 19 +++++++++++--------
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManager.scala
index f2bfef376d3ca..df7bbd64247dd 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManager.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManager.scala
@@ -56,9 +56,6 @@ private[spark] object UnsafeShuffleManager extends Logging {
     } else if (dependency.aggregator.isDefined) {
       log.debug(s"Can't use UnsafeShuffle for shuffle $shufId because an aggregator is defined")
       false
-    } else if (dependency.keyOrdering.isDefined) {
-      log.debug(s"Can't use UnsafeShuffle for shuffle $shufId because a key ordering is defined")
-      false
     } else if (dependency.partitioner.numPartitions > MAX_SHUFFLE_OUTPUT_PARTITIONS) {
       log.debug(s"Can't use UnsafeShuffle for shuffle $shufId because it has more than " +
         s"$MAX_SHUFFLE_OUTPUT_PARTITIONS partitions")
diff --git a/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManagerSuite.scala
index a73e94e05575e..6727934d8c7ca 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManagerSuite.scala
@@ -76,6 +76,15 @@ class UnsafeShuffleManagerSuite extends SparkFunSuite with Matchers {
       mapSideCombine = false
     )))
 
+    // Shuffles with key orderings are supported as long as no aggregator is specified
+    assert(canUseUnsafeShuffle(shuffleDep(
+      partitioner = new HashPartitioner(2),
+      serializer = kryo,
+      keyOrdering = Some(mock(classOf[Ordering[Any]])),
+      aggregator = None,
+      mapSideCombine = false
+    )))
+
   }
 
   test("unsupported shuffle dependencies") {
@@ -100,14 +109,7 @@ class UnsafeShuffleManagerSuite extends SparkFunSuite with Matchers {
       mapSideCombine = false
     )))
 
-    // We do not support shuffles that perform any kind of aggregation or sorting of keys
-    assert(!canUseUnsafeShuffle(shuffleDep(
-      partitioner = new HashPartitioner(2),
-      serializer = kryo,
-      keyOrdering = Some(mock(classOf[Ordering[Any]])),
-      aggregator = None,
-      mapSideCombine = false
-    )))
+    // We do not support shuffles that perform aggregation
     assert(!canUseUnsafeShuffle(shuffleDep(
       partitioner = new HashPartitioner(2),
       serializer = kryo,
@@ -115,7 +117,6 @@ class UnsafeShuffleManagerSuite extends SparkFunSuite with Matchers {
       aggregator = Some(mock(classOf[Aggregator[Any, Any, Any]])),
       mapSideCombine = false
     )))
-    // We do not support shuffles that perform any kind of aggregation or sorting of keys
     assert(!canUseUnsafeShuffle(shuffleDep(
       partitioner = new HashPartitioner(2),
       serializer = kryo,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index c9a188309a4d8..edc64a03335d6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.{RDD, ShuffledRDD}
 import org.apache.spark.serializer.Serializer
+import org.apache.spark.shuffle.hash.HashShuffleManager
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.shuffle.unsafe.UnsafeShuffleManager
 import org.apache.spark.sql.SQLContext
@@ -81,11 +82,7 @@ case class Exchange(
       shuffleManager.isInstanceOf[UnsafeShuffleManager]
     val bypassMergeThreshold = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
     val serializeMapOutputs = conf.getBoolean("spark.shuffle.sort.serializeMapOutputs", true)
-    if (newOrdering.nonEmpty) {
-      // If a new ordering is required, then records will be sorted with Spark's `ExternalSorter`,
-      // which requires a defensive copy.
-      true
-    } else if (sortBasedShuffleOn) {
+    if (sortBasedShuffleOn) {
       val bypassIsSupported = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]
       if (bypassIsSupported && partitioner.numPartitions <= bypassMergeThreshold) {
         // If we're using the original SortShuffleManager and the number of output partitions is
@@ -96,8 +93,11 @@ case class Exchange(
       } else if (serializeMapOutputs && serializer.supportsRelocationOfSerializedObjects) {
         // SPARK-4550 extended sort-based shuffle to serialize individual records prior to sorting
         // them. This optimization is guarded by a feature-flag and is only applied in cases where
-        // shuffle dependency does not specify an ordering and the record serializer has certain
-        // properties. If this optimization is enabled, we can safely avoid the copy.
+        // shuffle dependency does not specify an aggregator or ordering and the record serializer
+        // has certain properties. If this optimization is enabled, we can safely avoid the copy.
+        //
+        // Exchange never configures its ShuffledRDDs with aggregators or key orderings, so we only
+        // need to check whether the optimization is enabled and supported by our serializer.
         //
         // This optimization also applies to UnsafeShuffleManager (added in SPARK-7081).
         false
@@ -108,9 +108,12 @@ case class Exchange(
         // both cases, we must copy.
         true
       }
-    } else {
+    } else if (shuffleManager.isInstanceOf[HashShuffleManager]) {
       // We're using hash-based shuffle, so we don't need to copy.
       false
+    } else {
+      // Catch-all case to safely handle any future ShuffleManager implementations.
+      true
     }
   }
 

From ddec45279ed1061f4c05fd0760309a53581d03f5 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Sat, 13 Jun 2015 16:39:52 -0700
Subject: [PATCH 476/525] [SPARK-8052] [SQL] Use java.math.BigDecimal for
 casting String to Decimal instead of using toDouble

JIRA: https://issues.apache.org/jira/browse/SPARK-8052

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6645 from viirya/cast_string_integraltype and squashes the following commits:

e19c6a3 [Liang-Chi Hsieh] For comment.
c3e472a [Liang-Chi Hsieh] Add test.
7ced9b0 [Liang-Chi Hsieh] Use java.math.BigDecimal for casting String to Decimal instead of using toDouble.
---
 .../org/apache/spark/sql/catalyst/expressions/Cast.scala      | 3 ++-
 .../org/apache/spark/sql/hive/execution/SQLQuerySuite.scala   | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 05a04bdff9b3e..b20086bcc48b9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.math.{BigDecimal => JavaBigDecimal}
 import java.sql.{Date, Timestamp}
 import java.text.{DateFormat, SimpleDateFormat}
 
@@ -320,7 +321,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
   private[this] def castToDecimal(from: DataType, target: DecimalType): Any => Any = from match {
     case StringType =>
       buildCast[UTF8String](_, s => try {
-        changePrecision(Decimal(s.toString.toDouble), target)
+        changePrecision(Decimal(new JavaBigDecimal(s.toString)), target)
       } catch {
         case _: NumberFormatException => null
       })
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index c8e5e246322df..f8908760cc897 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -875,6 +875,10 @@ class SQLQuerySuite extends QueryTest {
     }
   }
 
+  test("Cast STRING to BIGINT") {
+    checkAnswer(sql("SELECT CAST('775983671874188101' as BIGINT)"), Row(775983671874188101L))
+  }
+
   // `Math.exp(1.0)` has different result for different jdk version, so not use createQueryTest
   test("udf_java_method") {
     checkAnswer(sql(

From a138953391975886c88bfe81d4ce6b6dd189cd32 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 13 Jun 2015 17:10:13 -0700
Subject: [PATCH 477/525] [SPARK-8347][SQL] Add unit tests for abs.

Also addressed code review feedback from #6754

Author: Reynold Xin <rxin@databricks.com>

Closes #6803 from rxin/abs and squashes the following commits:

d07beba [Reynold Xin] [SPARK-8347] Add unit tests for abs.
---
 .../catalyst/analysis/HiveTypeCoercion.scala    |  6 +++---
 .../expressions/ArithmeticExpressionSuite.scala | 17 ++++++++++++++++-
 .../spark/sql/ColumnExpressionSuite.scala       | 17 -----------------
 .../apache/spark/sql/MathExpressionsSuite.scala | 12 ++++++++++++
 .../org/apache/spark/sql/SQLQuerySuite.scala    | 12 ------------
 5 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 6ed192360dd62..e7bf7cc1f1313 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -672,13 +672,13 @@ trait HiveTypeCoercion {
         findTightestCommonTypeToString(left.dataType, right.dataType).map { widestType =>
           val newLeft = if (left.dataType == widestType) left else Cast(left, widestType)
           val newRight = if (right.dataType == widestType) right else Cast(right, widestType)
-          i.makeCopy(Array(pred, newLeft, newRight))
+          If(pred, newLeft, newRight)
         }.getOrElse(i)  // If there is no applicable conversion, leave expression unchanged.
 
       // Convert If(null literal, _, _) into boolean type.
       // In the optimizer, we should short-circuit this directly into false value.
-      case i @ If(pred, left, right) if pred.dataType == NullType =>
-        i.makeCopy(Array(Literal.create(null, BooleanType), left, right))
+      case If(pred, left, right) if pred.dataType == NullType =>
+        If(Literal.create(null, BooleanType), left, right)
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
index e1afa81a7a82f..5ff1bca260b24 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
@@ -21,7 +21,7 @@ import org.scalatest.Matchers._
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.{DoubleType, IntegerType}
+import org.apache.spark.sql.types.{Decimal, DoubleType, IntegerType}
 
 
 class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -75,6 +75,21 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     checkDoubleEvaluation(c3 % c2, (1.1 +- 0.001), row)
   }
 
+  test("Abs") {
+    def testAbs(convert: (Int) => Any): Unit = {
+      checkEvaluation(Abs(Literal(convert(0))), convert(0))
+      checkEvaluation(Abs(Literal(convert(1))), convert(1))
+      checkEvaluation(Abs(Literal(convert(-1))), convert(1))
+    }
+    testAbs(_.toByte)
+    testAbs(_.toShort)
+    testAbs(identity)
+    testAbs(_.toLong)
+    testAbs(_.toFloat)
+    testAbs(_.toDouble)
+    testAbs(Decimal(_))
+  }
+
   test("Divide") {
     checkEvaluation(Divide(Literal(2), Literal(1)), 2)
     checkEvaluation(Divide(Literal(1.0), Literal(2.0)), 0.5)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index efcdae5bce031..5a08578e7ba4b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -369,23 +369,6 @@ class ColumnExpressionSuite extends QueryTest {
     )
   }
 
-  test("abs") {
-    checkAnswer(
-      testData.select(abs('key)).orderBy('key.asc),
-      (1 to 100).map(n => Row(n))
-    )
-
-    checkAnswer(
-      negativeData.select(abs('key)).orderBy('key.desc),
-      (1 to 100).map(n => Row(n))
-    )
-
-    checkAnswer(
-      testData.select(abs(lit(null))),
-      (1 to 100).map(_ => Row(null))
-    )
-  }
-
   test("upper") {
     checkAnswer(
       lowerCaseData.select(upper('l)),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index 6561c3b2322c7..faa1d1193b509 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -236,6 +236,18 @@ class MathExpressionsSuite extends QueryTest {
     testOneToOneNonNegativeMathFunction(log1p, math.log1p)
   }
 
+  test("abs") {
+    val input =
+      Seq[(java.lang.Double, java.lang.Double)]((null, null), (0.0, 0.0), (1.5, 1.5), (-2.5, 2.5))
+    checkAnswer(
+      input.toDF("key", "value").select(abs($"key").alias("a")).sort("a"),
+      input.map(pair => Row(pair._2)))
+
+    checkAnswer(
+      input.toDF("key", "value").selectExpr("abs(key) a").sort("a"),
+      input.map(pair => Row(pair._2)))
+  }
+
   test("log2") {
     val df = Seq((1, 2)).toDF("a", "b")
     checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 6898d584414ba..d1520b757e57b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -178,18 +178,6 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Seq(Row("1"), Row("2")))
   }
 
-  test("SPARK-3176 Added Parser of SQL ABS()") {
-    checkAnswer(
-      sql("SELECT ABS(-1.3)"),
-      Row(1.3))
-    checkAnswer(
-      sql("SELECT ABS(0.0)"),
-      Row(0.0))
-    checkAnswer(
-      sql("SELECT ABS(2.5)"),
-      Row(2.5))
-  }
-
   test("aggregation with codegen") {
     val originalValue = sqlContext.conf.codegenEnabled
     sqlContext.setConf(SQLConf.CODEGEN_ENABLED, "true")

From 2d71ba4c8a629deab672869ac8e8b6a4b3aec479 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 13 Jun 2015 18:22:17 -0700
Subject: [PATCH 478/525] [SPARK-8349] [SQL] Use expression constructors
 (rather than apply) in FunctionRegistry

Author: Reynold Xin <rxin@databricks.com>

Closes #6806 from rxin/gs and squashes the following commits:

ed1aebb [Reynold Xin] Fixed style.
c7fc3e6 [Reynold Xin] [SPARK-8349][SQL] Use expression constructors (rather than apply) in FunctionRegistry
---
 .../catalyst/analysis/FunctionRegistry.scala   | 18 +++++++-----------
 .../sql/catalyst/expressions/Expression.scala  |  3 +--
 .../sql/catalyst/expressions/random.scala      | 12 ++++--------
 .../expressions/stringOperations.scala         | 11 +++++------
 .../spark/sql/catalyst/trees/TreeNode.scala    | 10 +++++-----
 5 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 45bcbf73fae98..04e306da23e4c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -158,27 +158,23 @@ object FunctionRegistry {
   /** See usage above. */
   private def expression[T <: Expression](name: String)
       (implicit tag: ClassTag[T]): (String, FunctionBuilder) = {
-    // Use the companion class to find apply methods.
-    val objectClass = Class.forName(tag.runtimeClass.getName + "$")
-    val companionObj = objectClass.getDeclaredField("MODULE$").get(null)
-
-    // See if we can find an apply that accepts Seq[Expression]
-    val varargApply = Try(objectClass.getDeclaredMethod("apply", classOf[Seq[_]])).toOption
 
+    // See if we can find a constructor that accepts Seq[Expression]
+    val varargCtor = Try(tag.runtimeClass.getDeclaredConstructor(classOf[Seq[_]])).toOption
     val builder = (expressions: Seq[Expression]) => {
-      if (varargApply.isDefined) {
+      if (varargCtor.isDefined) {
         // If there is an apply method that accepts Seq[Expression], use that one.
-        varargApply.get.invoke(companionObj, expressions).asInstanceOf[Expression]
+        varargCtor.get.newInstance(expressions).asInstanceOf[Expression]
       } else {
-        // Otherwise, find an apply method that matches the number of arguments, and use that.
+        // Otherwise, find an ctor method that matches the number of arguments, and use that.
         val params = Seq.fill(expressions.size)(classOf[Expression])
-        val f = Try(objectClass.getDeclaredMethod("apply", params : _*)) match {
+        val f = Try(tag.runtimeClass.getDeclaredConstructor(params : _*)) match {
           case Success(e) =>
             e
           case Failure(e) =>
             throw new AnalysisException(s"Invalid number of arguments for function $name")
         }
-        f.invoke(companionObj, expressions : _*).asInstanceOf[Expression]
+        f.newInstance(expressions : _*).asInstanceOf[Expression]
       }
     }
     (name, builder)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 61de34bfa4b3b..7427ca76b54d7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -27,8 +27,7 @@ import org.apache.spark.sql.types._
 /**
  * If an expression wants to be exposed in the function registry (so users can call it with
  * "name(arguments...)", the concrete implementation must be a case class whose constructor
- * arguments are all Expressions types. In addition, if it needs to support more than one
- * constructor, define those constructors explicitly as apply methods in the companion object.
+ * arguments are all Expressions types.
  *
  * See [[Substring]] for an example.
  */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 7e8033307ea4e..cc34467391b96 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -49,12 +49,10 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
 /** Generate a random column with i.i.d. uniformly distributed values in [0, 1). */
 case class Rand(seed: Long) extends RDG(seed) {
   override def eval(input: InternalRow): Double = rng.nextDouble()
-}
 
-object Rand {
-  def apply(): Rand = apply(Utils.random.nextLong())
+  def this() = this(Utils.random.nextLong())
 
-  def apply(seed: Expression): Rand = apply(seed match {
+  def this(seed: Expression) = this(seed match {
     case IntegerLiteral(s) => s
     case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
   })
@@ -63,12 +61,10 @@ object Rand {
 /** Generate a random column with i.i.d. gaussian random distribution. */
 case class Randn(seed: Long) extends RDG(seed) {
   override def eval(input: InternalRow): Double = rng.nextGaussian()
-}
 
-object Randn {
-  def apply(): Randn = apply(Utils.random.nextLong())
+  def this() = this(Utils.random.nextLong())
 
-  def apply(seed: Expression): Randn = apply(seed match {
+  def this(seed: Expression) = this(seed match {
     case IntegerLiteral(s) => s
     case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
   })
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 8ca8d22bc4697..315c63e63c635 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import java.util.regex.Pattern
 
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
+import org.apache.spark.sql.catalyst.expressions.Substring
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -225,6 +226,10 @@ case class EndsWith(left: Expression, right: Expression)
 case class Substring(str: Expression, pos: Expression, len: Expression)
   extends Expression with ExpectsInputTypes {
 
+  def this(str: Expression, pos: Expression) = {
+    this(str, pos, Literal(Integer.MAX_VALUE))
+  }
+
   override def foldable: Boolean = str.foldable && pos.foldable && len.foldable
 
   override  def nullable: Boolean = str.nullable || pos.nullable || len.nullable
@@ -290,12 +295,6 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
   }
 }
 
-object Substring {
-  def apply(str: Expression, pos: Expression): Substring = {
-    apply(str, pos, Literal(Integer.MAX_VALUE))
-  }
-}
-
 /**
  * A function that return the length of the given string expression.
  */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 36d005d0e1684..5964e3dc3d77e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -344,11 +344,11 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
    * @param newArgs the new product arguments.
    */
   def makeCopy(newArgs: Array[AnyRef]): this.type = attachTree(this, "makeCopy") {
-    val defaultCtor =
-      getClass.getConstructors
-        .find(_.getParameterTypes.size != 0)
-        .headOption
-        .getOrElse(sys.error(s"No valid constructor for $nodeName"))
+    val ctors = getClass.getConstructors.filter(_.getParameterTypes.size != 0)
+    if (ctors.isEmpty) {
+      sys.error(s"No valid constructor for $nodeName")
+    }
+    val defaultCtor = ctors.maxBy(_.getParameterTypes.size)
 
     try {
       CurrentOrigin.withOrigin(origin) {

From 35d1267cf8e918032c92a206b22bb301bf0c806e Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <dusenberrymw@gmail.com>
Date: Sat, 13 Jun 2015 21:22:46 -0700
Subject: [PATCH 479/525] [Spark-8343] [Streaming] [Docs] Improve Spark
 Streaming Guides.

This improves the Spark Streaming Guides by fixing broken links, rewording confusing sections, fixing typos, adding missing words, etc.

Author: Mike Dusenberry <dusenberrymw@gmail.com>

Closes #6801 from dusenberrymw/SPARK-8343_Improve_Spark_Streaming_Guides_MERGED and squashes the following commits:

6688090 [Mike Dusenberry] Improvements to the Spark Streaming Custom Receiver Guide, including slight rewording of confusing sections, and fixing typos & missing words.
436fbd8 [Mike Dusenberry] Bunch of improvements to the Spark Streaming Guide, including fixing broken links, slight rewording of confusing sections, fixing typos & missing words, etc.
---
 docs/streaming-custom-receivers.md  |  26 ++-
 docs/streaming-programming-guide.md | 247 ++++++++++++++--------------
 2 files changed, 133 insertions(+), 140 deletions(-)

diff --git a/docs/streaming-custom-receivers.md b/docs/streaming-custom-receivers.md
index 6a2048121f8bf..a75587a92adc7 100644
--- a/docs/streaming-custom-receivers.md
+++ b/docs/streaming-custom-receivers.md
@@ -4,7 +4,7 @@ title: Spark Streaming Custom Receivers
 ---
 
 Spark Streaming can receive streaming data from any arbitrary data source beyond
-the one's for which it has in-built support (that is, beyond Flume, Kafka, Kinesis, files, sockets, etc.).
+the ones for which it has built-in support (that is, beyond Flume, Kafka, Kinesis, files, sockets, etc.).
 This requires the developer to implement a *receiver* that is customized for receiving data from
 the concerned data source. This guide walks through the process of implementing a custom receiver
 and using it in a Spark Streaming application. Note that custom receivers can be implemented
@@ -21,15 +21,15 @@ A custom receiver must extend this abstract class by implementing two methods
 - `onStop()`: Things to do to stop receiving data.
 
 Both `onStart()` and `onStop()` must not block indefinitely. Typically, `onStart()` would start the threads
-that responsible for receiving the data and `onStop()` would ensure that the receiving by those threads
+that are responsible for receiving the data, and `onStop()` would ensure that these threads receiving the data
 are stopped. The receiving threads can also use `isStopped()`, a `Receiver` method, to check whether they
 should stop receiving data.
 
 Once the data is received, that data can be stored inside Spark
 by calling `store(data)`, which is a method provided by the Receiver class.
-There are number of flavours of `store()` which allow you store the received data
-record-at-a-time or as whole collection of objects / serialized bytes. Note that the flavour of
-`store()` used to implemented a receiver affects its reliability and fault-tolerance semantics.
+There are a number of flavors of `store()` which allow one to store the received data
+record-at-a-time or as whole collection of objects / serialized bytes. Note that the flavor of
+`store()` used to implement a receiver affects its reliability and fault-tolerance semantics.
 This is discussed [later](#receiver-reliability) in more detail.
 
 Any exception in the receiving threads should be caught and handled properly to avoid silent
@@ -60,7 +60,7 @@ class CustomReceiver(host: String, port: Int)
 
   def onStop() {
    // There is nothing much to do as the thread calling receive()
-   // is designed to stop by itself isStopped() returns false
+   // is designed to stop by itself if isStopped() returns false
   }
 
   /** Create a socket connection and receive data until receiver is stopped */
@@ -123,7 +123,7 @@ public class JavaCustomReceiver extends Receiver<String> {
 
   public void onStop() {
     // There is nothing much to do as the thread calling receive()
-    // is designed to stop by itself isStopped() returns false
+    // is designed to stop by itself if isStopped() returns false
   }
 
   /** Create a socket connection and receive data until receiver is stopped */
@@ -167,7 +167,7 @@ public class JavaCustomReceiver extends Receiver<String> {
 
 The custom receiver can be used in a Spark Streaming application by using
 `streamingContext.receiverStream(<instance of custom receiver>)`. This will create
-input DStream using data received by the instance of custom receiver, as shown below
+an input DStream using data received by the instance of custom receiver, as shown below:
 
 <div class="codetabs">
 <div data-lang="scala"  markdown="1" >
@@ -206,22 +206,20 @@ there are two kinds of receivers based on their reliability and fault-tolerance
   and stored in Spark reliably (that is, replicated successfully). Usually,
   implementing this receiver involves careful consideration of the semantics of source
   acknowledgements.
-1. *Unreliable Receiver* - These are receivers for unreliable sources that do not support
-  acknowledging. Even for reliable sources, one may implement an unreliable receiver that
-  do not go into the complexity of acknowledging correctly.
+1. *Unreliable Receiver* - An *unreliable receiver* does *not* send acknowledgement to a source. This can be used for sources that do not support acknowledgement, or even for reliable sources when one does not want or need to go into the complexity of acknowledgement.
 
 To implement a *reliable receiver*, you have to use `store(multiple-records)` to store data.
-This flavour of `store` is a blocking call which returns only after all the given records have
+This flavor of `store` is a blocking call which returns only after all the given records have
 been stored inside Spark. If the receiver's configured storage level uses replication
 (enabled by default), then this call returns after replication has completed.
 Thus it ensures that the data is reliably stored, and the receiver can now acknowledge the
-source appropriately. This ensures that no data is caused when the receiver fails in the middle
+source appropriately. This ensures that no data is lost when the receiver fails in the middle
 of replicating data -- the buffered data will not be acknowledged and hence will be later resent
 by the source.
 
 An *unreliable receiver* does not have to implement any of this logic. It can simply receive
 records from the source and insert them one-at-a-time using `store(single-record)`. While it does
-not get the reliability guarantees of `store(multiple-records)`, it has the following advantages.
+not get the reliability guarantees of `store(multiple-records)`, it has the following advantages:
 
 - The system takes care of chunking that data into appropriate sized blocks (look for block
 interval in the [Spark Streaming Programming Guide](streaming-programming-guide.html)).
diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md
index 836f0473597d8..1eb3b30332e4f 100644
--- a/docs/streaming-programming-guide.md
+++ b/docs/streaming-programming-guide.md
@@ -11,7 +11,7 @@ description: Spark Streaming programming guide and tutorial for Spark SPARK_VERS
 # Overview
 Spark Streaming is an extension of the core Spark API that enables scalable, high-throughput,
 fault-tolerant stream processing of live data streams. Data can be ingested from many sources
-like Kafka, Flume, Twitter, ZeroMQ, Kinesis or TCP sockets can be processed using complex
+like Kafka, Flume, Twitter, ZeroMQ, Kinesis, or TCP sockets, and can be processed using complex
 algorithms expressed with high-level functions like `map`, `reduce`, `join` and `window`.
 Finally, processed data can be pushed out to filesystems, databases,
 and live dashboards. In fact, you can apply Spark's
@@ -52,7 +52,7 @@ different languages.
 
 **Note:** Python API for Spark Streaming has been introduced in Spark 1.2. It has all the DStream
 transformations and almost all the output operations available in Scala and Java interfaces.
-However, it has only support for basic sources like text files and text data over sockets.
+However, it only has support for basic sources like text files and text data over sockets.
 APIs for additional sources, like Kafka and Flume, will be available in the future.
 Further information about available features in the Python API are mentioned throughout this
 document; look out for the tag
@@ -69,10 +69,10 @@ do is as follows.
 
 <div class="codetabs">
 <div data-lang="scala"  markdown="1" >
-First, we import the names of the Spark Streaming classes, and some implicit
-conversions from StreamingContext into our environment, to add useful methods to
+First, we import the names of the Spark Streaming classes and some implicit
+conversions from StreamingContext into our environment in order to add useful methods to
 other classes we need (like DStream). [StreamingContext](api/scala/index.html#org.apache.spark.streaming.StreamingContext) is the
-main entry point for all streaming functionality. We create a local StreamingContext with two execution threads,  and batch interval of 1 second.
+main entry point for all streaming functionality. We create a local StreamingContext with two execution threads,  and a batch interval of 1 second.
 
 {% highlight scala %}
 import org.apache.spark._
@@ -96,7 +96,7 @@ val lines = ssc.socketTextStream("localhost", 9999)
 
 This `lines` DStream represents the stream of data that will be received from the data
 server. Each record in this DStream is a line of text. Next, we want to split the lines by
-space into words.
+space characters into words.
 
 {% highlight scala %}
 // Split each line into words
@@ -463,7 +463,7 @@ receive it there. However, for local testing and unit tests, you can pass "local
 in-process (detects the number of cores in the local system). Note that this internally creates a [SparkContext](api/scala/index.html#org.apache.spark.SparkContext) (starting point of all Spark functionality) which can be accessed as `ssc.sparkContext`.
 
 The batch interval must be set based on the latency requirements of your application
-and available cluster resources. See the [Performance Tuning](#setting-the-right-batch-size)
+and available cluster resources. See the [Performance Tuning](#setting-the-right-batch-interval)
 section for more details.
 
 A `StreamingContext` object can also be created from an existing `SparkContext` object.
@@ -498,7 +498,7 @@ receive it there. However, for local testing and unit tests, you can pass "local
 in-process. Note that this internally creates a [JavaSparkContext](api/java/index.html?org/apache/spark/api/java/JavaSparkContext.html) (starting point of all Spark functionality) which can be accessed as `ssc.sparkContext`.
 
 The batch interval must be set based on the latency requirements of your application
-and available cluster resources. See the [Performance Tuning](#setting-the-right-batch-size)
+and available cluster resources. See the [Performance Tuning](#setting-the-right-batch-interval)
 section for more details.
 
 A `JavaStreamingContext` object can also be created from an existing `JavaSparkContext`.
@@ -531,7 +531,7 @@ receive it there. However, for local testing and unit tests, you can pass "local
 in-process (detects the number of cores in the local system).
 
 The batch interval must be set based on the latency requirements of your application
-and available cluster resources. See the [Performance Tuning](#setting-the-right-batch-size)
+and available cluster resources. See the [Performance Tuning](#setting-the-right-batch-interval)
 section for more details.
 </div>
 </div>
@@ -549,7 +549,7 @@ After a context is defined, you have to do the following.
 - Once a context has been started, no new streaming computations can be set up or added to it.
 - Once a context has been stopped, it cannot be restarted.
 - Only one StreamingContext can be active in a JVM at the same time.
-- stop() on StreamingContext also stops the SparkContext. To stop only the StreamingContext, set optional parameter of `stop()` called `stopSparkContext` to false.
+- stop() on StreamingContext also stops the SparkContext. To stop only the StreamingContext, set the optional parameter of `stop()` called `stopSparkContext` to false.
 - A SparkContext can be re-used to create multiple StreamingContexts, as long as the previous StreamingContext is stopped (without stopping the SparkContext) before the next StreamingContext is created.
 
 ***
@@ -583,7 +583,7 @@ the `flatMap` operation is applied on each RDD in the `lines` DStream to generat
 
 
 These underlying RDD transformations are computed by the Spark engine. The DStream operations
-hide most of these details and provide the developer with higher-level API for convenience.
+hide most of these details and provide the developer with a higher-level API for convenience.
 These operations are discussed in detail in later sections.
 
 ***
@@ -600,7 +600,7 @@ data from a source and stores it in Spark's memory for processing.
 Spark Streaming provides two categories of built-in streaming sources.
 
 - *Basic sources*: Sources directly available in the StreamingContext API.
-  Example: file systems, socket connections, and Akka actors.
+  Examples: file systems, socket connections, and Akka actors.
 - *Advanced sources*: Sources like Kafka, Flume, Kinesis, Twitter, etc. are available through
   extra utility classes. These require linking against extra dependencies as discussed in the
   [linking](#linking) section.
@@ -610,11 +610,11 @@ We are going to discuss some of the sources present in each category later in th
 Note that, if you want to receive multiple streams of data in parallel in your streaming
 application, you can create multiple input DStreams (discussed
 further in the [Performance Tuning](#level-of-parallelism-in-data-receiving) section). This will
-create multiple receivers which will simultaneously receive multiple data streams. But note that
-Spark worker/executor as a long-running task, hence it occupies one of the cores allocated to the
-Spark Streaming application. Hence, it is important to remember that Spark Streaming application
+create multiple receivers which will simultaneously receive multiple data streams. But note that a
+Spark worker/executor is a long-running task, hence it occupies one of the cores allocated to the
+Spark Streaming application. Therefore, it is important to remember that a Spark Streaming application
 needs to be allocated enough cores (or threads, if running locally) to process the received data,
-as well as, to run the receiver(s).
+as well as to run the receiver(s).
 
 ##### Points to remember
 {:.no_toc}
@@ -623,13 +623,13 @@ as well as, to run the receiver(s).
   Either of these means that only one thread will be used for running tasks locally. If you are using
   a input DStream based on a receiver (e.g. sockets, Kafka, Flume, etc.), then the single thread will
   be used to run the receiver, leaving no thread for processing the received data. Hence, when
-  running locally, always use "local[*n*]" as the master URL where *n* > number of receivers to run
-  (see [Spark Properties](configuration.html#spark-properties.html) for information on how to set
+  running locally, always use "local[*n*]" as the master URL, where *n* > number of receivers to run
+  (see [Spark Properties](configuration.html#spark-properties) for information on how to set
   the master).
 
 - Extending the logic to running on a cluster, the number of cores allocated to the Spark Streaming
-  application must be more than the number of receivers. Otherwise the system will receive  data, but
-  not be able to process them.
+  application must be more than the number of receivers. Otherwise the system will receive data, but
+  not be able to process it.
 
 ### Basic Sources
 {:.no_toc}
@@ -639,7 +639,7 @@ which creates a DStream from text
 data received over a TCP socket connection. Besides sockets, the StreamingContext API provides
 methods for creating DStreams from files and Akka actors as input sources.
 
-- **File Streams:** For reading data from files on any file system compatible with the HDFS API (that is, HDFS, S3, NFS, etc.), a DStream can be created as
+- **File Streams:** For reading data from files on any file system compatible with the HDFS API (that is, HDFS, S3, NFS, etc.), a DStream can be created as:
 
     <div class="codetabs">
     <div data-lang="scala" markdown="1">
@@ -687,9 +687,9 @@ out of these sources, *only* Kafka is available in the Python API. We will add m
 
 This category of sources require interfacing with external non-Spark libraries, some of them with
 complex dependencies (e.g., Kafka and Flume). Hence, to minimize issues related to version conflicts
-of dependencies, the functionality to create DStreams from these sources have been moved to separate
-libraries, that can be [linked](#linking) to explicitly when necessary. For example, if you want to
-create a DStream using data from Twitter's stream of tweets, you have to do the following.
+of dependencies, the functionality to create DStreams from these sources has been moved to separate
+libraries that can be [linked](#linking) to explicitly when necessary. For example, if you want to
+create a DStream using data from Twitter's stream of tweets, you have to do the following:
 
 1. *Linking*: Add the artifact `spark-streaming-twitter_{{site.SCALA_BINARY_VERSION}}` to the
   SBT/Maven project dependencies.
@@ -719,7 +719,7 @@ TwitterUtils.createStream(jssc);
 Note that these advanced sources are not available in the Spark shell, hence applications based on
 these advanced sources cannot be tested in the shell. If you really want to use them in the Spark
 shell you will have to download the corresponding Maven artifact's JAR along with its dependencies
-and it in the classpath.
+and add it to the classpath.
 
 Some of these advanced sources are as follows.
 
@@ -743,7 +743,7 @@ Some of these advanced sources are as follows.
 
 <span class="badge" style="background-color: grey">Python API</span> This is not yet supported in Python.
 
-Input DStreams can also be created out of custom data sources. All you have to do is implement an
+Input DStreams can also be created out of custom data sources. All you have to do is implement a
 user-defined **receiver** (see next section to understand what that is) that can receive data from
 the custom sources and push it into Spark. See the [Custom Receiver
 Guide](streaming-custom-receivers.html) for details.
@@ -753,14 +753,12 @@ Guide](streaming-custom-receivers.html) for details.
 
 There can be two kinds of data sources based on their *reliability*. Sources
 (like Kafka and Flume) allow the transferred data to be acknowledged. If the system receiving
-data from these *reliable* sources acknowledge the received data correctly, it can be ensured
-that no data gets lost due to any kind of failure. This leads to two kinds of receivers.
+data from these *reliable* sources acknowledges the received data correctly, it can be ensured
+that no data will be lost due to any kind of failure. This leads to two kinds of receivers:
 
-1. *Reliable Receiver* - A *reliable receiver* correctly acknowledges a reliable
-  source that the data has been received and stored in Spark with replication.
-1. *Unreliable Receiver* - These are receivers for sources that do not support acknowledging. Even
-  for reliable sources, one may implement an unreliable receiver that do not go into the complexity
-  of acknowledging correctly.
+1. *Reliable Receiver* - A *reliable receiver* correctly sends acknowledgment to a reliable
+  source when the data has been received and stored in Spark with replication.
+1. *Unreliable Receiver* - An *unreliable receiver* does *not* send acknowledgment to a source. This can be used for sources that do not support acknowledgment, or even for reliable sources when one does not want or need to go into the complexity of acknowledgment.
 
 The details of how to write a reliable receiver are discussed in the
 [Custom Receiver Guide](streaming-custom-receivers.html).
@@ -828,7 +826,7 @@ Some of the common ones are as follows.
 </tr>
 <tr>
   <td> <b>cogroup</b>(<i>otherStream</i>, [<i>numTasks</i>]) </td>
-  <td> When called on DStream of (K, V) and (K, W) pairs, return a new DStream of
+  <td> When called on a DStream of (K, V) and (K, W) pairs, return a new DStream of
   (K, Seq[V], Seq[W]) tuples.</td>
 </tr>
 <tr>
@@ -852,13 +850,13 @@ A few of these transformations are worth discussing in more detail.
 The `updateStateByKey` operation allows you to maintain arbitrary state while continuously updating
 it with new information. To use this, you will have to do two steps.
 
-1. Define the state - The state can be of arbitrary data type.
+1. Define the state - The state can be an arbitrary data type.
 1. Define the state update function - Specify with a function how to update the state using the
-previous state and the new values from input stream.
+previous state and the new values from an input stream.
 
 Let's illustrate this with an example. Say you want to maintain a running count of each word
 seen in a text data stream. Here, the running count is the state and it is an integer. We
-define the update function as
+define the update function as:
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
@@ -947,7 +945,7 @@ operation that is not exposed in the DStream API.
 For example, the functionality of joining every batch in a data stream
 with another dataset is not directly exposed in the DStream API. However,
 you can easily use `transform` to do this. This enables very powerful possibilities. For example,
-if you want to do real-time data cleaning by joining the input data stream with precomputed
+one can do real-time data cleaning by joining the input data stream with precomputed
 spam information (maybe generated with Spark as well) and then filtering based on it.
 
 <div class="codetabs">
@@ -998,7 +996,7 @@ etc. can be changed between batches.
 #### Window Operations
 {:.no_toc}
 Spark Streaming also provides *windowed computations*, which allow you to apply
-transformations over a sliding window of data. This following figure illustrates this sliding
+transformations over a sliding window of data. The following figure illustrates this sliding
 window.
 
 <p style="text-align: center;">
@@ -1010,11 +1008,11 @@ window.
 
 As shown in the figure, every time the window *slides* over a source DStream,
 the source RDDs that fall within the window are combined and operated upon to produce the
-RDDs of the windowed DStream. In this specific case, the operation is applied over last 3 time
+RDDs of the windowed DStream. In this specific case, the operation is applied over the last 3 time
 units of data, and slides by 2 time units. This shows that any window operation needs to
 specify two parameters.
 
- * <i>window length</i> - The duration of the window (3 in the figure)
+ * <i>window length</i> - The duration of the window (3 in the figure).
  * <i>sliding interval</i> - The interval at which the window operation is performed (2 in
  the figure).
 
@@ -1022,7 +1020,7 @@ These two parameters must be multiples of the batch interval of the source DStre
 figure).
 
 Let's illustrate the window operations with an example. Say, you want to extend the
-[earlier example](#a-quick-example) by generating word counts over last 30 seconds of data,
+[earlier example](#a-quick-example) by generating word counts over the last 30 seconds of data,
 every 10 seconds. To do this, we have to apply the `reduceByKey` operation on the `pairs` DStream of
 `(word, 1)` pairs over the last 30 seconds of data. This is done using the
 operation `reduceByKeyAndWindow`.
@@ -1097,13 +1095,13 @@ said two parameters - <i>windowLength</i> and <i>slideInterval</i>.
 <tr>
   <td> <b>reduceByKeyAndWindow</b>(<i>func</i>, <i>invFunc</i>, <i>windowLength</i>,
   <i>slideInterval</i>, [<i>numTasks</i>]) </td>
-  <td> A more efficient version of the above <code>reduceByKeyAndWindow()</code> where the reduce
+  <td markdown="1"> A more efficient version of the above <code>reduceByKeyAndWindow()</code> where the reduce
   value of each window is calculated incrementally using the reduce values of the previous window.
-  This is done by reducing the new data that enter the sliding window, and "inverse reducing" the
-  old data that leave the window. An example would be that of "adding" and "subtracting" counts
-  of keys as the window slides. However, it is applicable to only "invertible reduce functions",
+  This is done by reducing the new data that enters the sliding window, and "inverse reducing" the
+  old data that leaves the window. An example would be that of "adding" and "subtracting" counts
+  of keys as the window slides. However, it is applicable only to "invertible reduce functions",
   that is, those reduce functions which have a corresponding "inverse reduce" function (taken as
-  parameter <i>invFunc</i>. Like in <code>reduceByKeyAndWindow</code>, the number of reduce tasks
+  parameter <i>invFunc</i>). Like in <code>reduceByKeyAndWindow</code>, the number of reduce tasks
   is configurable through an optional argument. Note that [checkpointing](#checkpointing) must be
   enabled for using this operation.
 </td>
@@ -1225,7 +1223,7 @@ For the Python API, see [DStream](api/python/pyspark.streaming.html#pyspark.stre
 ***
 
 ## Output Operations on DStreams
-Output operations allow DStream's data to be pushed out external systems like a database or a file systems.
+Output operations allow DStream's data to be pushed out to external systems like a database or a file systems.
 Since the output operations actually allow the transformed data to be consumed by external systems,
 they trigger the actual execution of all the DStream transformations (similar to actions for RDDs).
 Currently, the following output operations are defined:
@@ -1234,7 +1232,7 @@ Currently, the following output operations are defined:
 <tr><th style="width:30%">Output Operation</th><th>Meaning</th></tr>
 <tr>
   <td> <b>print</b>()</td>
-  <td> Prints first ten elements of every batch of data in a DStream on the driver node running
+  <td> Prints the first ten elements of every batch of data in a DStream on the driver node running
   the streaming application. This is useful for development and debugging.
   <br/>
   <span class="badge" style="background-color: grey">Python API</span> This is called
@@ -1243,12 +1241,12 @@ Currently, the following output operations are defined:
 </tr>
 <tr>
   <td> <b>saveAsTextFiles</b>(<i>prefix</i>, [<i>suffix</i>]) </td>
-  <td> Save this DStream's contents as a text files. The file name at each batch interval is
+  <td> Save this DStream's contents as text files. The file name at each batch interval is
   generated based on <i>prefix</i> and <i>suffix</i>: <i>"prefix-TIME_IN_MS[.suffix]"</i>. </td>
 </tr>
 <tr>
   <td> <b>saveAsObjectFiles</b>(<i>prefix</i>, [<i>suffix</i>]) </td>
-  <td> Save this DStream's contents as a <code>SequenceFile</code> of serialized Java objects. The file
+  <td> Save this DStream's contents as <code>SequenceFiles</code> of serialized Java objects. The file
   name at each batch interval is generated based on <i>prefix</i> and
   <i>suffix</i>: <i>"prefix-TIME_IN_MS[.suffix]"</i>.
   <br/>
@@ -1258,7 +1256,7 @@ Currently, the following output operations are defined:
 </tr>
 <tr>
   <td> <b>saveAsHadoopFiles</b>(<i>prefix</i>, [<i>suffix</i>]) </td>
-  <td> Save this DStream's contents as a Hadoop file. The file name at each batch interval is
+  <td> Save this DStream's contents as Hadoop files. The file name at each batch interval is
   generated based on <i>prefix</i> and <i>suffix</i>: <i>"prefix-TIME_IN_MS[.suffix]"</i>.
   <br>
   <span class="badge" style="background-color: grey">Python API</span> This is not available in
@@ -1268,7 +1266,7 @@ Currently, the following output operations are defined:
 <tr>
   <td> <b>foreachRDD</b>(<i>func</i>) </td>
   <td> The most generic output operator that applies a function, <i>func</i>, to each RDD generated from
-  the stream. This function should push the data in each RDD to a external system, like saving the RDD to
+  the stream. This function should push the data in each RDD to an external system, such as saving the RDD to
   files, or writing it over the network to a database. Note that the function <i>func</i> is executed
   in the driver process running the streaming application, and will usually have RDD actions in it
   that will force the computation of the streaming RDDs.</td>
@@ -1278,14 +1276,14 @@ Currently, the following output operations are defined:
 
 ### Design Patterns for using foreachRDD
 {:.no_toc}
-`dstream.foreachRDD` is a powerful primitive that allows data to sent out to external systems.
+`dstream.foreachRDD` is a powerful primitive that allows data to be sent out to external systems.
 However, it is important to understand how to use this primitive correctly and efficiently.
 Some of the common mistakes to avoid are as follows.
 
 Often writing data to external system requires creating a connection object
 (e.g. TCP connection to a remote server) and using it to send data to a remote system.
 For this purpose, a developer may inadvertently try creating a connection object at
-the Spark driver, but try to use it in a Spark worker to save records in the RDDs.
+the Spark driver, and then try to use it in a Spark worker to save records in the RDDs.
 For example (in Scala),
 
 <div class="codetabs">
@@ -1347,7 +1345,7 @@ dstream.foreachRDD(lambda rdd: rdd.foreach(sendRecord))
 Typically, creating a connection object has time and resource overheads. Therefore, creating and
 destroying a connection object for each record can incur unnecessarily high overheads and can
 significantly reduce the overall throughput of the system. A better solution is to use
-`rdd.foreachPartition` - create a single connection object and send all the records in  a RDD
+`rdd.foreachPartition` - create a single connection object and send all the records in a RDD
 partition using that connection.
 
 <div class="codetabs">
@@ -1551,7 +1549,7 @@ See the full [source code]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/ma
 </div>
 </div>
 
-You can also run SQL queries on tables defined on streaming data from a different thread (that is, asynchronous to the running StreamingContext). Just make sure that you set the StreamingContext to remember sufficient amount of streaming data such that query can run. Otherwise the StreamingContext, which is unaware of the any asynchronous SQL queries, will delete off old streaming data before the query can complete. For example, if you want to query the last batch, but your query can take 5 minutes to run, then call `streamingContext.remember(Minutes(5))` (in Scala, or equivalent in other languages).
+You can also run SQL queries on tables defined on streaming data from a different thread (that is, asynchronous to the running StreamingContext). Just make sure that you set the StreamingContext to remember a sufficient amount of streaming data such that the query can run. Otherwise the StreamingContext, which is unaware of the any asynchronous SQL queries, will delete off old streaming data before the query can complete. For example, if you want to query the last batch, but your query can take 5 minutes to run, then call `streamingContext.remember(Minutes(5))` (in Scala, or equivalent in other languages).
 
 See the [DataFrames and SQL](sql-programming-guide.html) guide to learn more about DataFrames.
 
@@ -1564,7 +1562,7 @@ You can also easily use machine learning algorithms provided by [MLlib](mllib-gu
 
 ## Caching / Persistence
 Similar to RDDs, DStreams also allow developers to persist the stream's data in memory. That is,
-using `persist()` method on a DStream will automatically persist every RDD of that DStream in
+using the `persist()` method on a DStream will automatically persist every RDD of that DStream in
 memory. This is useful if the data in the DStream will be computed multiple times (e.g., multiple
 operations on the same data). For window-based operations like `reduceByWindow` and
 `reduceByKeyAndWindow` and state-based operations like `updateStateByKey`, this is implicitly true.
@@ -1576,28 +1574,27 @@ default persistence level is set to replicate the data to two nodes for fault-to
 
 Note that, unlike RDDs, the default persistence level of DStreams keeps the data serialized in
 memory. This is further discussed in the [Performance Tuning](#memory-tuning) section. More
-information on different persistence levels can be found in
-[Spark Programming Guide](programming-guide.html#rdd-persistence).
+information on different persistence levels can be found in the [Spark Programming Guide](programming-guide.html#rdd-persistence).
 
 ***
 
 ## Checkpointing
 A streaming application must operate 24/7 and hence must be resilient to failures unrelated
 to the application logic (e.g., system failures, JVM crashes, etc.). For this to be possible,
-Spark Streaming needs to *checkpoints* enough information to a fault-
+Spark Streaming needs to *checkpoint* enough information to a fault-
 tolerant storage system such that it can recover from failures. There are two types of data
 that are checkpointed.
 
 - *Metadata checkpointing* - Saving of the information defining the streaming computation to
   fault-tolerant storage like HDFS. This is used to recover from failure of the node running the
   driver of the streaming application (discussed in detail later). Metadata includes:
-  +  *Configuration* - The configuration that were used to create the streaming application.
+  +  *Configuration* - The configuration that was used to create the streaming application.
   +  *DStream operations* - The set of DStream operations that define the streaming application.
   +  *Incomplete batches* - Batches whose jobs are queued but have not completed yet.
 - *Data checkpointing* - Saving of the generated RDDs to reliable storage. This is necessary
   in some *stateful* transformations that combine data across multiple batches. In such
-  transformations, the generated RDDs depends on RDDs of previous batches, which causes the length
-  of the dependency chain to keep increasing with time. To avoid such unbounded increase in recovery
+  transformations, the generated RDDs depend on RDDs of previous batches, which causes the length
+  of the dependency chain to keep increasing with time. To avoid such unbounded increases in recovery
    time (proportional to dependency chain), intermediate RDDs of stateful transformations are periodically
   *checkpointed* to reliable storage (e.g. HDFS) to cut off the dependency chains.
 
@@ -1611,10 +1608,10 @@ transformations are used.
 Checkpointing must be enabled for applications with any of the following requirements:
 
 - *Usage of stateful transformations* - If either `updateStateByKey` or `reduceByKeyAndWindow` (with
-  inverse function) is used in the application, then the checkpoint directory must be provided for
-  allowing periodic RDD checkpointing.
+  inverse function) is used in the application, then the checkpoint directory must be provided to
+  allow for periodic RDD checkpointing.
 - *Recovering from failures of the driver running the application* - Metadata checkpoints are used
-  for to recover with progress information.
+   to recover with progress information.
 
 Note that simple streaming applications without the aforementioned stateful transformations can be
 run without enabling checkpointing. The recovery from driver failures will also be partial in
@@ -1629,7 +1626,7 @@ Checkpointing can be enabled by setting a directory in a fault-tolerant,
 reliable file system (e.g., HDFS, S3, etc.) to which the checkpoint information will be saved.
 This is done by using `streamingContext.checkpoint(checkpointDirectory)`. This will allow you to
 use the aforementioned stateful transformations. Additionally,
-if you want make the application recover from driver failures, you should rewrite your
+if you want to make the application recover from driver failures, you should rewrite your
 streaming application to have the following behavior.
 
   + When the program is being started for the first time, it will create a new StreamingContext,
@@ -1750,18 +1747,17 @@ You can also explicitly create a `StreamingContext` from the checkpoint data and
 In addition to using `getOrCreate` one also needs to ensure that the driver process gets
 restarted automatically on failure. This can only be done by the deployment infrastructure that is
 used to run the application. This is further discussed in the
-[Deployment](#deploying-applications.html) section.
+[Deployment](#deploying-applications) section.
 
 Note that checkpointing of RDDs incurs the cost of saving to reliable storage.
 This may cause an increase in the processing time of those batches where RDDs get checkpointed.
 Hence, the interval of
 checkpointing needs to be set carefully. At small batch sizes (say 1 second), checkpointing every
 batch may significantly reduce operation throughput. Conversely, checkpointing too infrequently
-causes the lineage and task sizes to grow which may have detrimental effects. For stateful
+causes the lineage and task sizes to grow, which may have detrimental effects. For stateful
 transformations that require RDD checkpointing, the default interval is a multiple of the
 batch interval that is at least 10 seconds. It can be set by using
-`dstream.checkpoint(checkpointInterval)`. Typically, a checkpoint interval of 5 - 10 times of
-sliding interval of a DStream is good setting to try.
+`dstream.checkpoint(checkpointInterval)`. Typically, a checkpoint interval of 5 - 10 sliding intervals of a DStream is a good setting to try.
 
 ***
 
@@ -1834,17 +1830,17 @@ To run a Spark Streaming applications, you need to have the following.
 {:.no_toc}
 
 If a running Spark Streaming application needs to be upgraded with new
-application code, then there are two possible mechanism.
+application code, then there are two possible mechanisms.
 
 - The upgraded Spark Streaming application is started and run in parallel to the existing application.
-Once the new one (receiving the same data as the old one) has been warmed up and ready
+Once the new one (receiving the same data as the old one) has been warmed up and is ready
 for prime time, the old one be can be brought down. Note that this can be done for data sources that support
 sending the data to two destinations (i.e., the earlier and upgraded applications).
 
 - The existing application is shutdown gracefully (see
 [`StreamingContext.stop(...)`](api/scala/index.html#org.apache.spark.streaming.StreamingContext)
 or [`JavaStreamingContext.stop(...)`](api/java/index.html?org/apache/spark/streaming/api/java/JavaStreamingContext.html)
-for graceful shutdown options) which ensure data that have been received is completely
+for graceful shutdown options) which ensure data that has been received is completely
 processed before shutdown. Then the
 upgraded application can be started, which will start processing from the same point where the earlier
 application left off. Note that this can be done only with input sources that support source-side buffering
@@ -1879,10 +1875,10 @@ The following two metrics in web UI are particularly important:
   to finish.
 
 If the batch processing time is consistently more than the batch interval and/or the queueing
-delay keeps increasing, then it indicates the system is
-not able to process the batches as fast they are being generated and falling behind.
+delay keeps increasing, then it indicates that the system is
+not able to process the batches as fast they are being generated and is falling behind.
 In that case, consider
-[reducing](#reducing-the-processing-time-of-each-batch) the batch processing time.
+[reducing](#reducing-the-batch-processing-times) the batch processing time.
 
 The progress of a Spark Streaming program can also be monitored using the
 [StreamingListener](api/scala/index.html#org.apache.spark.streaming.scheduler.StreamingListener) interface,
@@ -1893,8 +1889,8 @@ and it is likely to be improved upon (i.e., more information reported) in the fu
 ***************************************************************************************************
 
 # Performance Tuning
-Getting the best performance of a Spark Streaming application on a cluster requires a bit of
-tuning. This section explains a number of the parameters and configurations that can tuned to
+Getting the best performance out of a Spark Streaming application on a cluster requires a bit of
+tuning. This section explains a number of the parameters and configurations that can be tuned to
 improve the performance of you application. At a high level, you need to consider two things:
 
 1. Reducing the processing time of each batch of data by efficiently using cluster resources.
@@ -1904,12 +1900,12 @@ improve the performance of you application. At a high level, you need to conside
 
 ## Reducing the Batch Processing Times
 There are a number of optimizations that can be done in Spark to minimize the processing time of
-each batch. These have been discussed in detail in [Tuning Guide](tuning.html). This section
+each batch. These have been discussed in detail in the [Tuning Guide](tuning.html). This section
 highlights some of the most important ones.
 
 ### Level of Parallelism in Data Receiving
 {:.no_toc}
-Receiving data over the network (like Kafka, Flume, socket, etc.) requires the data to deserialized
+Receiving data over the network (like Kafka, Flume, socket, etc.) requires the data to be deserialized
 and stored in Spark. If the data receiving becomes a bottleneck in the system, then consider
 parallelizing the data receiving. Note that each input DStream
 creates a single receiver (running on a worker machine) that receives a single stream of data.
@@ -1917,7 +1913,7 @@ Receiving multiple data streams can therefore be achieved by creating multiple i
 and configuring them to receive different partitions of the data stream from the source(s).
 For example, a single Kafka input DStream receiving two topics of data can be split into two
 Kafka input streams, each receiving only one topic. This would run two receivers,
-allowing data to be received in parallel, and increasing overall throughput. These multiple
+allowing data to be received in parallel, thus increasing overall throughput. These multiple
 DStreams can be unioned together to create a single DStream. Then the transformations that were
 being applied on a single input DStream can be applied on the unified stream. This is done as follows.
 
@@ -1947,10 +1943,10 @@ Another parameter that should be considered is the receiver's blocking interval,
 which is determined by the [configuration parameter](configuration.html#spark-streaming)
 `spark.streaming.blockInterval`. For most receivers, the received data is coalesced together into
 blocks of data before storing inside Spark's memory. The number of blocks in each batch
-determines the number of tasks that will be used to process those
+determines the number of tasks that will be used to process 
 the received data in a map-like transformation. The number of tasks per receiver per batch will be
 approximately (batch interval / block interval). For example, block interval of 200 ms will
-create 10 tasks per 2 second batches. Too low the number of tasks (that is, less than the number
+create 10 tasks per 2 second batches. If the number of tasks is too low (that is, less than the number
 of cores per machine), then it will be inefficient as all available cores will not be used to
 process the data. To increase the number of tasks for a given batch interval, reduce the
 block interval. However, the recommended minimum value of block interval is about 50 ms,
@@ -1958,7 +1954,7 @@ below which the task launching overheads may be a problem.
 
 An alternative to receiving data with multiple input streams / receivers is to explicitly repartition
 the input data stream (using `inputStream.repartition(<number of partitions>)`).
-This distributes the received batches of data across specified number of machines in the cluster
+This distributes the received batches of data across the specified number of machines in the cluster
 before further processing.
 
 ### Level of Parallelism in Data Processing
@@ -1966,7 +1962,7 @@ before further processing.
 Cluster resources can be under-utilized if the number of parallel tasks used in any stage of the
 computation is not high enough. For example, for distributed reduce operations like `reduceByKey`
 and `reduceByKeyAndWindow`, the default number of parallel tasks is controlled by
-the`spark.default.parallelism` [configuration property](configuration.html#spark-properties). You
+the `spark.default.parallelism` [configuration property](configuration.html#spark-properties). You
 can pass the level of parallelism as an argument (see
 [`PairDStreamFunctions`](api/scala/index.html#org.apache.spark.streaming.dstream.PairDStreamFunctions)
 documentation), or set the `spark.default.parallelism`
@@ -1974,20 +1970,20 @@ documentation), or set the `spark.default.parallelism`
 
 ### Data Serialization
 {:.no_toc}
-The overheads of data serialization can be reduce by tuning the serialization formats. In case of streaming, there are two types of data that are being serialized.
+The overheads of data serialization can be reduced by tuning the serialization formats. In the case of streaming, there are two types of data that are being serialized.
 
-* **Input data**: By default, the input data received through Receivers is stored in the executors' memory with [StorageLevel.MEMORY_AND_DISK_SER_2](api/scala/index.html#org.apache.spark.storage.StorageLevel$). That is, the data is serialized into bytes to reduce GC overheads, and replicated for tolerating executor failures. Also, the data is kept first in memory, and spilled over to disk only if the memory is unsufficient to hold all the input data necessary for the streaming computation. This serialization obviously has overheads -- the receiver must deserialize the received data and re-serialize it using Spark's serialization format. 
+* **Input data**: By default, the input data received through Receivers is stored in the executors' memory with [StorageLevel.MEMORY_AND_DISK_SER_2](api/scala/index.html#org.apache.spark.storage.StorageLevel$). That is, the data is serialized into bytes to reduce GC overheads, and replicated for tolerating executor failures. Also, the data is kept first in memory, and spilled over to disk only if the memory is insufficient to hold all of the input data necessary for the streaming computation. This serialization obviously has overheads -- the receiver must deserialize the received data and re-serialize it using Spark's serialization format. 
 
-* **Persisted RDDs generated by Streaming Operations**: RDDs generated by streaming computations may be persisted in memory. For example, window operation persist data in memory as they would be processed multiple times. However, unlike Spark, by default RDDs are persisted with [StorageLevel.MEMORY_ONLY_SER](api/scala/index.html#org.apache.spark.storage.StorageLevel$) (i.e. serialized) to minimize GC overheads.
+* **Persisted RDDs generated by Streaming Operations**: RDDs generated by streaming computations may be persisted in memory. For example, window operations persist data in memory as they would be processed multiple times. However, unlike the Spark Core default of [StorageLevel.MEMORY_ONLY](api/scala/index.html#org.apache.spark.storage.StorageLevel$), persisted RDDs generated by streaming computations are persisted with [StorageLevel.MEMORY_ONLY_SER](api/scala/index.html#org.apache.spark.storage.StorageLevel$) (i.e. serialized) by default to minimize GC overheads.
 
-In both cases, using Kryo serialization can reduce both CPU and memory overheads. See the [Spark Tuning Guide](tuning.html#data-serialization)) for more details. Consider registering custom classes, and disabling object reference tracking for Kryo (see Kryo-related configurations in the [Configuration Guide](configuration.html#compression-and-serialization)).
+In both cases, using Kryo serialization can reduce both CPU and memory overheads. See the [Spark Tuning Guide](tuning.html#data-serialization) for more details. For Kryo, consider registering custom classes, and disabling object reference tracking (see Kryo-related configurations in the [Configuration Guide](configuration.html#compression-and-serialization)).
 
-In specific cases where the amount of data that needs to be retained for the streaming application is not large, it may be feasible to persist data (both types) as deserialized objects without incurring excessive GC overheads. For example, if you are using batch intervals of few seconds and no window operations, then you can try disabling serialization in persisted data by explicitly setting the storage level accordingly. This would reduce the CPU overheads due to serialization, potentially improving performance without too much GC overheads.
+In specific cases where the amount of data that needs to be retained for the streaming application is not large, it may be feasible to persist data (both types) as deserialized objects without incurring excessive GC overheads. For example, if you are using batch intervals of a few seconds and no window operations, then you can try disabling serialization in persisted data by explicitly setting the storage level accordingly. This would reduce the CPU overheads due to serialization, potentially improving performance without too much GC overheads.
 
 ### Task Launching Overheads
 {:.no_toc}
 If the number of tasks launched per second is high (say, 50 or more per second), then the overhead
-of sending out tasks to the slaves maybe significant and will make it hard to achieve sub-second
+of sending out tasks to the slaves may be significant and will make it hard to achieve sub-second
 latencies. The overhead can be reduced by the following changes:
 
 * **Task Serialization**: Using Kryo serialization for serializing tasks can reduce the task
@@ -2006,7 +2002,7 @@ thus allowing sub-second batch size to be viable.
 For a Spark Streaming application running on a cluster to be stable, the system should be able to
 process data as fast as it is being received. In other words, batches of data should be processed
 as fast as they are being generated. Whether this is true for an application can be found by
-[monitoring](#monitoring) the processing times in the streaming web UI, where the batch
+[monitoring](#monitoring-applications) the processing times in the streaming web UI, where the batch
 processing time should be less than the batch interval.
 
 Depending on the nature of the streaming
@@ -2019,35 +2015,35 @@ production can be sustained.
 
 A good approach to figure out the right batch size for your application is to test it with a
 conservative batch interval (say, 5-10 seconds) and a low data rate. To verify whether the system
-is able to keep up with data rate, you can check the value of the end-to-end delay experienced
+is able to keep up with the data rate, you can check the value of the end-to-end delay experienced
 by each processed batch (either look for "Total delay" in Spark driver log4j logs, or use the
 [StreamingListener](api/scala/index.html#org.apache.spark.streaming.scheduler.StreamingListener)
 interface).
 If the delay is maintained to be comparable to the batch size, then system is stable. Otherwise,
 if the delay is continuously increasing, it means that the system is unable to keep up and it
 therefore unstable. Once you have an idea of a stable configuration, you can try increasing the
-data rate and/or reducing the batch size. Note that momentary increase in the delay due to
-temporary data rate increases maybe fine as long as the delay reduces back to a low value
+data rate and/or reducing the batch size. Note that a momentary increase in the delay due to
+temporary data rate increases may be fine as long as the delay reduces back to a low value
 (i.e., less than batch size).
 
 ***
 
 ## Memory Tuning
-Tuning the memory usage and GC behavior of Spark applications have been discussed in great detail
+Tuning the memory usage and GC behavior of Spark applications has been discussed in great detail
 in the [Tuning Guide](tuning.html#memory-tuning). It is strongly recommended that you read that. In this section, we discuss a few tuning parameters specifically in the context of Spark Streaming applications.
 
-The amount of cluster memory required by a Spark Streaming application depends heavily on the type of transformations used. For example, if you want to use a window operation on last 10 minutes of data, then your cluster should have sufficient memory to hold 10 minutes of worth of data in memory. Or if you want to use `updateStateByKey` with a large number of keys, then the necessary memory  will be high. On the contrary, if you want to do a simple map-filter-store operation, then necessary memory will be low.
+The amount of cluster memory required by a Spark Streaming application depends heavily on the type of transformations used. For example, if you want to use a window operation on the last 10 minutes of data, then your cluster should have sufficient memory to hold 10 minutes worth of data in memory. Or if you want to use `updateStateByKey` with a large number of keys, then the necessary memory  will be high. On the contrary, if you want to do a simple map-filter-store operation, then the necessary memory will be low.
 
-In general, since the data received through receivers are stored with StorageLevel.MEMORY_AND_DISK_SER_2, the data that does not fit in memory will spill over to the disk. This may reduce the performance of the streaming application, and hence it is advised to provide sufficient memory as required by your streaming application. Its best to try and see the memory usage on a small scale and estimate accordingly. 
+In general, since the data received through receivers is stored with StorageLevel.MEMORY_AND_DISK_SER_2, the data that does not fit in memory will spill over to the disk. This may reduce the performance of the streaming application, and hence it is advised to provide sufficient memory as required by your streaming application. Its best to try and see the memory usage on a small scale and estimate accordingly. 
 
-Another aspect of memory tuning is garbage collection. For a streaming application that require low latency, it is undesirable to have large pauses caused by JVM Garbage Collection. 
+Another aspect of memory tuning is garbage collection. For a streaming application that requires low latency, it is undesirable to have large pauses caused by JVM Garbage Collection. 
 
-There are a few parameters that can help you tune the memory usage and GC overheads.
+There are a few parameters that can help you tune the memory usage and GC overheads:
 
-* **Persistence Level of DStreams**: As mentioned earlier in the [Data Serialization](#data-serialization) section, the input data and RDDs are by default persisted as serialized bytes. This reduces both, the memory usage and GC overheads, compared to deserialized persistence. Enabling Kryo serialization further reduces serialized sizes and memory usage. Further reduction in memory usage can be achieved with compression (see the Spark configuration `spark.rdd.compress`), at the cost of CPU time.
+* **Persistence Level of DStreams**: As mentioned earlier in the [Data Serialization](#data-serialization) section, the input data and RDDs are by default persisted as serialized bytes. This reduces both the memory usage and GC overheads, compared to deserialized persistence. Enabling Kryo serialization further reduces serialized sizes and memory usage. Further reduction in memory usage can be achieved with compression (see the Spark configuration `spark.rdd.compress`), at the cost of CPU time.
 
-* **Clearing old data**: By default, all input data and persisted RDDs generated by DStream transformations are automatically cleared. Spark Streaming decides when to clear the data based on the transformations that are used. For example, if you are using window operation of 10 minutes, then Spark Streaming will keep around last 10 minutes of data, and actively throw away older data. 
-Data can be retained for longer duration (e.g. interactively querying older data) by setting `streamingContext.remember`.
+* **Clearing old data**: By default, all input data and persisted RDDs generated by DStream transformations are automatically cleared. Spark Streaming decides when to clear the data based on the transformations that are used. For example, if you are using a window operation of 10 minutes, then Spark Streaming will keep around the last 10 minutes of data, and actively throw away older data. 
+Data can be retained for a longer duration (e.g. interactively querying older data) by setting `streamingContext.remember`.
 
 * **CMS Garbage Collector**: Use of the concurrent mark-and-sweep GC is strongly recommended for keeping GC-related pauses consistently low. Even though concurrent GC is known to reduce the
 overall processing throughput of the system, its use is still recommended to achieve more
@@ -2077,18 +2073,18 @@ re-computed from the original fault-tolerant dataset using the lineage of operat
 1. Assuming that all of the RDD transformations are deterministic, the data in the final transformed
    RDD will always be the same irrespective of failures in the Spark cluster.
 
-Spark operates on data on fault-tolerant file systems like HDFS or S3. Hence,
+Spark operates on data in fault-tolerant file systems like HDFS or S3. Hence,
 all of the RDDs generated from the fault-tolerant data are also fault-tolerant. However, this is not
 the case for Spark Streaming as the data in most cases is received over the network (except when
 `fileStream` is used). To achieve the same fault-tolerance properties for all of the generated RDDs,
 the received data is replicated among multiple Spark executors in worker nodes in the cluster
 (default replication factor is 2). This leads to two kinds of data in the
-system that needs to recovered in the event of failures:
+system that need to recovered in the event of failures:
 
 1. *Data received and replicated* - This data survives failure of a single worker node as a copy
-  of it exists on one of the nodes.
+  of it exists on one of the other nodes.
 1. *Data received but buffered for replication* - Since this is not replicated,
-   the only way to recover that data is to get it again from the source.
+   the only way to recover this data is to get it again from the source.
 
 Furthermore, there are two kinds of failures that we should be concerned about:
 
@@ -2115,13 +2111,13 @@ In any stream processing system, broadly speaking, there are three steps in proc
 
 1. *Receiving the data*: The data is received from sources using Receivers or otherwise.
 
-1. *Transforming the data*: The data received data is transformed using DStream and RDD transformations.
+1. *Transforming the data*: The received data is transformed using DStream and RDD transformations.
 
 1. *Pushing out the data*: The final transformed data is pushed out to external systems like file systems, databases, dashboards, etc.
 
-If a streaming application has to achieve end-to-end exactly-once guarantees, then each step has to provide exactly-once guarantee. That is, each record must be received exactly once, transformed exactly once, and pushed to downstream systems exactly once. Let's understand the semantics of these steps in the context of Spark Streaming.
+If a streaming application has to achieve end-to-end exactly-once guarantees, then each step has to provide an exactly-once guarantee. That is, each record must be received exactly once, transformed exactly once, and pushed to downstream systems exactly once. Let's understand the semantics of these steps in the context of Spark Streaming.
 
-1. *Receiving the data*: Different input sources provided different guarantees. This is discussed in detail in the next subsection.
+1. *Receiving the data*: Different input sources provide different guarantees. This is discussed in detail in the next subsection.
 
 1. *Transforming the data*: All data that has been received will be processed _exactly once_, thanks to the guarantees that RDDs provide. Even if there are failures, as long as the received input data is accessible, the final transformed RDDs will always have the same contents.
 
@@ -2133,9 +2129,9 @@ Different input sources provide different guarantees, ranging from _at-least onc
 
 ### With Files
 {:.no_toc}
-If all of the input data is already present in a fault-tolerant files system like
-HDFS, Spark Streaming can always recover from any failure and process all the data. This gives
-*exactly-once* semantics, that all the data will be processed exactly once no matter what fails.
+If all of the input data is already present in a fault-tolerant file system like
+HDFS, Spark Streaming can always recover from any failure and process all of the data. This gives
+*exactly-once* semantics, meaning all of the data will be processed exactly once no matter what fails.
 
 ### With Receiver-based Sources
 {:.no_toc}
@@ -2144,21 +2140,21 @@ scenario and the type of receiver.
 As we discussed [earlier](#receiver-reliability), there are two types of receivers:
 
 1. *Reliable Receiver* - These receivers acknowledge reliable sources only after ensuring that
-  the received data has been replicated. If such a receiver fails,
-  the buffered (unreplicated) data does not get acknowledged to the source. If the receiver is
-  restarted, the source will resend the data, and therefore no data will be lost due to the failure.
-1. *Unreliable Receiver* - Such receivers can lose data when they fail due to worker
-  or driver failures.
+  the received data has been replicated. If such a receiver fails, the source will not receive
+  acknowledgment for the buffered (unreplicated) data. Therefore, if the receiver is
+  restarted, the source will resend the data, and no data will be lost due to the failure.
+1. *Unreliable Receiver* - Such receivers do *not* send acknowledgment and therefore *can* lose
+  data when they fail due to worker or driver failures.
 
 Depending on what type of receivers are used we achieve the following semantics.
 If a worker node fails, then there is no data loss with reliable receivers. With unreliable
 receivers, data received but not replicated can get lost. If the driver node fails,
-then besides these losses, all the past data that was received and replicated in memory will be
+then besides these losses, all of the past data that was received and replicated in memory will be
 lost. This will affect the results of the stateful transformations.
 
 To avoid this loss of past received data, Spark 1.2 introduced _write
-ahead logs_ which saves the received data to fault-tolerant storage. With the [write ahead logs
-enabled](#deploying-applications) and reliable receivers, there is zero data loss. In terms of semantics, it provides at-least once guarantee. 
+ahead logs_ which save the received data to fault-tolerant storage. With the [write ahead logs
+enabled](#deploying-applications) and reliable receivers, there is zero data loss. In terms of semantics, it provides an at-least once guarantee. 
 
 The following table summarizes the semantics under failures:
 
@@ -2219,7 +2215,7 @@ additional effort may be necessary to achieve exactly-once semantics. There are
 - *Transactional updates*: All updates are made transactionally so that updates are made exactly once atomically. One way to do this would be the following.
 
     - Use the batch time (available in `foreachRDD`) and the partition index of the RDD to create an identifier. This identifier uniquely identifies a blob data in the streaming application.
-    - Update external system with this blob transactionally (that is, exactly once, atomically) using the identifier. That is, if the identifier is not already committed, commit the partition data and the identifier atomically. Else if this was already committed, skip the update.
+    - Update external system with this blob transactionally (that is, exactly once, atomically) using the identifier. That is, if the identifier is not already committed, commit the partition data and the identifier atomically. Else, if this was already committed, skip the update.
 
           dstream.foreachRDD { (rdd, time) =>
             rdd.foreachPartition { partitionIterator =>
@@ -2229,7 +2225,6 @@ additional effort may be necessary to achieve exactly-once semantics. There are
             }
           }
 
-
 ***************************************************************************************************
 ***************************************************************************************************
 
@@ -2303,7 +2298,7 @@ package and renamed for better clarity.
   - Java docs
     * [JavaStreamingContext](api/java/index.html?org/apache/spark/streaming/api/java/JavaStreamingContext.html),
     [JavaDStream](api/java/index.html?org/apache/spark/streaming/api/java/JavaDStream.html) and
-    [PairJavaDStream](api/java/index.html?org/apache/spark/streaming/api/java/PairJavaDStream.html)
+    [JavaPairDStream](api/java/index.html?org/apache/spark/streaming/api/java/JavaPairDStream.html)
     * [KafkaUtils](api/java/index.html?org/apache/spark/streaming/kafka/KafkaUtils.html),
     [FlumeUtils](api/java/index.html?org/apache/spark/streaming/flume/FlumeUtils.html),
     [KinesisUtils](api/java/index.html?org/apache/spark/streaming/kinesis/KinesisUtils.html)

From cb7ada1196b5e13f0d4bb1988dc707a72003b8bc Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Sat, 13 Jun 2015 22:42:28 -0700
Subject: [PATCH 480/525] [SPARK-8342][SQL] Fix Decimal setOrNull

JIRA: https://issues.apache.org/jira/browse/SPARK-8342

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6797 from viirya/fix_decimal and squashes the following commits:

8a447b1 [Liang-Chi Hsieh] Add unit test.
d67a5ea [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into fix_decimal
ab6d8af [Liang-Chi Hsieh] Fix setOrNull.
---
 .../src/main/scala/org/apache/spark/sql/types/Decimal.scala | 2 +-
 .../org/apache/spark/sql/types/decimal/DecimalSuite.scala   | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index eb3c58c37f308..a85af9e04aedb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -86,7 +86,7 @@ final class Decimal extends Ordered[Decimal] with Serializable {
       if (precision < 19) {
         return null  // Requested precision is too low to represent this value
       }
-      this.decimalVal = BigDecimal(longVal)
+      this.decimalVal = BigDecimal(unscaled)
       this.longVal = 0L
     } else {
       val p = POW_10(math.min(precision, MAX_LONG_DIGITS))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
index 28b373e258311..4c0365cf1b6f9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
@@ -156,4 +156,10 @@ class DecimalSuite extends SparkFunSuite with PrivateMethodTester {
     assert(Decimal(-100) % Decimal(3) === Decimal(-1))
     assert(Decimal(100) % Decimal(0) === null)
   }
+
+  test("set/setOrNull") {
+    assert(new Decimal().set(10L, 10, 0).toUnscaledLong === 10L)
+    assert(new Decimal().set(100L, 10, 0).toUnscaledLong === 100L)
+    assert(Decimal(Long.MaxValue, 100, 0).toUnscaledLong === Long.MaxValue)
+  }
 }

From ea7fd2ff6454e8d819a39bf49901074e49b5714e Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sun, 14 Jun 2015 09:34:35 -0700
Subject: [PATCH 481/525] [SPARK-8354] [SQL] Fix off-by-factor-of-8 error when
 allocating scratch space in UnsafeFixedWidthAggregationMap

UnsafeFixedWidthAggregationMap contains an off-by-factor-of-8 error when allocating row conversion scratch space: we take a size requirement, measured in bytes, then allocate a long array of that size.  This means that we end up allocating 8x too much conversion space.

This patch fixes this by allocating a `byte[]` array instead.  This doesn't impose any new limitations on the maximum sizes of UnsafeRows, since UnsafeRowConverter already used integers when calculating the size requirements for rows.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6809 from JoshRosen/sql-bytes-vs-words-fix and squashes the following commits:

6520339 [Josh Rosen] Updates to reflect fact that UnsafeRow max size is constrained by max byte[] size
---
 .../UnsafeFixedWidthAggregationMap.java       | 30 +++++++++----------
 .../expressions/UnsafeRowConverter.scala      |  2 +-
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
index b23e0efc83332..f7849ebebc573 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
@@ -39,7 +39,7 @@ public final class UnsafeFixedWidthAggregationMap {
    * An empty aggregation buffer, encoded in UnsafeRow format. When inserting a new key into the
    * map, we copy this buffer and use it as the value.
    */
-  private final long[] emptyAggregationBuffer;
+  private final byte[] emptyAggregationBuffer;
 
   private final StructType aggregationBufferSchema;
 
@@ -63,10 +63,10 @@ public final class UnsafeFixedWidthAggregationMap {
   /**
    * Scratch space that is used when encoding grouping keys into UnsafeRow format.
    *
-   * By default, this is a 1MB array, but it will grow as necessary in case larger keys are
+   * By default, this is a 8 kb array, but it will grow as necessary in case larger keys are
    * encountered.
    */
-  private long[] groupingKeyConversionScratchSpace = new long[1024 / 8];
+  private byte[] groupingKeyConversionScratchSpace = new byte[1024 * 8];
 
   private final boolean enablePerfMetrics;
 
@@ -123,13 +123,13 @@ public UnsafeFixedWidthAggregationMap(
   }
 
   /**
-   * Convert a Java object row into an UnsafeRow, allocating it into a new long array.
+   * Convert a Java object row into an UnsafeRow, allocating it into a new byte array.
    */
-  private static long[] convertToUnsafeRow(InternalRow javaRow, StructType schema) {
+  private static byte[] convertToUnsafeRow(InternalRow javaRow, StructType schema) {
     final UnsafeRowConverter converter = new UnsafeRowConverter(schema);
-    final long[] unsafeRow = new long[converter.getSizeRequirement(javaRow)];
-    final long writtenLength =
-      converter.writeRow(javaRow, unsafeRow, PlatformDependent.LONG_ARRAY_OFFSET);
+    final byte[] unsafeRow = new byte[converter.getSizeRequirement(javaRow)];
+    final int writtenLength =
+      converter.writeRow(javaRow, unsafeRow, PlatformDependent.BYTE_ARRAY_OFFSET);
     assert (writtenLength == unsafeRow.length): "Size requirement calculation was wrong!";
     return unsafeRow;
   }
@@ -143,34 +143,34 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
     // Make sure that the buffer is large enough to hold the key. If it's not, grow it:
     if (groupingKeySize > groupingKeyConversionScratchSpace.length) {
       // This new array will be initially zero, so there's no need to zero it out here
-      groupingKeyConversionScratchSpace = new long[groupingKeySize];
+      groupingKeyConversionScratchSpace = new byte[groupingKeySize];
     } else {
       // Zero out the buffer that's used to hold the current row. This is necessary in order
       // to ensure that rows hash properly, since garbage data from the previous row could
       // otherwise end up as padding in this row. As a performance optimization, we only zero out
       // the portion of the buffer that we'll actually write to.
-      Arrays.fill(groupingKeyConversionScratchSpace, 0, groupingKeySize, 0);
+      Arrays.fill(groupingKeyConversionScratchSpace, 0, groupingKeySize, (byte) 0);
     }
-    final long actualGroupingKeySize = groupingKeyToUnsafeRowConverter.writeRow(
+    final int actualGroupingKeySize = groupingKeyToUnsafeRowConverter.writeRow(
       groupingKey,
       groupingKeyConversionScratchSpace,
-      PlatformDependent.LONG_ARRAY_OFFSET);
+      PlatformDependent.BYTE_ARRAY_OFFSET);
     assert (groupingKeySize == actualGroupingKeySize) : "Size requirement calculation was wrong!";
 
     // Probe our map using the serialized key
     final BytesToBytesMap.Location loc = map.lookup(
       groupingKeyConversionScratchSpace,
-      PlatformDependent.LONG_ARRAY_OFFSET,
+      PlatformDependent.BYTE_ARRAY_OFFSET,
       groupingKeySize);
     if (!loc.isDefined()) {
       // This is the first time that we've seen this grouping key, so we'll insert a copy of the
       // empty aggregation buffer into the map:
       loc.putNewKey(
         groupingKeyConversionScratchSpace,
-        PlatformDependent.LONG_ARRAY_OFFSET,
+        PlatformDependent.BYTE_ARRAY_OFFSET,
         groupingKeySize,
         emptyAggregationBuffer,
-        PlatformDependent.LONG_ARRAY_OFFSET,
+        PlatformDependent.BYTE_ARRAY_OFFSET,
         emptyAggregationBuffer.length
       );
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
index d771e454b5170..5c92f41c639fa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
@@ -68,7 +68,7 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
    * @param baseOffset the base offset of the destination address
    * @return the number of bytes written. This should be equal to `getSizeRequirement(row)`.
    */
-  def writeRow(row: InternalRow, baseObject: Object, baseOffset: Long): Long = {
+  def writeRow(row: InternalRow, baseObject: Object, baseOffset: Long): Int = {
     unsafeRow.pointTo(baseObject, baseOffset, writers.length, null)
     var fieldNumber = 0
     var appendCursor: Int = fixedLengthSize

From 9073a426e444e4bc6efa8608e54e0a986f38a270 Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Sun, 14 Jun 2015 11:21:42 -0700
Subject: [PATCH 482/525] [SPARK-8358] [SQL] Wait for child resolution when
 resolving generators

Author: Michael Armbrust <michael@databricks.com>

Closes #6811 from marmbrus/aliasExplodeStar and squashes the following commits:

fbd2065 [Michael Armbrust] more style
806a373 [Michael Armbrust] fix style
7cbb530 [Michael Armbrust] [SPARK-8358][SQL] Wait for child resolution when resolving generatorsa
---
 .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 6 ++++--
 .../test/scala/org/apache/spark/sql/DataFrameSuite.scala  | 8 ++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index cbd8def4f1d3c..4b7fef7126989 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -563,7 +563,9 @@ class Analyzer(
     private object AliasedGenerator {
       def unapply(e: Expression): Option[(Generator, Seq[String])] = e match {
         case Alias(g: Generator, name)
-          if g.elementTypes.size > 1 && java.util.regex.Pattern.matches("_c[0-9]+", name) => {
+          if g.resolved &&
+             g.elementTypes.size > 1 &&
+             java.util.regex.Pattern.matches("_c[0-9]+", name) => {
           // Assume the default name given by parser is "_c[0-9]+",
           // TODO in long term, move the naming logic from Parser to Analyzer.
           // In projection, Parser gave default name for TGF as does for normal UDF,
@@ -572,7 +574,7 @@ class Analyzer(
           // Let's simply ignore the default given name for this case.
           Some((g, Nil))
         }
-        case Alias(g: Generator, name) if g.elementTypes.size > 1 =>
+        case Alias(g: Generator, name) if g.resolved && g.elementTypes.size > 1 =>
           // If not given the default names, and the TGF with multiple output columns
           failAnalysis(
             s"""Expect multiple names given for ${g.getClass.getName},
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 84835c0db765d..fa98e23e3d147 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -134,6 +134,14 @@ class DataFrameSuite extends QueryTest {
     )
   }
 
+  test("explode alias and star") {
+    val df = Seq((Array("a"), 1)).toDF("a", "b")
+
+    checkAnswer(
+      df.select(explode($"a").as("a"), $"*"),
+      Row("a", Seq("a"), 1) :: Nil)
+  }
+
   test("selectExpr") {
     checkAnswer(
       testData.selectExpr("abs(key)", "value"),

From 53c16b92a537c392a7c3ebc3ef24c1e86cb1a7a4 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 14 Jun 2015 11:23:23 -0700
Subject: [PATCH 483/525] [SPARK-8362] [SQL] Add unit tests for +, -, *, /, %

Added unit tests for all supported data types for:
- Add
- Subtract
- Multiply
- Divide
- UnaryMinus
- Remainder

Fixed bugs caught by the unit tests.

Author: Reynold Xin <rxin@databricks.com>

Closes #6813 from rxin/SPARK-8362 and squashes the following commits:

fb3fe62 [Reynold Xin] Added Remainder.
3b266ba [Reynold Xin] [SPARK-8362] Add unit tests for +, -, *, /.
---
 .../sql/catalyst/expressions/arithmetic.scala |  31 ++--
 .../ArithmeticExpressionSuite.scala           | 173 +++++++++---------
 2 files changed, 99 insertions(+), 105 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 18ddac1b598e6..9d1e96572a26d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.TypeUtils
@@ -52,8 +51,8 @@ case class UnaryMinus(child: Expression) extends UnaryArithmetic {
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
-    case dt: DecimalType => defineCodeGen(ctx, ev, c => s"c.unary_$$minus()")
-    case dt: NumericType => defineCodeGen(ctx, ev, c => s"-($c)")
+    case dt: DecimalType => defineCodeGen(ctx, ev, c => s"$c.unary_$$minus()")
+    case dt: NumericType => defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})(-($c))")
   }
 
   protected override def evalInternal(evalE: Any) = numeric.negate(evalE)
@@ -144,8 +143,8 @@ abstract class BinaryArithmetic extends BinaryExpression {
       defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.$decimalMethod($eval2)")
     // byte and short are casted into int when add, minus, times or divide
     case ByteType | ShortType =>
-      defineCodeGen(ctx, ev, (eval1, eval2) =>
-        s"(${ctx.javaType(dataType)})($eval1 $symbol $eval2)")
+      defineCodeGen(ctx, ev,
+        (eval1, eval2) => s"(${ctx.javaType(dataType)})($eval1 $symbol $eval2)")
     case _ =>
       defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1 $symbol $eval2")
   }
@@ -205,7 +204,7 @@ case class Multiply(left: Expression, right: Expression) extends BinaryArithmeti
 
 case class Divide(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "/"
-  override def decimalMethod: String = "$divide"
+  override def decimalMethod: String = "$div"
 
   override def nullable: Boolean = true
 
@@ -245,11 +244,8 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
     } else {
       s"${eval2.primitive} == 0"
     }
-    val method = if (left.dataType.isInstanceOf[DecimalType]) {
-      s".$decimalMethod"
-    } else {
-      s"$symbol"
-    }
+    val method = if (left.dataType.isInstanceOf[DecimalType]) s".$decimalMethod" else s" $symbol "
+    val javaType = ctx.javaType(left.dataType)
     eval1.code + eval2.code +
       s"""
       boolean ${ev.isNull} = false;
@@ -257,7 +253,7 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
       if (${eval1.isNull} || ${eval2.isNull} || $test) {
         ${ev.isNull} = true;
       } else {
-        ${ev.primitive} = ${eval1.primitive}$method(${eval2.primitive});
+        ${ev.primitive} = ($javaType) (${eval1.primitive}$method(${eval2.primitive}));
       }
       """
   }
@@ -265,7 +261,7 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
 
 case class Remainder(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "%"
-  override def decimalMethod: String = "reminder"
+  override def decimalMethod: String = "remainder"
 
   override def nullable: Boolean = true
 
@@ -305,11 +301,8 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
     } else {
       s"${eval2.primitive} == 0"
     }
-    val method = if (left.dataType.isInstanceOf[DecimalType]) {
-      s".$decimalMethod"
-    } else {
-      s"$symbol"
-    }
+    val method = if (left.dataType.isInstanceOf[DecimalType]) s".$decimalMethod" else s" $symbol "
+    val javaType = ctx.javaType(left.dataType)
     eval1.code + eval2.code +
       s"""
       boolean ${ev.isNull} = false;
@@ -317,7 +310,7 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
       if (${eval1.isNull} || ${eval2.isNull} || $test) {
         ${ev.isNull} = true;
       } else {
-        ${ev.primitive} = ${eval1.primitive}$method(${eval2.primitive});
+        ${ev.primitive} = ($javaType) (${eval1.primitive}$method(${eval2.primitive}));
       }
       """
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
index 5ff1bca260b24..3f4843259e80b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.scalatest.Matchers._
-
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types.{Decimal, DoubleType, IntegerType}
@@ -26,100 +24,103 @@ import org.apache.spark.sql.types.{Decimal, DoubleType, IntegerType}
 
 class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
 
-  test("arithmetic") {
-    val row = create_row(1, 2, 3, null)
-    val c1 = 'a.int.at(0)
-    val c2 = 'a.int.at(1)
-    val c3 = 'a.int.at(2)
-    val c4 = 'a.int.at(3)
-
-    checkEvaluation(UnaryMinus(c1), -1, row)
-    checkEvaluation(UnaryMinus(Literal.create(100, IntegerType)), -100)
-
-    checkEvaluation(Add(c1, c4), null, row)
-    checkEvaluation(Add(c1, c2), 3, row)
-    checkEvaluation(Add(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(Add(Literal.create(null, IntegerType), c2), null, row)
-    checkEvaluation(
-      Add(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(-c1, -1, row)
-    checkEvaluation(c1 + c2, 3, row)
-    checkEvaluation(c1 - c2, -1, row)
-    checkEvaluation(c1 * c2, 2, row)
-    checkEvaluation(c1 / c2, 0, row)
-    checkEvaluation(c1 % c2, 1, row)
+  /**
+   * Runs through the testFunc for all numeric data types.
+   *
+   * @param testFunc a test function that accepts a conversion function to convert an integer
+   *                 into another data type.
+   */
+  private def testNumericDataTypes(testFunc: (Int => Any) => Unit): Unit = {
+    testFunc(_.toByte)
+    testFunc(_.toShort)
+    testFunc(identity)
+    testFunc(_.toLong)
+    testFunc(_.toFloat)
+    testFunc(_.toDouble)
+    testFunc(Decimal(_))
   }
 
-  test("fractional arithmetic") {
-    val row = create_row(1.1, 2.0, 3.1, null)
-    val c1 = 'a.double.at(0)
-    val c2 = 'a.double.at(1)
-    val c3 = 'a.double.at(2)
-    val c4 = 'a.double.at(3)
-
-    checkEvaluation(UnaryMinus(c1), -1.1, row)
-    checkEvaluation(UnaryMinus(Literal.create(100.0, DoubleType)), -100.0)
-    checkEvaluation(Add(c1, c4), null, row)
-    checkEvaluation(Add(c1, c2), 3.1, row)
-    checkEvaluation(Add(c1, Literal.create(null, DoubleType)), null, row)
-    checkEvaluation(Add(Literal.create(null, DoubleType), c2), null, row)
-    checkEvaluation(
-      Add(Literal.create(null, DoubleType), Literal.create(null, DoubleType)), null, row)
-
-    checkEvaluation(-c1, -1.1, row)
-    checkEvaluation(c1 + c2, 3.1, row)
-    checkDoubleEvaluation(c1 - c2, (-0.9 +- 0.001), row)
-    checkDoubleEvaluation(c1 * c2, (2.2 +- 0.001), row)
-    checkDoubleEvaluation(c1 / c2, (0.55 +- 0.001), row)
-    checkDoubleEvaluation(c3 % c2, (1.1 +- 0.001), row)
+  test("+ (Add)") {
+    testNumericDataTypes { convert =>
+      val left = Literal(convert(1))
+      val right = Literal(convert(2))
+      checkEvaluation(Add(left, right), convert(3))
+      checkEvaluation(Add(Literal.create(null, left.dataType), right), null)
+      checkEvaluation(Add(left, Literal.create(null, right.dataType)), null)
+    }
   }
 
-  test("Abs") {
-    def testAbs(convert: (Int) => Any): Unit = {
-      checkEvaluation(Abs(Literal(convert(0))), convert(0))
-      checkEvaluation(Abs(Literal(convert(1))), convert(1))
-      checkEvaluation(Abs(Literal(convert(-1))), convert(1))
+  test("- (UnaryMinus)") {
+    testNumericDataTypes { convert =>
+      val input = Literal(convert(1))
+      val dataType = input.dataType
+      checkEvaluation(UnaryMinus(input), convert(-1))
+      checkEvaluation(UnaryMinus(Literal.create(null, dataType)), null)
     }
-    testAbs(_.toByte)
-    testAbs(_.toShort)
-    testAbs(identity)
-    testAbs(_.toLong)
-    testAbs(_.toFloat)
-    testAbs(_.toDouble)
-    testAbs(Decimal(_))
   }
 
-  test("Divide") {
-    checkEvaluation(Divide(Literal(2), Literal(1)), 2)
-    checkEvaluation(Divide(Literal(1.0), Literal(2.0)), 0.5)
+  test("- (Minus)") {
+    testNumericDataTypes { convert =>
+      val left = Literal(convert(1))
+      val right = Literal(convert(2))
+      checkEvaluation(Subtract(left, right), convert(-1))
+      checkEvaluation(Subtract(Literal.create(null, left.dataType), right), null)
+      checkEvaluation(Subtract(left, Literal.create(null, right.dataType)), null)
+    }
+  }
+
+  test("* (Multiply)") {
+    testNumericDataTypes { convert =>
+      val left = Literal(convert(1))
+      val right = Literal(convert(2))
+      checkEvaluation(Multiply(left, right), convert(2))
+      checkEvaluation(Multiply(Literal.create(null, left.dataType), right), null)
+      checkEvaluation(Multiply(left, Literal.create(null, right.dataType)), null)
+    }
+  }
+
+  test("/ (Divide) basic") {
+    testNumericDataTypes { convert =>
+      val left = Literal(convert(2))
+      val right = Literal(convert(1))
+      val dataType = left.dataType
+      checkEvaluation(Divide(left, right), convert(2))
+      checkEvaluation(Divide(Literal.create(null, dataType), right), null)
+      checkEvaluation(Divide(left, Literal.create(null, right.dataType)), null)
+      checkEvaluation(Divide(left, Literal(convert(0))), null)  // divide by zero
+    }
+  }
+
+  test("/ (Divide) for integral type") {
+    checkEvaluation(Divide(Literal(1.toByte), Literal(2.toByte)), 0.toByte)
+    checkEvaluation(Divide(Literal(1.toShort), Literal(2.toShort)), 0.toShort)
     checkEvaluation(Divide(Literal(1), Literal(2)), 0)
-    checkEvaluation(Divide(Literal(1), Literal(0)), null)
-    checkEvaluation(Divide(Literal(1.0), Literal(0.0)), null)
-    checkEvaluation(Divide(Literal(0.0), Literal(0.0)), null)
-    checkEvaluation(Divide(Literal(0), Literal.create(null, IntegerType)), null)
-    checkEvaluation(Divide(Literal(1), Literal.create(null, IntegerType)), null)
-    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal(0)), null)
-    checkEvaluation(Divide(Literal.create(null, DoubleType), Literal(0.0)), null)
-    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal(1)), null)
-    checkEvaluation(Divide(Literal.create(null, IntegerType), Literal.create(null, IntegerType)),
-      null)
+    checkEvaluation(Divide(Literal(1.toLong), Literal(2.toLong)), 0.toLong)
   }
 
-  test("Remainder") {
-    checkEvaluation(Remainder(Literal(2), Literal(1)), 0)
-    checkEvaluation(Remainder(Literal(1.0), Literal(2.0)), 1.0)
-    checkEvaluation(Remainder(Literal(1), Literal(2)), 1)
-    checkEvaluation(Remainder(Literal(1), Literal(0)), null)
-    checkEvaluation(Remainder(Literal(1.0), Literal(0.0)), null)
-    checkEvaluation(Remainder(Literal(0.0), Literal(0.0)), null)
-    checkEvaluation(Remainder(Literal(0), Literal.create(null, IntegerType)), null)
-    checkEvaluation(Remainder(Literal(1), Literal.create(null, IntegerType)), null)
-    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal(0)), null)
-    checkEvaluation(Remainder(Literal.create(null, DoubleType), Literal(0.0)), null)
-    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal(1)), null)
-    checkEvaluation(Remainder(Literal.create(null, IntegerType), Literal.create(null, IntegerType)),
-      null)
+  test("/ (Divide) for floating point") {
+    checkEvaluation(Divide(Literal(1.0f), Literal(2.0f)), 0.5f)
+    checkEvaluation(Divide(Literal(1.0), Literal(2.0)), 0.5)
+    checkEvaluation(Divide(Literal(Decimal(1.0)), Literal(Decimal(2.0))), Decimal(0.5))
+  }
+
+  test("% (Remainder)") {
+    testNumericDataTypes { convert =>
+      val left = Literal(convert(1))
+      val right = Literal(convert(2))
+      checkEvaluation(Remainder(left, right), convert(1))
+      checkEvaluation(Remainder(Literal.create(null, left.dataType), right), null)
+      checkEvaluation(Remainder(left, Literal.create(null, right.dataType)), null)
+      checkEvaluation(Remainder(left, Literal(convert(0))), null)  // mod by 0
+    }
+  }
+
+  test("Abs") {
+    testNumericDataTypes { convert =>
+      checkEvaluation(Abs(Literal(convert(0))), convert(0))
+      checkEvaluation(Abs(Literal(convert(1))), convert(1))
+      checkEvaluation(Abs(Literal(convert(-1))), convert(1))
+    }
   }
 
   test("MaxOf") {

From f3f2a4397da164f0ddfa5d60bf441099296c4346 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Sun, 14 Jun 2015 11:41:16 -0700
Subject: [PATCH 484/525] fix read/write mixup

Author: Peter Hoffmann <ph@peter-hoffmann.com>

Closes #6815 from hoffmann/patch-1 and squashes the following commits:

2abb6da [Peter Hoffmann] fix read/write mixup
---
 docs/sql-programming-guide.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index c5ab074e4439f..7fed1bf8829f5 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -995,11 +995,11 @@ List<String> teenagerNames = teenagers.javaRDD().map(new Function<Row, String>()
 schemaPeople # The DataFrame from the previous example.
 
 # DataFrames can be saved as Parquet files, maintaining the schema information.
-schemaPeople.read.parquet("people.parquet")
+schemaPeople.write.parquet("people.parquet")
 
 # Read in the Parquet file created above.  Parquet files are self-describing so the schema is preserved.
 # The result of loading a parquet file is also a DataFrame.
-parquetFile = sqlContext.write.parquet("people.parquet")
+parquetFile = sqlContext.read.parquet("people.parquet")
 
 # Parquet files can also be registered as tables and then used in SQL statements.
 parquetFile.registerTempTable("parquetFile");

From 4eb48ed1dadee80d78ada5d15884dd348c46ad27 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Sun, 14 Jun 2015 11:49:16 -0700
Subject: [PATCH 485/525] [SPARK-8065] [SQL] Add support for Hive 0.14
 metastores

This change has two parts.

The first one gets rid of "ReflectionMagic". That worked well for the differences between 0.12 and
0.13, but breaks in 0.14, since some of the APIs that need to be used have primitive types. I could
not figure out a way to make that class work with primitive types. So instead I wrote some shims
 (I can already hear the collective sigh) that find the appropriate methods via reflection. This should
be faster since the method instances are cached, and the code is not much uglier than before,
with the advantage that all the ugliness is local to one file (instead of multiple switch statements on
the version being used scattered in ClientWrapper).

The second part is simple: add code to handle Hive 0.14. A few new methods had to be added
to the new shims.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6627 from vanzin/SPARK-8065 and squashes the following commits:

3fa4270 [Marcelo Vanzin] Indentation style.
4b8a3d4 [Marcelo Vanzin] Fix dep exclusion.
be3d0cc [Marcelo Vanzin] Merge branch 'master' into SPARK-8065
ca3fb1e [Marcelo Vanzin] Merge branch 'master' into SPARK-8065
b43f13e [Marcelo Vanzin] Since exclusions seem to work, clean up some of the code.
73bd161 [Marcelo Vanzin] Botched merge.
d2ddf01 [Marcelo Vanzin] Comment about excluded dep.
0c929d1 [Marcelo Vanzin] Merge branch 'master' into SPARK-8065
2c3c02e [Marcelo Vanzin] Try to fix tests by adding support for exclusions.
0a03470 [Marcelo Vanzin] Try to fix tests by upgrading calcite dependency.
13b2dfa [Marcelo Vanzin] Fix NPE.
6439d88 [Marcelo Vanzin] Minor style thing.
69b017b [Marcelo Vanzin] Style.
a21cad8 [Marcelo Vanzin] Part II: Add shims / version for Hive 0.14.
ae98c87 [Marcelo Vanzin] PART I: Get rid of reflection magic.
---
 .../org/apache/spark/deploy/SparkSubmit.scala |  33 +-
 .../spark/deploy/SparkSubmitUtilsSuite.scala  |  16 +-
 .../spark/sql/hive/client/ClientWrapper.scala |  86 ++---
 .../spark/sql/hive/client/HiveShim.scala      | 349 ++++++++++++++++++
 .../hive/client/IsolatedClientLoader.scala    |  39 +-
 .../sql/hive/client/ReflectionMagic.scala     | 208 -----------
 .../spark/sql/hive/client/package.scala       |  24 +-
 .../spark/sql/hive/client/VersionsSuite.scala |   2 +-
 8 files changed, 444 insertions(+), 313 deletions(-)
 create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
 delete mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index b8978e25a02d2..cfcc6d355801e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -832,11 +832,7 @@ private[spark] object SparkSubmitUtils {
       ivyConfName: String,
       md: DefaultModuleDescriptor): Unit = {
     // Add scala exclusion rule
-    val scalaArtifacts = new ArtifactId(new ModuleId("*", "scala-library"), "*", "*", "*")
-    val scalaDependencyExcludeRule =
-      new DefaultExcludeRule(scalaArtifacts, ivySettings.getMatcher("glob"), null)
-    scalaDependencyExcludeRule.addConfiguration(ivyConfName)
-    md.addExcludeRule(scalaDependencyExcludeRule)
+    md.addExcludeRule(createExclusion("*:scala-library:*", ivySettings, ivyConfName))
 
     // We need to specify each component explicitly, otherwise we miss spark-streaming-kafka and
     // other spark-streaming utility components. Underscore is there to differentiate between
@@ -845,13 +841,8 @@ private[spark] object SparkSubmitUtils {
       "sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_")
 
     components.foreach { comp =>
-      val sparkArtifacts =
-        new ArtifactId(new ModuleId("org.apache.spark", s"spark-$comp*"), "*", "*", "*")
-      val sparkDependencyExcludeRule =
-        new DefaultExcludeRule(sparkArtifacts, ivySettings.getMatcher("glob"), null)
-      sparkDependencyExcludeRule.addConfiguration(ivyConfName)
-
-      md.addExcludeRule(sparkDependencyExcludeRule)
+      md.addExcludeRule(createExclusion(s"org.apache.spark:spark-$comp*:*", ivySettings,
+        ivyConfName))
     }
   }
 
@@ -864,6 +855,7 @@ private[spark] object SparkSubmitUtils {
    * @param coordinates Comma-delimited string of maven coordinates
    * @param remoteRepos Comma-delimited string of remote repositories other than maven central
    * @param ivyPath The path to the local ivy repository
+   * @param exclusions Exclusions to apply when resolving transitive dependencies
    * @return The comma-delimited path to the jars of the given maven artifacts including their
    *         transitive dependencies
    */
@@ -871,6 +863,7 @@ private[spark] object SparkSubmitUtils {
       coordinates: String,
       remoteRepos: Option[String],
       ivyPath: Option[String],
+      exclusions: Seq[String] = Nil,
       isTest: Boolean = false): String = {
     if (coordinates == null || coordinates.trim.isEmpty) {
       ""
@@ -928,6 +921,10 @@ private[spark] object SparkSubmitUtils {
         // add all supplied maven artifacts as dependencies
         addDependenciesToIvy(md, artifacts, ivyConfName)
 
+        exclusions.foreach { e =>
+          md.addExcludeRule(createExclusion(e + ":*", ivySettings, ivyConfName))
+        }
+
         // resolve dependencies
         val rr: ResolveReport = ivy.resolve(md, resolveOptions)
         if (rr.hasError) {
@@ -944,6 +941,18 @@ private[spark] object SparkSubmitUtils {
       }
     }
   }
+
+  private def createExclusion(
+      coords: String,
+      ivySettings: IvySettings,
+      ivyConfName: String): ExcludeRule = {
+    val c = extractMavenCoordinates(coords)(0)
+    val id = new ArtifactId(new ModuleId(c.groupId, c.artifactId), "*", "*", "*")
+    val rule = new DefaultExcludeRule(id, ivySettings.getMatcher("glob"), null)
+    rule.addConfiguration(ivyConfName)
+    rule
+  }
+
 }
 
 /**
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index 07d261cc428c4..3a8da9fb9ea17 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -106,7 +106,7 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
     IvyTestUtils.withRepository(main, None, None) { repo =>
       // end to end
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, Option(repo),
-        Option(tempIvyPath), true)
+        Option(tempIvyPath), isTest = true)
       assert(jarPath.indexOf(tempIvyPath) >= 0, "should use non-default ivy path")
     }
   }
@@ -115,21 +115,23 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
     val main = new MavenCoordinate("my.awesome.lib", "mylib", "0.1")
     // Local M2 repository
     IvyTestUtils.withRepository(main, None, Some(SparkSubmitUtils.m2Path)) { repo =>
-      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None, None, true)
+      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None, None,
+        isTest = true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
     }
     // Local Ivy Repository
     val settings = new IvySettings
     val ivyLocal = new File(settings.getDefaultIvyUserDir, "local" + File.separator)
     IvyTestUtils.withRepository(main, None, Some(ivyLocal), true) { repo =>
-      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None, None, true)
+      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None, None,
+        isTest = true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
     }
     // Local ivy repository with modified home
     val dummyIvyLocal = new File(tempIvyPath, "local" + File.separator)
     IvyTestUtils.withRepository(main, None, Some(dummyIvyLocal), true) { repo =>
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None,
-        Some(tempIvyPath), true)
+        Some(tempIvyPath), isTest = true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
       assert(jarPath.indexOf(tempIvyPath) >= 0, "should be in new ivy path")
     }
@@ -137,7 +139,7 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
 
   test("dependency not found throws RuntimeException") {
     intercept[RuntimeException] {
-      SparkSubmitUtils.resolveMavenCoordinates("a:b:c", None, None, true)
+      SparkSubmitUtils.resolveMavenCoordinates("a:b:c", None, None, isTest = true)
     }
   }
 
@@ -149,12 +151,12 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
       components.map(comp => s"org.apache.spark:spark-${comp}2.10:1.2.0").mkString(",") +
       ",org.apache.spark:spark-core_fake:1.2.0"
 
-    val path = SparkSubmitUtils.resolveMavenCoordinates(coordinates, None, None, true)
+    val path = SparkSubmitUtils.resolveMavenCoordinates(coordinates, None, None, isTest = true)
     assert(path === "", "should return empty path")
     val main = MavenCoordinate("org.apache.spark", "spark-streaming-kafka-assembly_2.10", "1.2.0")
     IvyTestUtils.withRepository(main, None, None) { repo =>
       val files = SparkSubmitUtils.resolveMavenCoordinates(coordinates + "," + main.toString,
-        Some(repo), None, true)
+        Some(repo), None, isTest = true)
       assert(files.indexOf(main.artifactId) >= 0, "Did not return artifact")
     }
   }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index 99aa0f1ded3f8..0fcba65ca6129 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -27,7 +27,7 @@ import scala.language.reflectiveCalls
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.hive.metastore.api.Database
 import org.apache.hadoop.hive.conf.HiveConf
-import org.apache.hadoop.hive.metastore.TableType
+import org.apache.hadoop.hive.metastore.{TableType => HTableType}
 import org.apache.hadoop.hive.metastore.api
 import org.apache.hadoop.hive.metastore.api.FieldSchema
 import org.apache.hadoop.hive.ql.metadata
@@ -59,8 +59,7 @@ private[hive] class ClientWrapper(
     version: HiveVersion,
     config: Map[String, String])
   extends ClientInterface
-  with Logging
-  with ReflectionMagic {
+  with Logging {
 
   // Circular buffer to hold what hive prints to STDOUT and ERR.  Only printed when failures occur.
   private val outputBuffer = new java.io.OutputStream {
@@ -90,6 +89,12 @@ private[hive] class ClientWrapper(
     }
   }
 
+  private val shim = version match {
+    case hive.v12 => new Shim_v0_12()
+    case hive.v13 => new Shim_v0_13()
+    case hive.v14 => new Shim_v0_14()
+  }
+
   val state = {
     val original = Thread.currentThread().getContextClassLoader
     Thread.currentThread().setContextClassLoader(getClass.getClassLoader)
@@ -128,14 +133,7 @@ private[hive] class ClientWrapper(
     val original = Thread.currentThread().getContextClassLoader
     Thread.currentThread().setContextClassLoader(getClass.getClassLoader)
     Hive.set(client)
-    version match {
-      case hive.v12 =>
-        classOf[SessionState]
-          .callStatic[SessionState, SessionState]("start", state)
-      case hive.v13 =>
-        classOf[SessionState]
-          .callStatic[SessionState, SessionState]("setCurrentSessionState", state)
-    }
+    shim.setCurrentSessionState(state)
     val ret = try f finally {
       Thread.currentThread().setContextClassLoader(original)
     }
@@ -193,15 +191,12 @@ private[hive] class ClientWrapper(
         properties = h.getParameters.toMap,
         serdeProperties = h.getTTable.getSd.getSerdeInfo.getParameters.toMap,
         tableType = h.getTableType match {
-          case TableType.MANAGED_TABLE => ManagedTable
-          case TableType.EXTERNAL_TABLE => ExternalTable
-          case TableType.VIRTUAL_VIEW => VirtualView
-          case TableType.INDEX_TABLE => IndexTable
-        },
-        location = version match {
-          case hive.v12 => Option(h.call[URI]("getDataLocation")).map(_.toString)
-          case hive.v13 => Option(h.call[Path]("getDataLocation")).map(_.toString)
+          case HTableType.MANAGED_TABLE => ManagedTable
+          case HTableType.EXTERNAL_TABLE => ExternalTable
+          case HTableType.VIRTUAL_VIEW => VirtualView
+          case HTableType.INDEX_TABLE => IndexTable
         },
+        location = shim.getDataLocation(h),
         inputFormat = Option(h.getInputFormatClass).map(_.getName),
         outputFormat = Option(h.getOutputFormatClass).map(_.getName),
         serde = Option(h.getSerializationLib),
@@ -231,14 +226,7 @@ private[hive] class ClientWrapper(
     // set create time
     qlTable.setCreateTime((System.currentTimeMillis() / 1000).asInstanceOf[Int])
 
-    version match {
-      case hive.v12 =>
-        table.location.map(new URI(_)).foreach(u => qlTable.call[URI, Unit]("setDataLocation", u))
-      case hive.v13 =>
-        table.location
-          .map(new org.apache.hadoop.fs.Path(_))
-          .foreach(qlTable.call[Path, Unit]("setDataLocation", _))
-    }
+    table.location.foreach { loc => shim.setDataLocation(qlTable, loc) }
     table.inputFormat.map(toInputFormat).foreach(qlTable.setInputFormatClass)
     table.outputFormat.map(toOutputFormat).foreach(qlTable.setOutputFormatClass)
     table.serde.foreach(qlTable.setSerializationLib)
@@ -279,13 +267,7 @@ private[hive] class ClientWrapper(
 
   override def getAllPartitions(hTable: HiveTable): Seq[HivePartition] = withHiveState {
     val qlTable = toQlTable(hTable)
-    val qlPartitions = version match {
-      case hive.v12 =>
-        client.call[metadata.Table, JSet[metadata.Partition]]("getAllPartitionsForPruner", qlTable)
-      case hive.v13 =>
-        client.call[metadata.Table, JSet[metadata.Partition]]("getAllPartitionsOf", qlTable)
-    }
-    qlPartitions.toSeq.map(toHivePartition)
+    shim.getAllPartitions(client, qlTable).map(toHivePartition)
   }
 
   override def listTables(dbName: String): Seq[String] = withHiveState {
@@ -315,15 +297,7 @@ private[hive] class ClientWrapper(
       val tokens: Array[String] = cmd_trimmed.split("\\s+")
       // The remainder of the command.
       val cmd_1: String = cmd_trimmed.substring(tokens(0).length()).trim()
-      val proc: CommandProcessor = version match {
-        case hive.v12 =>
-          classOf[CommandProcessorFactory]
-            .callStatic[String, HiveConf, CommandProcessor]("get", tokens(0), conf)
-        case hive.v13 =>
-          classOf[CommandProcessorFactory]
-            .callStatic[Array[String], HiveConf, CommandProcessor]("get", Array(tokens(0)), conf)
-      }
-
+      val proc = shim.getCommandProcessor(tokens(0), conf)
       proc match {
         case driver: Driver =>
           val response: CommandProcessorResponse = driver.run(cmd)
@@ -334,21 +308,7 @@ private[hive] class ClientWrapper(
           }
           driver.setMaxRows(maxRows)
 
-          val results = version match {
-            case hive.v12 =>
-              val res = new JArrayList[String]
-              driver.call[JArrayList[String], Boolean]("getResults", res)
-              res.toSeq
-            case hive.v13 =>
-              val res = new JArrayList[Object]
-              driver.call[JList[Object], Boolean]("getResults", res)
-              res.map { r =>
-                r match {
-                  case s: String => s
-                  case a: Array[Object] => a(0).asInstanceOf[String]
-                }
-              }
-          }
+          val results = shim.getDriverResults(driver)
           driver.close()
           results
 
@@ -382,8 +342,8 @@ private[hive] class ClientWrapper(
       holdDDLTime: Boolean,
       inheritTableSpecs: Boolean,
       isSkewedStoreAsSubdir: Boolean): Unit = withHiveState {
-
-    client.loadPartition(
+    shim.loadPartition(
+      client,
       new Path(loadPath), // TODO: Use URI
       tableName,
       partSpec,
@@ -398,7 +358,8 @@ private[hive] class ClientWrapper(
       tableName: String,
       replace: Boolean,
       holdDDLTime: Boolean): Unit = withHiveState {
-    client.loadTable(
+    shim.loadTable(
+      client,
       new Path(loadPath),
       tableName,
       replace,
@@ -413,7 +374,8 @@ private[hive] class ClientWrapper(
       numDP: Int,
       holdDDLTime: Boolean,
       listBucketingEnabled: Boolean): Unit = withHiveState {
-    client.loadDynamicPartitions(
+    shim.loadDynamicPartitions(
+      client,
       new Path(loadPath),
       tableName,
       partSpec,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
new file mode 100644
index 0000000000000..40c167926c8d6
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.client
+
+import java.lang.{Boolean => JBoolean, Integer => JInteger}
+import java.lang.reflect.{Method, Modifier}
+import java.net.URI
+import java.util.{ArrayList => JArrayList, List => JList, Map => JMap, Set => JSet}
+
+import scala.collection.JavaConversions._
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.conf.HiveConf
+import org.apache.hadoop.hive.ql.Driver
+import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table}
+import org.apache.hadoop.hive.ql.processors.{CommandProcessor, CommandProcessorFactory}
+import org.apache.hadoop.hive.ql.session.SessionState
+
+/**
+ * A shim that defines the interface between ClientWrapper and the underlying Hive library used to
+ * talk to the metastore. Each Hive version has its own implementation of this class, defining
+ * version-specific version of needed functions.
+ *
+ * The guideline for writing shims is:
+ * - always extend from the previous version unless really not possible
+ * - initialize methods in lazy vals, both for quicker access for multiple invocations, and to
+ *   avoid runtime errors due to the above guideline.
+ */
+private[client] sealed abstract class Shim {
+
+  def setCurrentSessionState(state: SessionState): Unit
+
+  /**
+   * This shim is necessary because the return type is different on different versions of Hive.
+   * All parameters are the same, though.
+   */
+  def getDataLocation(table: Table): Option[String]
+
+  def setDataLocation(table: Table, loc: String): Unit
+
+  def getAllPartitions(hive: Hive, table: Table): Seq[Partition]
+
+  def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor
+
+  def getDriverResults(driver: Driver): Seq[String]
+
+  def loadPartition(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      partSpec: JMap[String, String],
+      replace: Boolean,
+      holdDDLTime: Boolean,
+      inheritTableSpecs: Boolean,
+      isSkewedStoreAsSubdir: Boolean): Unit
+
+  def loadTable(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      replace: Boolean,
+      holdDDLTime: Boolean): Unit
+
+  def loadDynamicPartitions(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      partSpec: JMap[String, String],
+      replace: Boolean,
+      numDP: Int,
+      holdDDLTime: Boolean,
+      listBucketingEnabled: Boolean): Unit
+
+  protected def findStaticMethod(klass: Class[_], name: String, args: Class[_]*): Method = {
+    val method = findMethod(klass, name, args: _*)
+    require(Modifier.isStatic(method.getModifiers()),
+      s"Method $name of class $klass is not static.")
+    method
+  }
+
+  protected def findMethod(klass: Class[_], name: String, args: Class[_]*): Method = {
+    klass.getMethod(name, args: _*)
+  }
+
+}
+
+private[client] class Shim_v0_12 extends Shim {
+
+  private lazy val startMethod =
+    findStaticMethod(
+      classOf[SessionState],
+      "start",
+      classOf[SessionState])
+  private lazy val getDataLocationMethod = findMethod(classOf[Table], "getDataLocation")
+  private lazy val setDataLocationMethod =
+    findMethod(
+      classOf[Table],
+      "setDataLocation",
+      classOf[URI])
+  private lazy val getAllPartitionsMethod =
+    findMethod(
+      classOf[Hive],
+      "getAllPartitionsForPruner",
+      classOf[Table])
+  private lazy val getCommandProcessorMethod =
+    findStaticMethod(
+      classOf[CommandProcessorFactory],
+      "get",
+      classOf[String],
+      classOf[HiveConf])
+  private lazy val getDriverResultsMethod =
+    findMethod(
+      classOf[Driver],
+      "getResults",
+      classOf[JArrayList[String]])
+  private lazy val loadPartitionMethod =
+    findMethod(
+      classOf[Hive],
+      "loadPartition",
+      classOf[Path],
+      classOf[String],
+      classOf[JMap[String, String]],
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE)
+  private lazy val loadTableMethod =
+    findMethod(
+      classOf[Hive],
+      "loadTable",
+      classOf[Path],
+      classOf[String],
+      JBoolean.TYPE,
+      JBoolean.TYPE)
+  private lazy val loadDynamicPartitionsMethod =
+    findMethod(
+      classOf[Hive],
+      "loadDynamicPartitions",
+      classOf[Path],
+      classOf[String],
+      classOf[JMap[String, String]],
+      JBoolean.TYPE,
+      JInteger.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE)
+
+  override def setCurrentSessionState(state: SessionState): Unit = startMethod.invoke(null, state)
+
+  override def getDataLocation(table: Table): Option[String] =
+    Option(getDataLocationMethod.invoke(table)).map(_.toString())
+
+  override def setDataLocation(table: Table, loc: String): Unit =
+    setDataLocationMethod.invoke(table, new URI(loc))
+
+  override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
+    getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
+
+  override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor =
+    getCommandProcessorMethod.invoke(null, token, conf).asInstanceOf[CommandProcessor]
+
+  override def getDriverResults(driver: Driver): Seq[String] = {
+    val res = new JArrayList[String]()
+    getDriverResultsMethod.invoke(driver, res)
+    res.toSeq
+  }
+
+  override def loadPartition(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      partSpec: JMap[String, String],
+      replace: Boolean,
+      holdDDLTime: Boolean,
+      inheritTableSpecs: Boolean,
+      isSkewedStoreAsSubdir: Boolean): Unit = {
+    loadPartitionMethod.invoke(hive, loadPath, tableName, partSpec, replace: JBoolean,
+      holdDDLTime: JBoolean, inheritTableSpecs: JBoolean, isSkewedStoreAsSubdir: JBoolean)
+  }
+
+  override def loadTable(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      replace: Boolean,
+      holdDDLTime: Boolean): Unit = {
+    loadTableMethod.invoke(hive, loadPath, tableName, replace: JBoolean, holdDDLTime: JBoolean)
+  }
+
+  override def loadDynamicPartitions(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      partSpec: JMap[String, String],
+      replace: Boolean,
+      numDP: Int,
+      holdDDLTime: Boolean,
+      listBucketingEnabled: Boolean): Unit = {
+    loadDynamicPartitionsMethod.invoke(hive, loadPath, tableName, partSpec, replace: JBoolean,
+      numDP: JInteger, holdDDLTime: JBoolean, listBucketingEnabled: JBoolean)
+  }
+
+}
+
+private[client] class Shim_v0_13 extends Shim_v0_12 {
+
+  private lazy val setCurrentSessionStateMethod =
+    findStaticMethod(
+      classOf[SessionState],
+      "setCurrentSessionState",
+      classOf[SessionState])
+  private lazy val setDataLocationMethod =
+    findMethod(
+      classOf[Table],
+      "setDataLocation",
+      classOf[Path])
+  private lazy val getAllPartitionsMethod =
+    findMethod(
+      classOf[Hive],
+      "getAllPartitionsOf",
+      classOf[Table])
+  private lazy val getCommandProcessorMethod =
+    findStaticMethod(
+      classOf[CommandProcessorFactory],
+      "get",
+      classOf[Array[String]],
+      classOf[HiveConf])
+  private lazy val getDriverResultsMethod =
+    findMethod(
+      classOf[Driver],
+      "getResults",
+      classOf[JList[Object]])
+
+  override def setCurrentSessionState(state: SessionState): Unit =
+    setCurrentSessionStateMethod.invoke(null, state)
+
+  override def setDataLocation(table: Table, loc: String): Unit =
+    setDataLocationMethod.invoke(table, new Path(loc))
+
+  override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
+    getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
+
+  override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor =
+    getCommandProcessorMethod.invoke(null, Array(token), conf).asInstanceOf[CommandProcessor]
+
+  override def getDriverResults(driver: Driver): Seq[String] = {
+    val res = new JArrayList[Object]()
+    getDriverResultsMethod.invoke(driver, res)
+    res.map { r =>
+      r match {
+        case s: String => s
+        case a: Array[Object] => a(0).asInstanceOf[String]
+      }
+    }
+  }
+
+}
+
+private[client] class Shim_v0_14 extends Shim_v0_13 {
+
+  private lazy val loadPartitionMethod =
+    findMethod(
+      classOf[Hive],
+      "loadPartition",
+      classOf[Path],
+      classOf[String],
+      classOf[JMap[String, String]],
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE)
+  private lazy val loadTableMethod =
+    findMethod(
+      classOf[Hive],
+      "loadTable",
+      classOf[Path],
+      classOf[String],
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE)
+  private lazy val loadDynamicPartitionsMethod =
+    findMethod(
+      classOf[Hive],
+      "loadDynamicPartitions",
+      classOf[Path],
+      classOf[String],
+      classOf[JMap[String, String]],
+      JBoolean.TYPE,
+      JInteger.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE,
+      JBoolean.TYPE)
+
+  override def loadPartition(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      partSpec: JMap[String, String],
+      replace: Boolean,
+      holdDDLTime: Boolean,
+      inheritTableSpecs: Boolean,
+      isSkewedStoreAsSubdir: Boolean): Unit = {
+    loadPartitionMethod.invoke(hive, loadPath, tableName, partSpec, replace: JBoolean,
+      holdDDLTime: JBoolean, inheritTableSpecs: JBoolean, isSkewedStoreAsSubdir: JBoolean,
+      JBoolean.TRUE, JBoolean.FALSE)
+  }
+
+  override def loadTable(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      replace: Boolean,
+      holdDDLTime: Boolean): Unit = {
+    loadTableMethod.invoke(hive, loadPath, tableName, replace: JBoolean, holdDDLTime: JBoolean,
+      JBoolean.TRUE, JBoolean.FALSE, JBoolean.FALSE)
+  }
+
+  override def loadDynamicPartitions(
+      hive: Hive,
+      loadPath: Path,
+      tableName: String,
+      partSpec: JMap[String, String],
+      replace: Boolean,
+      numDP: Int,
+      holdDDLTime: Boolean,
+      listBucketingEnabled: Boolean): Unit = {
+    loadDynamicPartitionsMethod.invoke(hive, loadPath, tableName, partSpec, replace: JBoolean,
+      numDP: JInteger, holdDDLTime: JBoolean, listBucketingEnabled: JBoolean, JBoolean.FALSE)
+  }
+
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index 16851fdd71a98..69cfc5c3c3216 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.hive.client
 
 import java.io.File
+import java.lang.reflect.InvocationTargetException
 import java.net.{URL, URLClassLoader}
 import java.util
 
@@ -28,6 +29,7 @@ import org.apache.commons.io.{FileUtils, IOUtils}
 
 import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkSubmitUtils
+import org.apache.spark.util.Utils
 
 import org.apache.spark.sql.catalyst.util.quietly
 import org.apache.spark.sql.hive.HiveContext
@@ -48,29 +50,27 @@ private[hive] object IsolatedClientLoader {
   def hiveVersion(version: String): HiveVersion = version match {
     case "12" | "0.12" | "0.12.0" => hive.v12
     case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
+    case "14" | "0.14" | "0.14.0" => hive.v14
   }
 
   private def downloadVersion(version: HiveVersion): Seq[URL] = {
-    val hiveArtifacts =
-      (Seq("hive-metastore", "hive-exec", "hive-common", "hive-serde") ++
-        (if (version.hasBuiltinsJar) "hive-builtins" :: Nil else Nil))
-        .map(a => s"org.apache.hive:$a:${version.fullVersion}") :+
-        "com.google.guava:guava:14.0.1" :+
-        "org.apache.hadoop:hadoop-client:2.4.0"
+    val hiveArtifacts = version.extraDeps ++
+      Seq("hive-metastore", "hive-exec", "hive-common", "hive-serde")
+        .map(a => s"org.apache.hive:$a:${version.fullVersion}") ++
+      Seq("com.google.guava:guava:14.0.1",
+        "org.apache.hadoop:hadoop-client:2.4.0")
 
     val classpath = quietly {
       SparkSubmitUtils.resolveMavenCoordinates(
         hiveArtifacts.mkString(","),
         Some("http://www.datanucleus.org/downloads/maven2"),
-        None)
+        None,
+        exclusions = version.exclusions)
     }
     val allFiles = classpath.split(",").map(new File(_)).toSet
 
     // TODO: Remove copy logic.
-    val tempDir = File.createTempFile("hive", "v" + version.toString)
-    tempDir.delete()
-    tempDir.mkdir()
-
+    val tempDir = Utils.createTempDir(namePrefix = s"hive-${version}")
     allFiles.foreach(f => FileUtils.copyFileToDirectory(f, tempDir))
     tempDir.listFiles().map(_.toURL)
   }
@@ -129,7 +129,7 @@ private[hive] class IsolatedClientLoader(
   /** True if `name` refers to a spark class that must see specific version of Hive. */
   protected def isBarrierClass(name: String): Boolean =
     name.startsWith(classOf[ClientWrapper].getName) ||
-    name.startsWith(classOf[ReflectionMagic].getName) ||
+    name.startsWith(classOf[Shim].getName) ||
     barrierPrefixes.exists(name.startsWith)
 
   protected def classToPath(name: String): String =
@@ -170,11 +170,16 @@ private[hive] class IsolatedClientLoader(
       .newInstance(version, config)
       .asInstanceOf[ClientInterface]
   } catch {
-    case ReflectionException(cnf: NoClassDefFoundError) =>
-      throw new ClassNotFoundException(
-        s"$cnf when creating Hive client using classpath: ${execJars.mkString(", ")}\n" +
-         "Please make sure that jars for your version of hive and hadoop are included in the " +
-        s"paths passed to ${HiveContext.HIVE_METASTORE_JARS}.")
+    case e: InvocationTargetException =>
+      if (e.getCause().isInstanceOf[NoClassDefFoundError]) {
+        val cnf = e.getCause().asInstanceOf[NoClassDefFoundError]
+        throw new ClassNotFoundException(
+          s"$cnf when creating Hive client using classpath: ${execJars.mkString(", ")}\n" +
+           "Please make sure that jars for your version of hive and hadoop are included in the " +
+          s"paths passed to ${HiveContext.HIVE_METASTORE_JARS}.")
+      } else {
+        throw e
+      }
   } finally {
     Thread.currentThread.setContextClassLoader(baseClassLoader)
   }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala
deleted file mode 100644
index 4d053ae42c2ea..0000000000000
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive.client
-
-import scala.reflect._
-
-/** Unwraps reflection exceptions. */
-private[client] object ReflectionException {
-  def unapply(a: Throwable): Option[Throwable] = a match {
-    case ite: java.lang.reflect.InvocationTargetException => Option(ite.getCause)
-    case _ => None
-  }
-}
-
-/**
- * Provides implicit functions on any object for calling methods reflectively.
- */
-private[client] trait ReflectionMagic {
-    /** code for InstanceMagic
-        println(
-    (1 to 22).map { n =>
-      def repeat(str: String => String) = (1 to n).map(i => str(i.toString)).mkString(", ")
-      val types = repeat(n => s"A$n <: AnyRef : ClassTag")
-      val inArgs = repeat(n => s"a$n: A$n")
-      val erasure = repeat(n => s"classTag[A$n].erasure")
-      val outArgs = repeat(n => s"a$n")
-      s"""|def call[$types, R](name: String, $inArgs): R = {
-         |  clazz.getMethod(name, $erasure).invoke(a, $outArgs).asInstanceOf[R]
-         |}""".stripMargin
-    }.mkString("\n")
-    )
-   */
-
-  // scalastyle:off
-  protected implicit class InstanceMagic(a: Any) {
-    private val clazz = a.getClass
-
-    def call[R](name: String): R = {
-      clazz.getMethod(name).invoke(a).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, R](name: String, a1: A1): R = {
-      clazz.getMethod(name, classTag[A1].erasure).invoke(a, a1).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure).invoke(a, a1, a2).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure).invoke(a, a1, a2, a3).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure).invoke(a, a1, a2, a3, a4).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure).invoke(a, a1, a2, a3, a4, a5).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure).invoke(a, a1, a2, a3, a4, a5, a6).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, A19 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18, a19: A19): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure, classTag[A19].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, A19 <: AnyRef : ClassTag, A20 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18, a19: A19, a20: A20): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure, classTag[A19].erasure, classTag[A20].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, A19 <: AnyRef : ClassTag, A20 <: AnyRef : ClassTag, A21 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18, a19: A19, a20: A20, a21: A21): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure, classTag[A19].erasure, classTag[A20].erasure, classTag[A21].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, a21).asInstanceOf[R]
-    }
-    def call[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, A19 <: AnyRef : ClassTag, A20 <: AnyRef : ClassTag, A21 <: AnyRef : ClassTag, A22 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18, a19: A19, a20: A20, a21: A21, a22: A22): R = {
-      clazz.getMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure, classTag[A19].erasure, classTag[A20].erasure, classTag[A21].erasure, classTag[A22].erasure).invoke(a, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, a21, a22).asInstanceOf[R]
-    }
-  }
-
-  /** code for StaticMagic
-        println(
-    (1 to 22).map { n =>
-      def repeat(str: String => String) = (1 to n).map(i => str(i.toString)).mkString(", ")
-      val types = repeat(n => s"A$n <: AnyRef : ClassTag")
-      val inArgs = repeat(n => s"a$n: A$n")
-      val erasure = repeat(n => s"classTag[A$n].erasure")
-      val outArgs = repeat(n => s"a$n")
-      s"""|def callStatic[$types, R](name: String, $inArgs): R = {
-         |  c.getDeclaredMethod(name, $erasure).invoke(c, $outArgs).asInstanceOf[R]
-         |}""".stripMargin
-    }.mkString("\n")
-    )
-   */
-
-  protected implicit class StaticMagic(c: Class[_]) {
-    def callStatic[A1 <: AnyRef : ClassTag, R](name: String, a1: A1): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure).invoke(c, a1).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure).invoke(c, a1, a2).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure).invoke(c, a1, a2, a3).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure).invoke(c, a1, a2, a3, a4).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure).invoke(c, a1, a2, a3, a4, a5).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure).invoke(c, a1, a2, a3, a4, a5, a6).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, A19 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18, a19: A19): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure, classTag[A19].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, A19 <: AnyRef : ClassTag, A20 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18, a19: A19, a20: A20): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure, classTag[A19].erasure, classTag[A20].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, A19 <: AnyRef : ClassTag, A20 <: AnyRef : ClassTag, A21 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18, a19: A19, a20: A20, a21: A21): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure, classTag[A19].erasure, classTag[A20].erasure, classTag[A21].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, a21).asInstanceOf[R]
-    }
-    def callStatic[A1 <: AnyRef : ClassTag, A2 <: AnyRef : ClassTag, A3 <: AnyRef : ClassTag, A4 <: AnyRef : ClassTag, A5 <: AnyRef : ClassTag, A6 <: AnyRef : ClassTag, A7 <: AnyRef : ClassTag, A8 <: AnyRef : ClassTag, A9 <: AnyRef : ClassTag, A10 <: AnyRef : ClassTag, A11 <: AnyRef : ClassTag, A12 <: AnyRef : ClassTag, A13 <: AnyRef : ClassTag, A14 <: AnyRef : ClassTag, A15 <: AnyRef : ClassTag, A16 <: AnyRef : ClassTag, A17 <: AnyRef : ClassTag, A18 <: AnyRef : ClassTag, A19 <: AnyRef : ClassTag, A20 <: AnyRef : ClassTag, A21 <: AnyRef : ClassTag, A22 <: AnyRef : ClassTag, R](name: String, a1: A1, a2: A2, a3: A3, a4: A4, a5: A5, a6: A6, a7: A7, a8: A8, a9: A9, a10: A10, a11: A11, a12: A12, a13: A13, a14: A14, a15: A15, a16: A16, a17: A17, a18: A18, a19: A19, a20: A20, a21: A21, a22: A22): R = {
-      c.getDeclaredMethod(name, classTag[A1].erasure, classTag[A2].erasure, classTag[A3].erasure, classTag[A4].erasure, classTag[A5].erasure, classTag[A6].erasure, classTag[A7].erasure, classTag[A8].erasure, classTag[A9].erasure, classTag[A10].erasure, classTag[A11].erasure, classTag[A12].erasure, classTag[A13].erasure, classTag[A14].erasure, classTag[A15].erasure, classTag[A16].erasure, classTag[A17].erasure, classTag[A18].erasure, classTag[A19].erasure, classTag[A20].erasure, classTag[A21].erasure, classTag[A22].erasure).invoke(c, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, a21, a22).asInstanceOf[R]
-    }
-  }
-  // scalastyle:on
-}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala
index 410d9881ac214..27a3d8f5896cc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala
@@ -19,15 +19,27 @@ package org.apache.spark.sql.hive
 
 /** Support for interacting with different versions of the HiveMetastoreClient */
 package object client {
-  private[client] abstract class HiveVersion(val fullVersion: String, val hasBuiltinsJar: Boolean)
+  private[client] abstract class HiveVersion(
+      val fullVersion: String,
+      val extraDeps: Seq[String] = Nil,
+      val exclusions: Seq[String] = Nil)
 
   // scalastyle:off
   private[client] object hive {
-    case object v10 extends HiveVersion("0.10.0", true)
-    case object v11 extends HiveVersion("0.11.0", false)
-    case object v12 extends HiveVersion("0.12.0", false)
-    case object v13 extends HiveVersion("0.13.1", false)
+    case object v12 extends HiveVersion("0.12.0")
+    case object v13 extends HiveVersion("0.13.1")
+
+    // Hive 0.14 depends on calcite 0.9.2-incubating-SNAPSHOT which does not exist in
+    // maven central anymore, so override those with a version that exists.
+    //
+    // org.pentaho:pentaho-aggdesigner-algorithm is also nowhere to be found, so exclude
+    // it explicitly. If it's needed by the metastore client, users will have to dig it
+    // out of somewhere and use configuration to point Spark at the correct jars.
+    case object v14 extends HiveVersion("0.14.0",
+      Seq("org.apache.calcite:calcite-core:1.3.0-incubating",
+        "org.apache.calcite:calcite-avatica:1.3.0-incubating"),
+      Seq("org.pentaho:pentaho-aggdesigner-algorithm"))
   }
   // scalastyle:on
 
-}
\ No newline at end of file
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index 7eb4842726665..9a571650b6e25 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -72,7 +72,7 @@ class VersionsSuite extends SparkFunSuite with Logging {
     assert(getNestedMessages(e) contains "Unknown column 'A0.OWNER_NAME' in 'field list'")
   }
 
-  private val versions = Seq("12", "13")
+  private val versions = Seq("12", "13", "14")
 
   private var client: ClientInterface = null
 

From 4c5889e8f5fd384a3a33e49d75a64cde95d2c9f3 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Mon, 15 Jun 2015 08:18:01 +0100
Subject: [PATCH 486/525] [SPARK-8316] Upgrade to Maven 3.3.3

Versions of Maven older than 3.3.0 apparently have [a bug in how they handle transitive dependencies](https://github.com/apache/spark/pull/6492#issuecomment-111001101).

I confirmed that upgrading to Maven 3.3.3 resolves at least the particular manifestation of this bug that I ran into.

Author: Nicholas Chammas <nicholas.chammas@gmail.com>

Closes #6770 from nchammas/maven-333 and squashes the following commits:

6bed2d9 [Nicholas Chammas] upgrade to Maven 3.3.3
---
 build/mvn | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/build/mvn b/build/mvn
index 3561110a4c019..e8364181e8230 100755
--- a/build/mvn
+++ b/build/mvn
@@ -69,11 +69,14 @@ install_app() {
 
 # Install maven under the build/ folder
 install_mvn() {
+  local MVN_VERSION="3.3.3"
+
   install_app \
-    "http://archive.apache.org/dist/maven/maven-3/3.2.5/binaries" \
-    "apache-maven-3.2.5-bin.tar.gz" \
-    "apache-maven-3.2.5/bin/mvn"
-  MVN_BIN="${_DIR}/apache-maven-3.2.5/bin/mvn"
+    "http://archive.apache.org/dist/maven/maven-3/${MVN_VERSION}/binaries" \
+    "apache-maven-${MVN_VERSION}-bin.tar.gz" \
+    "apache-maven-${MVN_VERSION}/bin/mvn"
+
+  MVN_BIN="${_DIR}/apache-maven-${MVN_VERSION}/bin/mvn"
 }
 
 # Install zinc under the build/ folder
@@ -105,28 +108,16 @@ install_scala() {
   SCALA_LIBRARY="$(cd "$(dirname ${scala_bin})/../lib" && pwd)/scala-library.jar"
 }
 
-# Determines if a given application is already installed. If not, will attempt
-# to install
-## Arg1 - application name
-## Arg2 - Alternate path to local install under build/ dir
-check_and_install_app() {
-  # create the local environment variable in uppercase
-  local app_bin="`echo $1 | awk '{print toupper(\$0)}'`_BIN"
-  # some black magic to set the generated app variable (i.e. MVN_BIN) into the
-  # environment
-  eval "${app_bin}=`which $1 2>/dev/null`"
-
-  if [ -z "`which $1 2>/dev/null`" ]; then
-    install_$1
-  fi
-}
-
 # Setup healthy defaults for the Zinc port if none were provided from
 # the environment
 ZINC_PORT=${ZINC_PORT:-"3030"}
 
-# Check and install all applications necessary to build Spark
-check_and_install_app "mvn"
+# Install Maven if necessary
+MVN_BIN="$(command -v mvn)"
+
+if [ ! "$MVN_BIN" ]; then
+  install_mvn
+fi
 
 # Install the proper version of Scala and Zinc for the build
 install_zinc

From 56d4e8a2d0f6aab9a599cd8733e20500ffe8fc8a Mon Sep 17 00:00:00 2001
From: andrewor14 <andrew@databricks.com>
Date: Mon, 15 Jun 2015 08:16:22 -0700
Subject: [PATCH 487/525] [SPARK-8350] [R] Log R unit test output to
 "unit-tests.log"

Right now it's logged to "R-unit-tests.log". Jenkins currently only archives files named "unit-tests.log", and this is what all other modules (e.g. SQL, network, REPL) use.
1. We should be consistent
2. I don't want to reconfigure Jenkins to accept a different file

shivaram

Author: andrewor14 <andrew@databricks.com>
Author: Andrew Or <andrew@databricks.com>

Closes #6807 from andrewor14/r-logs and squashes the following commits:

96005d2 [andrewor14] Nest unit-tests.log further until R
407c46c [andrewor14] Add target to log path
d7b68ae [Andrew Or] Log R unit test output to "unit-tests.log"
---
 R/log4j.properties | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/log4j.properties b/R/log4j.properties
index 701adb2a3da1d..cce8d9152d32d 100644
--- a/R/log4j.properties
+++ b/R/log4j.properties
@@ -19,7 +19,7 @@
 log4j.rootCategory=INFO, file
 log4j.appender.file=org.apache.log4j.FileAppender
 log4j.appender.file.append=true
-log4j.appender.file.file=R-unit-tests.log
+log4j.appender.file.file=R/target/unit-tests.log
 log4j.appender.file.layout=org.apache.log4j.PatternLayout
 log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
 

From 6ae21a944a0f4580b55749776223c827450b00da Mon Sep 17 00:00:00 2001
From: Yadong Qi <qiyadong2010@gmail.com>
Date: Mon, 15 Jun 2015 12:01:52 -0700
Subject: [PATCH 488/525] [SPARK-6583] [SQL] Support aggregate functions in
 ORDER BY

Add aggregates in ORDER BY clauses to the `Aggregate` operator beneath.  Project these results away after the Sort.

Based on work by watermen.  Also Closes #5290.

Author: Yadong Qi <qiyadong2010@gmail.com>
Author: Michael Armbrust <michael@databricks.com>

Closes #6816 from marmbrus/pr/5290 and squashes the following commits:

3226a97 [Michael Armbrust] consistent ordering
eb8938d [Michael Armbrust] no vars
c8b25c1 [Yadong Qi] move the test data.
7f9b736 [Yadong Qi] delete Substring case
a1e87c1 [Yadong Qi] fix conflict
f119849 [Yadong Qi] order by aggregated function
---
 .../sql/catalyst/analysis/Analyzer.scala      | 19 ++++++--
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 45 +++++++++++++++++++
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 4b7fef7126989..badf903478303 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.types._
+import scala.collection.mutable.ArrayBuffer
 
 /**
  * A trivial [[Analyzer]] with an [[EmptyCatalog]] and [[EmptyFunctionRegistry]]. Used for testing
@@ -396,19 +397,31 @@ class Analyzer(
         }
       case s @ Sort(ordering, global, a @ Aggregate(grouping, aggs, child))
           if !s.resolved && a.resolved =>
-        val unresolved = ordering.flatMap(_.collect { case UnresolvedAttribute(name) => name })
         // A small hack to create an object that will allow us to resolve any references that
         // refer to named expressions that are present in the grouping expressions.
         val groupingRelation = LocalRelation(
           grouping.collect { case ne: NamedExpression => ne.toAttribute }
         )
 
-        val (resolvedOrdering, missing) = resolveAndFindMissing(ordering, a, groupingRelation)
+        // Find sort attributes that are projected away so we can temporarily add them back in.
+        val (resolvedOrdering, unresolved) = resolveAndFindMissing(ordering, a, groupingRelation)
+
+        // Find aggregate expressions and evaluate them early, since they can't be evaluated in a
+        // Sort.
+        val (withAggsRemoved, aliasedAggregateList) = resolvedOrdering.map {
+          case aggOrdering if aggOrdering.collect { case a: AggregateExpression => a }.nonEmpty =>
+            val aliased = Alias(aggOrdering.child, "_aggOrdering")()
+            (aggOrdering.copy(child = aliased.toAttribute), aliased :: Nil)
+
+          case other => (other, Nil)
+        }.unzip
+
+        val missing = unresolved ++ aliasedAggregateList.flatten
 
         if (missing.nonEmpty) {
           // Add missing grouping exprs and then project them away after the sort.
           Project(a.output,
-            Sort(resolvedOrdering, global,
+            Sort(withAggsRemoved, global,
               Aggregate(grouping, aggs ++ missing, child)))
         } else {
           s // Nothing we can do here. Return original plan.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index d1520b757e57b..a47cc30e92e27 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1366,6 +1366,51 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1))
   }
 
+  test("SPARK-6583 order by aggregated function") {
+    Seq("1" -> 3, "1" -> 4, "2" -> 7, "2" -> 8, "3" -> 5, "3" -> 6, "4" -> 1, "4" -> 2)
+      .toDF("a", "b").registerTempTable("orderByData")
+
+    checkAnswer(
+      sql(
+        """
+          |SELECT a
+          |FROM orderByData
+          |GROUP BY a
+          |ORDER BY sum(b)
+        """.stripMargin),
+      Row("4") :: Row("1") :: Row("3") :: Row("2") :: Nil)
+
+    checkAnswer(
+      sql(
+        """
+          |SELECT sum(b)
+          |FROM orderByData
+          |GROUP BY a
+          |ORDER BY sum(b)
+        """.stripMargin),
+      Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil)
+
+    checkAnswer(
+      sql(
+        """
+          |SELECT a, sum(b)
+          |FROM orderByData
+          |GROUP BY a
+          |ORDER BY sum(b)
+        """.stripMargin),
+      Row("4", 3) :: Row("1", 7) :: Row("3", 11) :: Row("2", 15) :: Nil)
+
+    checkAnswer(
+      sql(
+        """
+            |SELECT a, sum(b)
+            |FROM orderByData
+            |GROUP BY a
+            |ORDER BY sum(b) + 1
+          """.stripMargin),
+      Row("4", 3) :: Row("1", 7) :: Row("3", 11) :: Row("2", 15) :: Nil)
+  }
+
   test("SPARK-7952: fix the equality check between boolean and numeric types") {
     withTempTable("t") {
       // numeric field i, boolean field j, result of i = j, result of i <=> j

From 1a62d61696a0481508d83a07d19ab3701245ac20 Mon Sep 17 00:00:00 2001
From: tedyu <yuzhihong@gmail.com>
Date: Mon, 15 Jun 2015 17:00:38 -0700
Subject: [PATCH 489/525] SPARK-8336 Fix NullPointerException with
 functions.rand()

This PR fixes the problem reported by Justin Yip in the thread 'NullPointerException with functions.rand()'

Tested using spark-shell and verified that the following works:
sqlContext.createDataFrame(Seq((1,2), (3, 100))).withColumn("index", rand(30)).show()

Author: tedyu <yuzhihong@gmail.com>

Closes #6793 from tedyu/master and squashes the following commits:

62fd97b [tedyu] Create RandomSuite
750f92c [tedyu] Add test for Rand() with seed
a1d66c5 [tedyu] Fix NullPointerException with functions.rand()
---
 .../sql/catalyst/expressions/random.scala     |  6 +++-
 .../catalyst/expressions/RandomSuite.scala    | 33 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index cc34467391b96..45588bacd2e45 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -37,7 +37,11 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
    * Record ID within each partition. By being transient, the Random Number Generator is
    * reset every time we serialize and deserialize it.
    */
-  @transient protected lazy val rng = new XORShiftRandom(seed + TaskContext.get().partitionId())
+  @transient protected lazy val partitionId = TaskContext.get() match {
+    case null => 0
+    case _ => TaskContext.get().partitionId()
+  }
+  @transient protected lazy val rng = new XORShiftRandom(seed + partitionId)
 
   override def deterministic: Boolean = false
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
new file mode 100644
index 0000000000000..9be2b23a53f27
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.scalatest.Matchers._
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.types.{DoubleType, IntegerType}
+
+
+class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("random") {
+    val row = create_row(1.1, 2.0, 3.1, null)
+    checkDoubleEvaluation(Rand(30), (0.7363714192755834 +- 0.001), row)
+  }
+}

From bc76a0f7506c9796209a96b027a236270c23bbf6 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 15 Jun 2015 23:03:14 -0700
Subject: [PATCH 490/525] [SPARK-7184] [SQL] enable codegen by default

In order to have better performance out of box, this PR turn on codegen by default, then codegen can be tested by sql/test and hive/test.

This PR also fix some corner cases for codegen.

Before 1.5 release, we should re-visit this, turn it off if it's not stable or causing regressions.

cc rxin JoshRosen

Author: Davies Liu <davies@databricks.com>

Closes #6726 from davies/enable_codegen and squashes the following commits:

f3b25a5 [Davies Liu] fix warning
73750ea [Davies Liu] fix long overflow when compare
3017a47 [Davies Liu] Merge branch 'master' of github.com:apache/spark into enable_codegen
a7d75da [Davies Liu] Merge branch 'master' of github.com:apache/spark into enable_codegen
ff5b75a [Davies Liu] Merge branch 'master' of github.com:apache/spark into enable_codegen
f4cf2c2 [Davies Liu] fix style
99fc139 [Davies Liu] Merge branch 'enable_codegen' of github.com:davies/spark into enable_codegen
91fc7a2 [Davies Liu] disable codegen for ScalaUDF
207e339 [Davies Liu] Update CodeGenerator.scala
44573a3 [Davies Liu] check thread safety of expression
f3886fa [Davies Liu] don't inline primitiveTerm for null literal
c8e7cd2 [Davies Liu] address comment
a8618c9 [Davies Liu] enable codegen by default
---
 .../catalyst/expressions/BoundAttribute.scala |  4 +-
 .../sql/catalyst/expressions/Expression.scala | 13 +++
 .../sql/catalyst/expressions/ScalaUdf.scala   |  2 +
 .../sql/catalyst/expressions/SortOrder.scala  |  2 +-
 .../sql/catalyst/expressions/arithmetic.scala | 84 +++++++++----------
 .../expressions/codegen/CodeGenerator.scala   |  8 +-
 .../expressions/namedExpressions.scala        |  2 +
 .../catalyst/expressions/nullFunctions.scala  |  7 +-
 .../expressions/windowExpressions.scala       |  3 +-
 .../plans/logical/LocalRelation.scala         |  2 +-
 .../sql/catalyst/ScalaReflectionSuite.scala   |  3 +-
 .../scala/org/apache/spark/sql/SQLConf.scala  | 10 +--
 .../org/apache/spark/sql/SQLContext.scala     |  5 +-
 .../spark/sql/columnar/ColumnBuilder.scala    |  1 +
 .../spark/sql/execution/SparkPlan.scala       |  7 +-
 .../MonotonicallyIncreasingID.scala           |  2 +
 .../joins/BroadcastLeftSemiJoinHash.scala     |  3 +-
 .../apache/spark/sql/parquet/newParquet.scala |  6 +-
 .../apache/spark/sql/sources/commands.scala   |  5 +-
 .../apache/spark/sql/sources/interfaces.scala |  3 +-
 .../org/apache/spark/sql/hive/hiveUdfs.scala  |  4 +
 21 files changed, 95 insertions(+), 81 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index c4dd11a4518cd..5db2fcfcb267b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -19,9 +19,9 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.errors.attachTree
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.catalyst.{InternalRow, trees}
 
 /**
  * A bound reference points to a specific slot in the input tuple, allowing the actual value
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 7427ca76b54d7..a10a959ae766f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -60,6 +60,14 @@ abstract class Expression extends TreeNode[Expression] {
   /** Returns the result of evaluating this expression on a given input Row */
   def eval(input: InternalRow = null): Any
 
+  /**
+   * Return true if this expression is thread-safe, which means it could be used by multiple
+   * threads in the same time.
+   *
+   * An expression that is not thread-safe can not be cached and re-used, especially for codegen.
+   */
+  def isThreadSafe: Boolean = true
+
   /**
    * Returns an [[GeneratedExpressionCode]], which contains Java source code that
    * can be used to generate the result of evaluating the expression on an input row.
@@ -68,6 +76,9 @@ abstract class Expression extends TreeNode[Expression] {
    * @return [[GeneratedExpressionCode]]
    */
   def gen(ctx: CodeGenContext): GeneratedExpressionCode = {
+    if (!isThreadSafe) {
+      throw new Exception(s"$this is not thread-safe, can not be used in codegen")
+    }
     val isNull = ctx.freshName("isNull")
     val primitive = ctx.freshName("primitive")
     val ve = GeneratedExpressionCode("", isNull, primitive)
@@ -169,6 +180,7 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
 
   override def toString: String = s"($left $symbol $right)"
 
+  override def isThreadSafe: Boolean = left.isThreadSafe && right.isThreadSafe
   /**
    * Short hand for generating binary evaluation code, which depends on two sub-evaluations of
    * the same type.  If either of the sub-expressions is null, the result of this computation
@@ -218,6 +230,7 @@ abstract class UnaryExpression extends Expression with trees.UnaryNode[Expressio
 
   override def foldable: Boolean = child.foldable
   override def nullable: Boolean = child.nullable
+  override def isThreadSafe: Boolean = child.isThreadSafe
 
   /**
    * Called by unary expressions to generate a code block that returns null if its parent returns
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
index b3ce698c5552d..3992f1f59dad8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala
@@ -958,4 +958,6 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi
   private[this] val converter = CatalystTypeConverters.createToCatalystConverter(dataType)
   override def eval(input: InternalRow): Any = converter(f(input))
 
+  // TODO(davies): make ScalaUdf work with codegen
+  override def isThreadSafe: Boolean = false
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
index 8a3435599922f..4baae03b3a224 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.{InternalRow, trees}
+import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.types.DataType
 
 abstract sealed class SortDirection
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 9d1e96572a26d..8b78c50000166 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -341,31 +341,29 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    if (ctx.isNativeType(left.dataType)) {
-      val eval1 = left.gen(ctx)
-      val eval2 = right.gen(ctx)
-      eval1.code + eval2.code + s"""
-        boolean ${ev.isNull} = false;
-        ${ctx.javaType(left.dataType)} ${ev.primitive} =
-          ${ctx.defaultValue(left.dataType)};
-
-        if (${eval1.isNull}) {
-          ${ev.isNull} = ${eval2.isNull};
-          ${ev.primitive} = ${eval2.primitive};
-        } else if (${eval2.isNull}) {
-          ${ev.isNull} = ${eval1.isNull};
+    val eval1 = left.gen(ctx)
+    val eval2 = right.gen(ctx)
+    val compCode = ctx.genComp(dataType, eval1.primitive, eval2.primitive)
+
+    eval1.code + eval2.code + s"""
+      boolean ${ev.isNull} = false;
+      ${ctx.javaType(left.dataType)} ${ev.primitive} =
+        ${ctx.defaultValue(left.dataType)};
+
+      if (${eval1.isNull}) {
+        ${ev.isNull} = ${eval2.isNull};
+        ${ev.primitive} = ${eval2.primitive};
+      } else if (${eval2.isNull}) {
+        ${ev.isNull} = ${eval1.isNull};
+        ${ev.primitive} = ${eval1.primitive};
+      } else {
+        if ($compCode > 0) {
           ${ev.primitive} = ${eval1.primitive};
         } else {
-          if (${eval1.primitive} > ${eval2.primitive}) {
-            ${ev.primitive} = ${eval1.primitive};
-          } else {
-            ${ev.primitive} = ${eval2.primitive};
-          }
+          ${ev.primitive} = ${eval2.primitive};
         }
-      """
-    } else {
-      super.genCode(ctx, ev)
-    }
+      }
+    """
   }
   override def toString: String = s"MaxOf($left, $right)"
 }
@@ -395,33 +393,29 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    if (ctx.isNativeType(left.dataType)) {
-
-      val eval1 = left.gen(ctx)
-      val eval2 = right.gen(ctx)
-
-      eval1.code + eval2.code + s"""
-        boolean ${ev.isNull} = false;
-        ${ctx.javaType(left.dataType)} ${ev.primitive} =
-          ${ctx.defaultValue(left.dataType)};
+    val eval1 = left.gen(ctx)
+    val eval2 = right.gen(ctx)
+    val compCode = ctx.genComp(dataType, eval1.primitive, eval2.primitive)
 
-        if (${eval1.isNull}) {
-          ${ev.isNull} = ${eval2.isNull};
-          ${ev.primitive} = ${eval2.primitive};
-        } else if (${eval2.isNull}) {
-          ${ev.isNull} = ${eval1.isNull};
+    eval1.code + eval2.code + s"""
+      boolean ${ev.isNull} = false;
+      ${ctx.javaType(left.dataType)} ${ev.primitive} =
+        ${ctx.defaultValue(left.dataType)};
+
+      if (${eval1.isNull}) {
+        ${ev.isNull} = ${eval2.isNull};
+        ${ev.primitive} = ${eval2.primitive};
+      } else if (${eval2.isNull}) {
+        ${ev.isNull} = ${eval1.isNull};
+        ${ev.primitive} = ${eval1.primitive};
+      } else {
+        if ($compCode < 0) {
           ${ev.primitive} = ${eval1.primitive};
         } else {
-          if (${eval1.primitive} < ${eval2.primitive}) {
-            ${ev.primitive} = ${eval1.primitive};
-          } else {
-            ${ev.primitive} = ${eval2.primitive};
-          }
+          ${ev.primitive} = ${eval2.primitive};
         }
-      """
-    } else {
-      super.genCode(ctx, ev)
-    }
+      }
+    """
   }
 
   override def toString: String = s"MinOf($left, $right)"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 54f06aaa10484..ab850d17a6dd3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -24,7 +24,6 @@ import com.google.common.cache.{CacheBuilder, CacheLoader}
 import org.codehaus.janino.ClassBodyEvaluator
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -176,9 +175,8 @@ class CodeGenContext {
    * Generate code for compare expression in Java
    */
   def genComp(dataType: DataType, c1: String, c2: String): String = dataType match {
-    // Use signum() to keep any small difference bwteen float/double
-    case FloatType | DoubleType => s"(int)java.lang.Math.signum($c1 - $c2)"
-    case dt: DataType if isPrimitiveType(dt) => s"(int)($c1 - $c2)"
+    // use c1 - c2 may overflow
+    case dt: DataType if isPrimitiveType(dt) => s"(int)($c1 > $c2 ? 1 : $c1 < $c2 ? -1 : 0)"
     case BinaryType => s"org.apache.spark.sql.catalyst.util.TypeUtils.compareBinary($c1, $c2)"
     case other => s"$c1.compare($c2)"
   }
@@ -266,7 +264,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
    * weak keys/values and thus does not respond to memory pressure.
    */
   protected val cache = CacheBuilder.newBuilder()
-    .maximumSize(1000)
+    .maximumSize(100)
     .build(
       new CacheLoader[InType, OutType]() {
         override def load(in: InType): OutType = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index f22c8a7f6a374..58dbeaf89cad5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -117,6 +117,8 @@ case class Alias(child: Expression, name: String)(
 
   override def eval(input: InternalRow): Any = child.eval(input)
 
+  override def isThreadSafe: Boolean = child.isThreadSafe
+
   override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
 
   override def dataType: DataType = child.dataType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index 0d06589a795b1..98acaf23c44c1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
-import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.types.DataType
 
 case class Coalesce(children: Seq[Expression]) extends Expression {
@@ -53,6 +52,8 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
     result
   }
 
+  override def isThreadSafe: Boolean = children.forall(_.isThreadSafe)
+
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     s"""
       boolean ${ev.isNull} = true;
@@ -73,7 +74,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
   }
 }
 
-case class IsNull(child: Expression) extends Predicate with trees.UnaryNode[Expression] {
+case class IsNull(child: Expression) extends UnaryExpression with Predicate {
   override def foldable: Boolean = child.foldable
   override def nullable: Boolean = false
 
@@ -91,7 +92,7 @@ case class IsNull(child: Expression) extends Predicate with trees.UnaryNode[Expr
   override def toString: String = s"IS NULL $child"
 }
 
-case class IsNotNull(child: Expression) extends Predicate with trees.UnaryNode[Expression] {
+case class IsNotNull(child: Expression) extends UnaryExpression with Predicate {
   override def foldable: Boolean = child.foldable
   override def nullable: Boolean = false
   override def toString: String = s"IS NOT NULL $child"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index 056f170539884..896e383f50eac 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -19,8 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{NumericType, DataType}
+import org.apache.spark.sql.types.{DataType, NumericType}
 
 /**
  * The trait of the Window Specification (specified in the OVER clause or WINDOW clause) for
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
index 2c946cd12f8d8..1868f119f0e97 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.plans.logical
 
 import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters, analysis}
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis}
 import org.apache.spark.sql.types.{StructField, StructType}
 
 object LocalRelation {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
index b4d5e013f3582..c2d739b529295 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
@@ -21,7 +21,6 @@ import java.math.BigInteger
 import java.sql.{Date, Timestamp}
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
 
 case class PrimitiveData(
@@ -75,7 +74,7 @@ case class MultipleConstructorsData(a: Int, b: String, c: Double) {
 }
 
 class ScalaReflectionSuite extends SparkFunSuite {
-  import ScalaReflection._
+  import org.apache.spark.sql.catalyst.ScalaReflection._
 
   test("primitive data") {
     val schema = schemaFor[PrimitiveData]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 87f40482e31bb..55ab6b3358e3c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -171,15 +171,11 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
   private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN, "false").toBoolean
 
   /**
-   * When set to true, Spark SQL will use the Scala compiler at runtime to generate custom bytecode
+   * When set to true, Spark SQL will use the Janino at runtime to generate custom bytecode
    * that evaluates expressions found in queries.  In general this custom code runs much faster
-   * than interpreted evaluation, but there are significant start-up costs due to compilation.
-   * As a result codegen is only beneficial when queries run for a long time, or when the same
-   * expressions are used multiple times.
-   *
-   * Defaults to false as this feature is currently experimental.
+   * than interpreted evaluation, but there are some start-up costs (5-10ms) due to compilation.
    */
-  private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED, "false").toBoolean
+  private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED, "true").toBoolean
 
   /**
    * caseSensitive analysis true by default
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 22d0e50e4ef6f..9d1f89d6d7bd8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -31,14 +31,13 @@ import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.{InternalRow, _}
 import org.apache.spark.sql.catalyst.analysis._
-import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.errors.DialectException
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer}
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
-import org.apache.spark.sql.catalyst.ParserDialect
+import org.apache.spark.sql.catalyst.{InternalRow, ParserDialect, _}
 import org.apache.spark.sql.execution.{Filter, _}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
index cc7506dec1ee8..1949625699ca8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.columnar
 
 import java.nio.{ByteBuffer, ByteOrder}
+
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.columnar.ColumnBuilder._
 import org.apache.spark.sql.columnar.compression.{AllCompressionSchemes, CompressibleColumnBuilder}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 7739a9f949c77..2b8d30294293c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -156,7 +156,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
       expressions: Seq[Expression], inputSchema: Seq[Attribute]): Projection = {
     log.debug(
       s"Creating Projection: $expressions, inputSchema: $inputSchema, codegen:$codegenEnabled")
-    if (codegenEnabled) {
+    if (codegenEnabled && expressions.forall(_.isThreadSafe)) {
       GenerateProjection.generate(expressions, inputSchema)
     } else {
       new InterpretedProjection(expressions, inputSchema)
@@ -168,7 +168,8 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
       inputSchema: Seq[Attribute]): () => MutableProjection = {
     log.debug(
       s"Creating MutableProj: $expressions, inputSchema: $inputSchema, codegen:$codegenEnabled")
-    if(codegenEnabled) {
+    if(codegenEnabled && expressions.forall(_.isThreadSafe)) {
+
       GenerateMutableProjection.generate(expressions, inputSchema)
     } else {
       () => new InterpretedMutableProjection(expressions, inputSchema)
@@ -178,7 +179,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
 
   protected def newPredicate(
       expression: Expression, inputSchema: Seq[Attribute]): (InternalRow) => Boolean = {
-    if (codegenEnabled) {
+    if (codegenEnabled && expression.isThreadSafe) {
       GeneratePredicate.generate(expression, inputSchema)
     } else {
       InterpretedPredicate.create(expression, inputSchema)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index 68914cf85cb50..3b217348b7b7a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -48,4 +48,6 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
     count += 1
     (TaskContext.get().partitionId().toLong << 33) + currentCount
   }
+
+  override def isThreadSafe: Boolean = false
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
index 044964f3a355b..412a3d4178e12 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
@@ -50,7 +50,8 @@ case class BroadcastLeftSemiJoinHash(
       if (!rowKey.anyNull) {
         val keyExists = hashSet.contains(rowKey)
         if (!keyExists) {
-          hashSet.add(rowKey)
+          // rowKey may be not serializable (from codegen)
+          hashSet.add(rowKey.copy())
         }
       }
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index bc27a9b67a6d6..bba6f1ec96aa8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -34,16 +34,16 @@ import org.apache.parquet.hadoop._
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.apache.parquet.hadoop.util.ContextUtil
 
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.{Partition => SparkPartition, SerializableWritable, Logging, SparkException}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.rdd.RDD._
 import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.RDD._
 import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.util.Utils
+import org.apache.spark.{Logging, SerializableWritable, SparkException, Partition => SparkPartition}
 
 private[sql] class DefaultSource extends HadoopFsRelationProvider {
   override def createRelation(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index 1763cee419572..3dbe6faabf453 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -23,7 +23,7 @@ import scala.collection.mutable
 
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat}
+import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, FileOutputCommitter => MapReduceFileOutputCommitter}
 import org.apache.parquet.hadoop.util.ContextUtil
 
 import org.apache.spark._
@@ -211,6 +211,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
 
         val partitionProj = newProjection(codegenEnabled, partitionOutput, output)
         val dataProj = newProjection(codegenEnabled, dataOutput, output)
+
         val dataConverter: InternalRow => Row = if (needsConversion) {
           CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row]
         } else {
@@ -244,7 +245,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
       inputSchema: Seq[Attribute]): Projection = {
     log.debug(
       s"Creating Projection: $expressions, inputSchema: $inputSchema, codegen:$codegenEnabled")
-    if (codegenEnabled) {
+    if (codegenEnabled && expressions.forall(_.isThreadSafe)) {
       GenerateProjection.generate(expressions, inputSchema)
     } else {
       new InterpretedProjection(expressions, inputSchema)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 27534a1f48ce2..43d3507d7d2ba 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -576,6 +576,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
     // Yeah, to workaround serialization...
     val dataSchema = this.dataSchema
     val codegenEnabled = this.codegenEnabled
+    val needConversion = this.needConversion
 
     val requiredOutput = requiredColumns.map { col =>
       val field = dataSchema(col)
@@ -590,7 +591,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
         rdd.map(_.asInstanceOf[InternalRow])
       }
     converted.mapPartitions { rows =>
-      val buildProjection = if (codegenEnabled) {
+      val buildProjection = if (codegenEnabled && requiredOutput.forall(_.isThreadSafe)) {
         GenerateMutableProjection.generate(requiredOutput, dataSchema.toAttributes)
       } else {
         () => new InterpretedMutableProjection(requiredOutput, dataSchema.toAttributes)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index c40dd4e4b94f8..4986b1ea9d906 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -120,6 +120,8 @@ private[hive] case class HiveSimpleUdf(funcWrapper: HiveFunctionWrapper, childre
   @transient
   protected lazy val cached: Array[AnyRef] = new Array[AnyRef](children.length)
 
+  override def isThreadSafe: Boolean = false
+
   // TODO: Finish input output types.
   override def eval(input: InternalRow): Any = {
     unwrap(
@@ -178,6 +180,8 @@ private[hive] case class HiveGenericUdf(funcWrapper: HiveFunctionWrapper, childr
 
   lazy val dataType: DataType = inspectorToDataType(returnInspector)
 
+  override def isThreadSafe: Boolean = false
+
   override def eval(input: InternalRow): Any = {
     returnInspector // Make sure initialized.
 

From ccf010f27bc62f7e7f409c6eef7488ab476de609 Mon Sep 17 00:00:00 2001
From: huangzhaowei <carlmartinmax@gmail.com>
Date: Tue, 16 Jun 2015 08:16:09 +0200
Subject: [PATCH 491/525] [SPARK-8367] [STREAMING] Add a limit for
 'spark.streaming.blockInterval` since a data loss bug.

Bug had reported in the jira [SPARK-8367](https://issues.apache.org/jira/browse/SPARK-8367)
The relution is limitting the configuration `spark.streaming.blockInterval` to a positive number.

Author: huangzhaowei <carlmartinmax@gmail.com>
Author: huangzhaowei <SaintBacchus@users.noreply.github.com>

Closes #6818 from SaintBacchus/SPARK-8367 and squashes the following commits:

c9d1927 [huangzhaowei] Update BlockGenerator.scala
bd3f71a [huangzhaowei] Use requre instead of if
3d17796 [huangzhaowei] [SPARK_8367][Streaming]Add a limit for 'spark.streaming.blockInterval' since a data loss bug.
---
 .../org/apache/spark/streaming/receiver/BlockGenerator.scala  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
index 8d73593ab6375..92b51ce39234c 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
@@ -24,7 +24,7 @@ import scala.collection.mutable.ArrayBuffer
 import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.storage.StreamBlockId
 import org.apache.spark.streaming.util.RecurringTimer
-import org.apache.spark.util.{SystemClock, Utils}
+import org.apache.spark.util.SystemClock
 
 /** Listener object for BlockGenerator events */
 private[streaming] trait BlockGeneratorListener {
@@ -80,6 +80,8 @@ private[streaming] class BlockGenerator(
 
   private val clock = new SystemClock()
   private val blockIntervalMs = conf.getTimeAsMs("spark.streaming.blockInterval", "200ms")
+  require(blockIntervalMs > 0, s"'spark.streaming.blockInterval' should be a positive value")
+
   private val blockIntervalTimer =
     new RecurringTimer(clock, blockIntervalMs, updateCurrentBuffer, "BlockGenerator")
   private val blockQueueSize = conf.getInt("spark.streaming.blockQueueSize", 10)

From 658814c898bec04c31a8e57f8da0103497aac6ec Mon Sep 17 00:00:00 2001
From: Kan Zhang <kzhang@apache.org>
Date: Tue, 16 Jun 2015 08:18:26 +0200
Subject: [PATCH 492/525] [SPARK-8129] [CORE] [Sec] Pass auth secrets to
 executors via env variables

Env variables are not visible to non-Spark users, based on suggestion from vanzin.

Author: Kan Zhang <kzhang@apache.org>

Closes #6774 from kanzhang/env and squashes the following commits:

5dd84c6 [Kan Zhang] remove auth secret conf from initial set up for executors
90cb7d2 [Kan Zhang] always filter out auth secret
af4d89d [Kan Zhang] minor refactering
e88993e [Kan Zhang] pass auth secret to executors via env variable
---
 .../org/apache/spark/SecurityManager.scala    | 17 ++++++--
 .../scala/org/apache/spark/SparkConf.scala    |  2 +-
 .../spark/deploy/worker/CommandUtils.scala    | 16 ++++++--
 .../spark/deploy/worker/DriverRunner.scala    |  4 +-
 .../spark/deploy/worker/ExecutorRunner.scala  |  6 +--
 .../deploy/worker/CommandUtilsSuite.scala     | 39 +++++++++++++++++--
 .../deploy/worker/ExecutorRunnerTest.scala    |  7 ++--
 7 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala
index 8aed1e20e0686..673ef49e7c1c5 100644
--- a/core/src/main/scala/org/apache/spark/SecurityManager.scala
+++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala
@@ -192,7 +192,7 @@ private[spark] class SecurityManager(sparkConf: SparkConf)
   // key used to store the spark secret in the Hadoop UGI
   private val sparkSecretLookupKey = "sparkCookie"
 
-  private val authOn = sparkConf.getBoolean("spark.authenticate", false)
+  private val authOn = sparkConf.getBoolean(SecurityManager.SPARK_AUTH_CONF, false)
   // keep spark.ui.acls.enable for backwards compatibility with 1.0
   private var aclsOn =
     sparkConf.getBoolean("spark.acls.enable", sparkConf.getBoolean("spark.ui.acls.enable", false))
@@ -365,10 +365,12 @@ private[spark] class SecurityManager(sparkConf: SparkConf)
       cookie
     } else {
       // user must have set spark.authenticate.secret config
-      sparkConf.getOption("spark.authenticate.secret") match {
+      // For Master/Worker, auth secret is in conf; for Executors, it is in env variable
+      sys.env.get(SecurityManager.ENV_AUTH_SECRET)
+        .orElse(sparkConf.getOption(SecurityManager.SPARK_AUTH_SECRET_CONF)) match {
         case Some(value) => value
         case None => throw new Exception("Error: a secret key must be specified via the " +
-          "spark.authenticate.secret config")
+          SecurityManager.SPARK_AUTH_SECRET_CONF + " config")
       }
     }
     sCookie
@@ -449,3 +451,12 @@ private[spark] class SecurityManager(sparkConf: SparkConf)
   override def getSaslUser(appId: String): String = getSaslUser()
   override def getSecretKey(appId: String): String = getSecretKey()
 }
+
+private[spark] object SecurityManager {
+
+  val SPARK_AUTH_CONF: String = "spark.authenticate"
+  val SPARK_AUTH_SECRET_CONF: String = "spark.authenticate.secret"
+  // This is used to set auth secret to an executor's env variable. It should have the same
+  // value as SPARK_AUTH_SECERET_CONF set in SparkConf
+  val ENV_AUTH_SECRET = "_SPARK_AUTH_SECRET"
+}
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 46d72841dccce..6cf36fbbd6254 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -557,7 +557,7 @@ private[spark] object SparkConf extends Logging {
   def isExecutorStartupConf(name: String): Boolean = {
     isAkkaConf(name) ||
     name.startsWith("spark.akka") ||
-    name.startsWith("spark.auth") ||
+    (name.startsWith("spark.auth") && name != SecurityManager.SPARK_AUTH_SECRET_CONF) ||
     name.startsWith("spark.ssl") ||
     isSparkPortConf(name)
   }
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala
index 0a1d60f58bc58..45a3f43045437 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala
@@ -24,6 +24,7 @@ import scala.collection.JavaConversions._
 import scala.collection.Map
 
 import org.apache.spark.Logging
+import org.apache.spark.SecurityManager
 import org.apache.spark.deploy.Command
 import org.apache.spark.launcher.WorkerCommandBuilder
 import org.apache.spark.util.Utils
@@ -40,12 +41,14 @@ object CommandUtils extends Logging {
    */
   def buildProcessBuilder(
       command: Command,
+      securityMgr: SecurityManager,
       memory: Int,
       sparkHome: String,
       substituteArguments: String => String,
       classPaths: Seq[String] = Seq[String](),
       env: Map[String, String] = sys.env): ProcessBuilder = {
-    val localCommand = buildLocalCommand(command, substituteArguments, classPaths, env)
+    val localCommand = buildLocalCommand(
+      command, securityMgr, substituteArguments, classPaths, env)
     val commandSeq = buildCommandSeq(localCommand, memory, sparkHome)
     val builder = new ProcessBuilder(commandSeq: _*)
     val environment = builder.environment()
@@ -69,6 +72,7 @@ object CommandUtils extends Logging {
    */
   private def buildLocalCommand(
       command: Command,
+      securityMgr: SecurityManager,
       substituteArguments: String => String,
       classPath: Seq[String] = Seq[String](),
       env: Map[String, String]): Command = {
@@ -76,20 +80,26 @@ object CommandUtils extends Logging {
     val libraryPathEntries = command.libraryPathEntries
     val cmdLibraryPath = command.environment.get(libraryPathName)
 
-    val newEnvironment = if (libraryPathEntries.nonEmpty && libraryPathName.nonEmpty) {
+    var newEnvironment = if (libraryPathEntries.nonEmpty && libraryPathName.nonEmpty) {
       val libraryPaths = libraryPathEntries ++ cmdLibraryPath ++ env.get(libraryPathName)
       command.environment + ((libraryPathName, libraryPaths.mkString(File.pathSeparator)))
     } else {
       command.environment
     }
 
+    // set auth secret to env variable if needed
+    if (securityMgr.isAuthenticationEnabled) {
+      newEnvironment += (SecurityManager.ENV_AUTH_SECRET -> securityMgr.getSecretKey)
+    }
+
     Command(
       command.mainClass,
       command.arguments.map(substituteArguments),
       newEnvironment,
       command.classPathEntries ++ classPath,
       Seq[String](), // library path already captured in environment variable
-      command.javaOpts)
+      // filter out auth secret from java options
+      command.javaOpts.filterNot(_.startsWith("-D" + SecurityManager.SPARK_AUTH_SECRET_CONF)))
   }
 
   /** Spawn a thread that will redirect a given stream to a file */
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala
index ef7a703bffe67..1386055eb8c48 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala
@@ -85,8 +85,8 @@ private[deploy] class DriverRunner(
           }
 
           // TODO: If we add ability to submit multiple jars they should also be added here
-          val builder = CommandUtils.buildProcessBuilder(driverDesc.command, driverDesc.mem,
-            sparkHome.getAbsolutePath, substituteVariables)
+          val builder = CommandUtils.buildProcessBuilder(driverDesc.command, securityManager,
+            driverDesc.mem, sparkHome.getAbsolutePath, substituteVariables)
           launchDriver(builder, driverDir, driverDesc.supervise)
         }
         catch {
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
index 7aa85b732fc87..fff17e1095042 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
@@ -25,7 +25,7 @@ import akka.actor.ActorRef
 import com.google.common.base.Charsets.UTF_8
 import com.google.common.io.Files
 
-import org.apache.spark.{SparkConf, Logging}
+import org.apache.spark.{SecurityManager, SparkConf, Logging}
 import org.apache.spark.deploy.{ApplicationDescription, ExecutorState}
 import org.apache.spark.deploy.DeployMessages.ExecutorStateChanged
 import org.apache.spark.util.Utils
@@ -125,8 +125,8 @@ private[deploy] class ExecutorRunner(
   private def fetchAndRunExecutor() {
     try {
       // Launch the process
-      val builder = CommandUtils.buildProcessBuilder(appDesc.command, memory,
-        sparkHome.getAbsolutePath, substituteVariables)
+      val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf),
+        memory, sparkHome.getAbsolutePath, substituteVariables)
       val command = builder.command()
       logInfo("Launch command: " + command.mkString("\"", "\" \"", "\""))
 
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala
index 5b3930c0b0132..7101cb9978df3 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala
@@ -17,21 +17,52 @@
 
 package org.apache.spark.deploy.worker
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
 import org.apache.spark.deploy.Command
 import org.apache.spark.util.Utils
-import org.scalatest.Matchers
+import org.scalatest.{Matchers, PrivateMethodTester}
 
-class CommandUtilsSuite extends SparkFunSuite with Matchers {
+class CommandUtilsSuite extends SparkFunSuite with Matchers with PrivateMethodTester {
 
   test("set libraryPath correctly") {
     val appId = "12345-worker321-9876"
     val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
     val cmd = new Command("mainClass", Seq(), Map(), Seq(), Seq("libraryPathToB"), Seq())
-    val builder = CommandUtils.buildProcessBuilder(cmd, 512, sparkHome, t => t)
+    val builder = CommandUtils.buildProcessBuilder(
+      cmd, new SecurityManager(new SparkConf), 512, sparkHome, t => t)
     val libraryPath = Utils.libraryPathEnvName
     val env = builder.environment
     env.keySet should contain(libraryPath)
     assert(env.get(libraryPath).startsWith("libraryPathToB"))
   }
+
+  test("auth secret shouldn't appear in java opts") {
+    val buildLocalCommand = PrivateMethod[Command]('buildLocalCommand)
+    val conf = new SparkConf
+    val secret = "This is the secret sauce"
+    // set auth secret
+    conf.set(SecurityManager.SPARK_AUTH_SECRET_CONF, secret)
+    val command = new Command("mainClass", Seq(), Map(), Seq(), Seq("lib"),
+      Seq("-D" + SecurityManager.SPARK_AUTH_SECRET_CONF + "=" + secret))
+
+    // auth is not set
+    var cmd = CommandUtils invokePrivate buildLocalCommand(
+      command, new SecurityManager(conf), (t: String) => t, Seq(), Map())
+    assert(!cmd.javaOpts.exists(_.startsWith("-D" + SecurityManager.SPARK_AUTH_SECRET_CONF)))
+    assert(!cmd.environment.contains(SecurityManager.ENV_AUTH_SECRET))
+
+    // auth is set to false
+    conf.set(SecurityManager.SPARK_AUTH_CONF, "false")
+    cmd = CommandUtils invokePrivate buildLocalCommand(
+      command, new SecurityManager(conf), (t: String) => t, Seq(), Map())
+    assert(!cmd.javaOpts.exists(_.startsWith("-D" + SecurityManager.SPARK_AUTH_SECRET_CONF)))
+    assert(!cmd.environment.contains(SecurityManager.ENV_AUTH_SECRET))
+
+    // auth is set to true
+    conf.set(SecurityManager.SPARK_AUTH_CONF, "true")
+    cmd = CommandUtils invokePrivate buildLocalCommand(
+      command, new SecurityManager(conf), (t: String) => t, Seq(), Map())
+    assert(!cmd.javaOpts.exists(_.startsWith("-D" + SecurityManager.SPARK_AUTH_SECRET_CONF)))
+    assert(cmd.environment(SecurityManager.ENV_AUTH_SECRET) === secret)
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
index 3da992788962b..bed6f3ea61241 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
@@ -22,19 +22,20 @@ import java.io.File
 import scala.collection.JavaConversions._
 
 import org.apache.spark.deploy.{ApplicationDescription, Command, ExecutorState}
-import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
 
 class ExecutorRunnerTest extends SparkFunSuite {
   test("command includes appId") {
     val appId = "12345-worker321-9876"
+    val conf = new SparkConf
     val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
     val appDesc = new ApplicationDescription("app name", Some(8), 500,
       Command("foo", Seq(appId), Map(), Seq(), Seq(), Seq()), "appUiUrl")
     val er = new ExecutorRunner(appId, 1, appDesc, 8, 500, null, "blah", "worker321", 123,
-      "publicAddr", new File(sparkHome), new File("ooga"), "blah", new SparkConf, Seq("localDir"),
+      "publicAddr", new File(sparkHome), new File("ooga"), "blah", conf, Seq("localDir"),
       ExecutorState.RUNNING)
     val builder = CommandUtils.buildProcessBuilder(
-      appDesc.command, 512, sparkHome, er.substituteVariables)
+      appDesc.command, new SecurityManager(conf), 512, sparkHome, er.substituteVariables)
     assert(builder.command().last === appId)
   }
 }

From 29c5025a7058e64baccb6dfe3ad74d918da64494 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?hushan=5B=E8=83=A1=E7=8F=8A=5D?= <hushan@xiaomi.com>
Date: Tue, 16 Jun 2015 20:48:33 +0100
Subject: [PATCH 493/525] [SPARK-8387] [WEBUI] Only show 4096 bytes content for
 executor log instead of show all
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Author: hushan[胡珊] <hushan@xiaomi.com>

Closes #6834 from suyanNone/small-display and squashes the following commits:

744212f [hushan[胡珊]] Only show 4096 bytes content for executor log instead all
---
 .../scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
index 9d04d241dae9e..b0937083bc536 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
@@ -303,8 +303,8 @@ class ExecutorRunnable(
       val address = container.getNodeHttpAddress
       val baseUrl = s"$httpScheme$address/node/containerlogs/$containerId/$user"
 
-      env("SPARK_LOG_URL_STDERR") = s"$baseUrl/stderr?start=0"
-      env("SPARK_LOG_URL_STDOUT") = s"$baseUrl/stdout?start=0"
+      env("SPARK_LOG_URL_STDERR") = s"$baseUrl/stderr?start=-4096"
+      env("SPARK_LOG_URL_STDOUT") = s"$baseUrl/stdout?start=-4096"
     }
 
     System.getenv().filterKeys(_.startsWith("SPARK")).foreach { case (k, v) => env(k) = v }

From dc455b88330f79b1181a585277ea9ed3e0763703 Mon Sep 17 00:00:00 2001
From: Moussa Taifi <moutai10@gmail.com>
Date: Tue, 16 Jun 2015 20:59:22 +0100
Subject: [PATCH 494/525] [SPARK-DOCS] [SPARK-SQL] Update
 sql-programming-guide.md

Typo in thriftserver section

Author: Moussa Taifi <moutai10@gmail.com>

Closes #6847 from moutai/patch-1 and squashes the following commits:

1bd29df [Moussa Taifi] Update sql-programming-guide.md
---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 7fed1bf8829f5..61f9c5f02ac72 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1816,7 +1816,7 @@ To start the JDBC/ODBC server, run the following in the Spark directory:
 This script accepts all `bin/spark-submit` command line options, plus a `--hiveconf` option to
 specify Hive properties.  You may run `./sbin/start-thriftserver.sh --help` for a complete list of
 all available options.  By default, the server listens on localhost:10000. You may override this
-bahaviour via either environment variables, i.e.:
+behaviour via either environment variables, i.e.:
 
 {% highlight bash %}
 export HIVE_SERVER2_THRIFT_PORT=<listening-port>

From 4bd10fd5090fb5f4f139267b82e9f2fc15659796 Mon Sep 17 00:00:00 2001
From: Radek Ostrowski <dest.hawaii@gmail.com>
Date: Tue, 16 Jun 2015 21:04:26 +0100
Subject: [PATCH 495/525] [SQL] [DOC] improved a comment

[SQL][DOC] I found it a bit confusing when I came across it for the first time in the docs

Author: Radek Ostrowski <dest.hawaii@gmail.com>
Author: radek <radek@radeks-MacBook-Pro-2.local>

Closes #6332 from radek1st/master and squashes the following commits:

dae3347 [Radek Ostrowski] fixed typo
c76bb3a [radek] improved a comment
---
 docs/configuration.md                                        | 2 +-
 sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 95a322f79b40b..affcd21514d88 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -334,7 +334,7 @@ Apart from these, the following properties are also available, and may be useful
   <td>
     Enable profiling in Python worker, the profile result will show up by `sc.show_profiles()`,
     or it will be displayed before the driver exiting. It also can be dumped into disk by
-    `sc.dump_profiles(path)`. If some of the profile results had been displayed maually,
+    `sc.dump_profiles(path)`. If some of the profile results had been displayed manually,
     they will not be displayed automatically before driver exiting.
 
     By default the `pyspark.profiler.BasicProfiler` will be used, but this can be overridden by
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 9ca168881c5b6..444916bbadb48 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -619,7 +619,7 @@ class DataFrame private[sql](
   def as(alias: Symbol): DataFrame = as(alias.name)
 
   /**
-   * Selects a set of expressions.
+   * Selects a set of column based expressions.
    * {{{
    *   df.select($"colA", $"colB" + 1)
    * }}}

From cebf2411847706a98dc8df9c754ef53d6d12a87c Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Tue, 16 Jun 2015 21:10:18 +0100
Subject: [PATCH 496/525] [SPARK-8126] [BUILD] Make sure temp dir exists when
 running tests.

If you ran "clean" at the top-level sbt project, the temp dir would
go away, so running "test" without restarting sbt would fail. This
fixes that by making sure the temp dir exists before running tests.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6805 from vanzin/SPARK-8126-fix and squashes the following commits:

12d7768 [Marcelo Vanzin] [SPARK-8126] [build] Make sure temp dir exists when running tests.
---
 project/SparkBuild.scala | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 41b7eba3a06c2..b7a3490787d44 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -53,9 +53,6 @@ object BuildCommons {
   val sparkHome = buildLocation
 
   val testTempDir = s"$sparkHome/target/tmp"
-  if (!new File(testTempDir).isDirectory()) {
-    require(new File(testTempDir).mkdirs())
-  }
 }
 
 object SparkBuild extends PomBuild {
@@ -526,6 +523,13 @@ object TestSettings {
     libraryDependencies += "com.novocode" % "junit-interface" % "0.9" % "test",
     // Only allow one test at a time, even across projects, since they run in the same JVM
     parallelExecution in Test := false,
+    // Make sure the test temp directory exists.
+    resourceGenerators in Test <+= resourceManaged in Test map { outDir: File =>
+      if (!new File(testTempDir).isDirectory()) {
+        require(new File(testTempDir).mkdirs())
+      }
+      Seq[File]()
+    },
     concurrentRestrictions in Global += Tags.limit(Tags.Test, 1),
     // Remove certain packages from Scaladoc
     scalacOptions in (Compile, doc) := Seq(

From ca998757e8ff2bdca2c7e88055c389161521d604 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 16 Jun 2015 14:30:30 -0700
Subject: [PATCH 497/525] [SPARK-7916] [MLLIB] MLlib Python doc parity check
 for classification and regression

Check then make the MLlib Python classification and regression doc to be as complete as the Scala doc.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6460 from yanboliang/spark-7916 and squashes the following commits:

f8deda4 [Yanbo Liang] trigger jenkins
6dc4d99 [Yanbo Liang] address comments
ce2a43e [Yanbo Liang] truncate too long line and remove extra sparse
3eaf6ad [Yanbo Liang] MLlib Python doc parity check for classification and regression
---
 .../mllib/regression/RidgeRegression.scala    |   2 +-
 python/pyspark/mllib/classification.py        | 187 +++++++++++-------
 python/pyspark/mllib/regression.py            | 167 ++++++++++++----
 3 files changed, 248 insertions(+), 108 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
index e0c03d8180c7a..7d28ffad45c92 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
@@ -73,7 +73,7 @@ object RidgeRegressionModel extends Loader[RidgeRegressionModel] {
 
 /**
  * Train a regression model with L2-regularization using Stochastic Gradient Descent.
- * This solves the l1-regularized least squares regression formulation
+ * This solves the l2-regularized least squares regression formulation
  *          f(weights) = 1/2n ||A weights-y||^2^  + regParam/2 ||weights||^2^
  * Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
  * its corresponding right hand side label y.
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index a70c664a71fdb..42e41397bf4bc 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -33,8 +33,8 @@
 
 class LinearClassificationModel(LinearModel):
     """
-    A private abstract class representing a multiclass classification model.
-    The categories are represented by int values: 0, 1, 2, etc.
+    A private abstract class representing a multiclass classification
+    model. The categories are represented by int values: 0, 1, 2, etc.
     """
     def __init__(self, weights, intercept):
         super(LinearClassificationModel, self).__init__(weights, intercept)
@@ -44,10 +44,11 @@ def setThreshold(self, value):
         """
         .. note:: Experimental
 
-        Sets the threshold that separates positive predictions from negative
-        predictions. An example with prediction score greater than or equal
-        to this threshold is identified as an positive, and negative otherwise.
-        It is used for binary classification only.
+        Sets the threshold that separates positive predictions from
+        negative predictions. An example with prediction score greater
+        than or equal to this threshold is identified as an positive,
+        and negative otherwise. It is used for binary classification
+        only.
         """
         self._threshold = value
 
@@ -56,8 +57,9 @@ def threshold(self):
         """
         .. note:: Experimental
 
-        Returns the threshold (if any) used for converting raw prediction scores
-        into 0/1 predictions. It is used for binary classification only.
+        Returns the threshold (if any) used for converting raw
+        prediction scores into 0/1 predictions. It is used for
+        binary classification only.
         """
         return self._threshold
 
@@ -65,22 +67,35 @@ def clearThreshold(self):
         """
         .. note:: Experimental
 
-        Clears the threshold so that `predict` will output raw prediction scores.
-        It is used for binary classification only.
+        Clears the threshold so that `predict` will output raw
+        prediction scores. It is used for binary classification only.
         """
         self._threshold = None
 
     def predict(self, test):
         """
-        Predict values for a single data point or an RDD of points using
-        the model trained.
+        Predict values for a single data point or an RDD of points
+        using the model trained.
         """
         raise NotImplementedError
 
 
 class LogisticRegressionModel(LinearClassificationModel):
 
-    """A linear binary classification model derived from logistic regression.
+    """
+    Classification model trained using Multinomial/Binary Logistic
+    Regression.
+
+    :param weights: Weights computed for every feature.
+    :param intercept: Intercept computed for this model. (Only used
+            in Binary Logistic Regression. In Multinomial Logistic
+            Regression, the intercepts will not be a single value,
+            so the intercepts will be part of the weights.)
+    :param numFeatures: the dimension of the features.
+    :param numClasses: the number of possible outcomes for k classes
+            classification problem in Multinomial Logistic Regression.
+            By default, it is binary logistic regression so numClasses
+            will be set to 2.
 
     >>> data = [
     ...     LabeledPoint(0.0, [0.0, 1.0]),
@@ -161,8 +176,8 @@ def numClasses(self):
 
     def predict(self, x):
         """
-        Predict values for a single data point or an RDD of points using
-        the model trained.
+        Predict values for a single data point or an RDD of points
+        using the model trained.
         """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
@@ -225,16 +240,19 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
         """
         Train a logistic regression model on the given data.
 
-        :param data:              The training data, an RDD of LabeledPoint.
-        :param iterations:        The number of iterations (default: 100).
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
         :param step:              The step parameter used in SGD
                                   (default: 1.0).
-        :param miniBatchFraction: Fraction of data to be used for each SGD
-                                  iteration.
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
         :param initialWeights:    The initial weights (default: None).
-        :param regParam:          The regularizer parameter (default: 0.01).
-        :param regType:           The type of regularizer used for training
-                                  our model.
+        :param regParam:          The regularizer parameter
+                                  (default: 0.01).
+        :param regType:           The type of regularizer used for
+                                  training our model.
 
                                   :Allowed values:
                                      - "l1" for using L1 regularization
@@ -243,13 +261,14 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
 
                                      (default: "l2")
 
-        :param intercept:         Boolean parameter which indicates the use
-                                  or not of the augmented representation for
-                                  training data (i.e. whether bias features
-                                  are activated or not).
-        :param validateData:      Boolean parameter which indicates if the
-                                  algorithm should validate data before training.
-                                  (default: True)
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
         """
         def train(rdd, i):
             return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations),
@@ -267,12 +286,15 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType
         """
         Train a logistic regression model on the given data.
 
-        :param data:           The training data, an RDD of LabeledPoint.
-        :param iterations:     The number of iterations (default: 100).
+        :param data:           The training data, an RDD of
+                               LabeledPoint.
+        :param iterations:     The number of iterations
+                               (default: 100).
         :param initialWeights: The initial weights (default: None).
-        :param regParam:       The regularizer parameter (default: 0.01).
-        :param regType:        The type of regularizer used for training
-                               our model.
+        :param regParam:       The regularizer parameter
+                               (default: 0.01).
+        :param regType:        The type of regularizer used for
+                               training our model.
 
                                :Allowed values:
                                  - "l1" for using L1 regularization
@@ -281,19 +303,21 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType
 
                                  (default: "l2")
 
-        :param intercept:      Boolean parameter which indicates the use
-                               or not of the augmented representation for
-                               training data (i.e. whether bias features
-                               are activated or not).
-        :param corrections:    The number of corrections used in the LBFGS
-                               update (default: 10).
-        :param tolerance:      The convergence tolerance of iterations for
-                               L-BFGS (default: 1e-4).
+        :param intercept:      Boolean parameter which indicates the
+                               use or not of the augmented representation
+                               for training data (i.e. whether bias
+                               features are activated or not,
+                               default: False).
+        :param corrections:    The number of corrections used in the
+                               LBFGS update (default: 10).
+        :param tolerance:      The convergence tolerance of iterations
+                               for L-BFGS (default: 1e-4).
         :param validateData:   Boolean parameter which indicates if the
-                               algorithm should validate data before training.
-                               (default: True)
-        :param numClasses:     The number of classes (i.e., outcomes) a label can take
-                               in Multinomial Logistic Regression (default: 2).
+                               algorithm should validate data before
+                               training. (default: True)
+        :param numClasses:     The number of classes (i.e., outcomes) a
+                               label can take in Multinomial Logistic
+                               Regression (default: 2).
 
         >>> data = [
         ...     LabeledPoint(0.0, [0.0, 1.0]),
@@ -323,7 +347,11 @@ def train(rdd, i):
 
 class SVMModel(LinearClassificationModel):
 
-    """A support vector machine.
+    """
+    Model for Support Vector Machines (SVMs).
+
+    :param weights: Weights computed for every feature.
+    :param intercept: Intercept computed for this model.
 
     >>> data = [
     ...     LabeledPoint(0.0, [0.0]),
@@ -370,8 +398,8 @@ def __init__(self, weights, intercept):
 
     def predict(self, x):
         """
-        Predict values for a single data point or an RDD of points using
-        the model trained.
+        Predict values for a single data point or an RDD of points
+        using the model trained.
         """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
@@ -409,16 +437,19 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01,
         """
         Train a support vector machine on the given data.
 
-        :param data:              The training data, an RDD of LabeledPoint.
-        :param iterations:        The number of iterations (default: 100).
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
         :param step:              The step parameter used in SGD
                                   (default: 1.0).
-        :param regParam:          The regularizer parameter (default: 0.01).
-        :param miniBatchFraction: Fraction of data to be used for each SGD
-                                  iteration.
+        :param regParam:          The regularizer parameter
+                                  (default: 0.01).
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
         :param initialWeights:    The initial weights (default: None).
-        :param regType:           The type of regularizer used for training
-                                  our model.
+        :param regType:           The type of regularizer used for
+                                  training our model.
 
                                   :Allowed values:
                                      - "l1" for using L1 regularization
@@ -427,13 +458,14 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01,
 
                                      (default: "l2")
 
-        :param intercept:         Boolean parameter which indicates the use
-                                  or not of the augmented representation for
-                                  training data (i.e. whether bias features
-                                  are activated or not).
-        :param validateData:      Boolean parameter which indicates if the
-                                  algorithm should validate data before training.
-                                  (default: True)
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
         """
         def train(rdd, i):
             return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step),
@@ -449,9 +481,11 @@ class NaiveBayesModel(Saveable, Loader):
     """
     Model for Naive Bayes classifiers.
 
-    Contains two parameters:
-    - pi: vector of logs of class priors (dimension C)
-    - theta: matrix of logs of class conditional probabilities (CxD)
+    :param labels: list of labels.
+    :param pi: log of class priors, whose dimension is C,
+            number of labels.
+    :param theta: log of class conditional probabilities, whose
+            dimension is C-by-D, where D is number of features.
 
     >>> data = [
     ...     LabeledPoint(0.0, [0.0, 0.0]),
@@ -493,7 +527,10 @@ def __init__(self, labels, pi, theta):
         self.theta = theta
 
     def predict(self, x):
-        """Return the most likely class for a data vector or an RDD of vectors"""
+        """
+        Return the most likely class for a data vector
+        or an RDD of vectors
+        """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
         x = _convert_to_vector(x)
@@ -523,16 +560,18 @@ class NaiveBayes(object):
     @classmethod
     def train(cls, data, lambda_=1.0):
         """
-        Train a Naive Bayes model given an RDD of (label, features) vectors.
+        Train a Naive Bayes model given an RDD of (label, features)
+        vectors.
 
-        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
-        handle all kinds of discrete data.  For example, by converting
-        documents into TF-IDF vectors, it can be used for document
-        classification.  By making every vector a 0-1 vector, it can also be
-        used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
+        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which
+        can handle all kinds of discrete data.  For example, by
+        converting documents into TF-IDF vectors, it can be used for
+        document classification. By making every vector a 0-1 vector,
+        it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
+        The input feature values must be nonnegative.
 
         :param data: RDD of LabeledPoint.
-        :param lambda_: The smoothing parameter
+        :param lambda_: The smoothing parameter (default: 1.0).
         """
         first = data.first()
         if not isinstance(first, LabeledPoint):
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 41bde2ce3e60b..0c4d7d3bbee02 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -33,12 +33,12 @@
 class LabeledPoint(object):
 
     """
-    The features and labels of a data point.
+    Class that represents the features and labels of a data point.
 
     :param label: Label for this data point.
     :param features: Vector of features for this point (NumPy array,
-             list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
-             column matrix)
+            list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
+            column matrix)
 
     Note: 'label' and 'features' are accessible as class attributes.
     """
@@ -59,7 +59,12 @@ def __repr__(self):
 
 class LinearModel(object):
 
-    """A linear model that has a vector of coefficients and an intercept."""
+    """
+    A linear model that has a vector of coefficients and an intercept.
+
+    :param weights: Weights computed for every feature.
+    :param intercept: Intercept computed for this model.
+    """
 
     def __init__(self, weights, intercept):
         self._coeff = _convert_to_vector(weights)
@@ -193,18 +198,28 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
               initialWeights=None, regParam=0.0, regType=None, intercept=False,
               validateData=True):
         """
-        Train a linear regression model on the given data.
-
-        :param data:              The training data.
-        :param iterations:        The number of iterations (default: 100).
+        Train a linear regression model using Stochastic Gradient
+        Descent (SGD).
+        This solves the least squares regression formulation
+                f(weights) = 1/n ||A weights-y||^2^
+        (which is the mean squared error).
+        Here the data matrix has n rows, and the input RDD holds the
+        set of rows of A, each with its corresponding right hand side
+        label y. See also the documentation for the precise formulation.
+
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
         :param step:              The step parameter used in SGD
                                   (default: 1.0).
-        :param miniBatchFraction: Fraction of data to be used for each SGD
-                                  iteration.
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
         :param initialWeights:    The initial weights (default: None).
-        :param regParam:          The regularizer parameter (default: 0.0).
-        :param regType:           The type of regularizer used for training
-                                  our model.
+        :param regParam:          The regularizer parameter
+                                  (default: 0.0).
+        :param regType:           The type of regularizer used for
+                                  training our model.
 
                                   :Allowed values:
                                      - "l1" for using L1 regularization (lasso),
@@ -213,13 +228,14 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
 
                                      (default: None)
 
-        :param intercept:         Boolean parameter which indicates the use
-                                  or not of the augmented representation for
-                                  training data (i.e. whether bias features
-                                  are activated or not). (default: False)
-        :param validateData:      Boolean parameter which indicates if the
-                                  algorithm should validate data before training.
-                                  (default: True)
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
         """
         def train(rdd, i):
             return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
@@ -232,8 +248,8 @@ def train(rdd, i):
 @inherit_doc
 class LassoModel(LinearRegressionModelBase):
 
-    """A linear regression model derived from a least-squares fit with an
-    l_1 penalty term.
+    """A linear regression model derived from a least-squares fit with
+    an l_1 penalty term.
 
     >>> from pyspark.mllib.regression import LabeledPoint
     >>> data = [
@@ -304,7 +320,36 @@ class LassoWithSGD(object):
     def train(cls, data, iterations=100, step=1.0, regParam=0.01,
               miniBatchFraction=1.0, initialWeights=None, intercept=False,
               validateData=True):
-        """Train a Lasso regression model on the given data."""
+        """
+        Train a regression model with L1-regularization using
+        Stochastic Gradient Descent.
+        This solves the l1-regularized least squares regression
+        formulation
+            f(weights) = 1/2n ||A weights-y||^2^  + regParam ||weights||_1
+        Here the data matrix has n rows, and the input RDD holds the
+        set of rows of A, each with its corresponding right hand side
+        label y. See also the documentation for the precise formulation.
+
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
+        :param step:              The step parameter used in SGD
+                                  (default: 1.0).
+        :param regParam:          The regularizer parameter
+                                  (default: 0.01).
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
+        :param initialWeights:    The initial weights (default: None).
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
+        """
         def train(rdd, i):
             return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
                                  float(regParam), float(miniBatchFraction), i, bool(intercept),
@@ -316,8 +361,8 @@ def train(rdd, i):
 @inherit_doc
 class RidgeRegressionModel(LinearRegressionModelBase):
 
-    """A linear regression model derived from a least-squares fit with an
-    l_2 penalty term.
+    """A linear regression model derived from a least-squares fit with
+    an l_2 penalty term.
 
     >>> from pyspark.mllib.regression import LabeledPoint
     >>> data = [
@@ -389,7 +434,36 @@ class RidgeRegressionWithSGD(object):
     def train(cls, data, iterations=100, step=1.0, regParam=0.01,
               miniBatchFraction=1.0, initialWeights=None, intercept=False,
               validateData=True):
-        """Train a ridge regression model on the given data."""
+        """
+        Train a regression model with L2-regularization using
+        Stochastic Gradient Descent.
+        This solves the l2-regularized least squares regression
+        formulation
+            f(weights) = 1/2n ||A weights-y||^2^  + regParam/2 ||weights||^2^
+        Here the data matrix has n rows, and the input RDD holds the
+        set of rows of A, each with its corresponding right hand side
+        label y. See also the documentation for the precise formulation.
+
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
+        :param step:              The step parameter used in SGD
+                                  (default: 1.0).
+        :param regParam:          The regularizer parameter
+                                  (default: 0.01).
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
+        :param initialWeights:    The initial weights (default: None).
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
+        """
         def train(rdd, i):
             return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
                                  float(regParam), float(miniBatchFraction), i, bool(intercept),
@@ -400,7 +474,15 @@ def train(rdd, i):
 
 class IsotonicRegressionModel(Saveable, Loader):
 
-    """Regression model for isotonic regression.
+    """
+    Regression model for isotonic regression.
+
+    :param boundaries: Array of boundaries for which predictions are
+            known. Boundaries must be sorted in increasing order.
+    :param predictions: Array of predictions associated to the
+            boundaries at the same index. Results of isotonic
+            regression and therefore monotone.
+    :param isotonic: indicates whether this is isotonic or antitonic.
 
     >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
     >>> irm = IsotonicRegression.train(sc.parallelize(data))
@@ -430,6 +512,25 @@ def __init__(self, boundaries, predictions, isotonic):
         self.isotonic = isotonic
 
     def predict(self, x):
+        """
+        Predict labels for provided features.
+        Using a piecewise linear function.
+        1) If x exactly matches a boundary then associated prediction
+        is returned. In case there are multiple predictions with the
+        same boundary then one of them is returned. Which one is
+        undefined (same as java.util.Arrays.binarySearch).
+        2) If x is lower or higher than all boundaries then first or
+        last prediction is returned respectively. In case there are
+        multiple predictions with the same boundary then the lowest
+        or highest is returned respectively.
+        3) If x falls between two values in boundary array then
+        prediction is treated as piecewise linear function and
+        interpolated value is returned. In case there are multiple
+        values with the same boundary then the same rules as in 2)
+        are used.
+
+        :param x: Feature or RDD of Features to be labeled.
+        """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
         return np.interp(x, self.boundaries, self.predictions)
@@ -451,15 +552,15 @@ def load(cls, sc, path):
 
 
 class IsotonicRegression(object):
-    """
-    Run IsotonicRegression algorithm to obtain isotonic regression model.
 
-    :param data:            RDD of (label, feature, weight) tuples.
-    :param isotonic:        Whether this is isotonic or antitonic.
-    """
     @classmethod
     def train(cls, data, isotonic=True):
-        """Train a isotonic regression model on the given data."""
+        """
+        Train a isotonic regression model on the given data.
+
+        :param data: RDD of (label, feature, weight) tuples.
+        :param isotonic: Whether this is isotonic or antitonic.
+        """
         boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
                                                 data.map(_convert_to_vector), bool(isotonic))
         return IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic)

From 0b8c8fdc121deecacb309784f18470790306dc26 Mon Sep 17 00:00:00 2001
From: baishuo <vc_java@hotmail.com>
Date: Tue, 16 Jun 2015 16:40:02 -0700
Subject: [PATCH 498/525] [SPARK-8156] [SQL] create table to specific database
 by 'use dbname'

when i test the following code:
hiveContext.sql("""use testdb""")
val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c")
df.write
.format("parquet")
.mode(SaveMode.Overwrite)
.saveAsTable("ttt3")
hiveContext.sql("show TABLES in default")

found that the table ttt3 will be created under the database "default"

Author: baishuo <vc_java@hotmail.com>

Closes #6695 from baishuo/SPARK-8516-use-database and squashes the following commits:

9e155f9 [baishuo] remove no use comment
cb9f027 [baishuo] modify testcase
00a7a2d [baishuo] modify testcase
4df48c7 [baishuo] modify testcase
b742e69 [baishuo] modify testcase
3d19ad9 [baishuo] create table to specific database
---
 .../org/apache/spark/sql/hive/HiveContext.scala |  6 ++----
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  2 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala    | 17 +++++++++++++++++
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 3b75b0b04102d..c50835dd8f11d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -271,13 +271,11 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    * @since 1.3.0
    */
   def refreshTable(tableName: String): Unit = {
-    // TODO: Database support...
-    catalog.refreshTable("default", tableName)
+    catalog.refreshTable(catalog.client.currentDatabase, tableName)
   }
 
   protected[hive] def invalidateTable(tableName: String): Unit = {
-    // TODO: Database support...
-    catalog.invalidateTable("default", tableName)
+    catalog.invalidateTable(catalog.client.currentDatabase, tableName)
   }
 
   /**
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 619ef63223241..f35ae96ee0b50 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -143,7 +143,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
       provider: String,
       options: Map[String, String],
       isExternal: Boolean): Unit = {
-    val (dbName, tblName) = processDatabaseAndTableName("default", tableName)
+    val (dbName, tblName) = processDatabaseAndTableName(client.currentDatabase, tableName)
     val tableProperties = new scala.collection.mutable.HashMap[String, String]
     tableProperties.put("spark.sql.sources.provider", provider)
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index af586712e3235..79a85b24d2f60 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -833,4 +833,21 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
         (70 to 79).map(i => Row(i, s"str$i")))
     }
   }
+
+  test("SPARK-8156:create table to specific database by 'use dbname' ") {
+
+    val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c")
+    sqlContext.sql("""create database if not exists testdb8156""")
+    sqlContext.sql("""use testdb8156""")
+    df.write
+      .format("parquet")
+      .mode(SaveMode.Overwrite)
+      .saveAsTable("ttt3")
+
+    checkAnswer(
+      sqlContext.sql("show TABLES in testdb8156").filter("tableName = 'ttt3'"),
+      Row("ttt3", false))
+    sqlContext.sql("""use default""")
+    sqlContext.sql("""drop database if exists testdb8156 CASCADE""")
+  }
 }

From bedff7d532e40557e6c9e4c55e86986ccf77ecd6 Mon Sep 17 00:00:00 2001
From: dragonli <lisurprise@gmail.com>
Date: Tue, 16 Jun 2015 23:44:10 -0700
Subject: [PATCH 499/525] [SPARK-8220][SQL]Add positive identify function

chenghao-intel adrian-wang

Author: dragonli <lisurprise@gmail.com>
Author: zhichao.li <zhichao.li@intel.com>

Closes #6838 from zhichao-li/positive and squashes the following commits:

e1032a0 [dragonli] remove useless import and refactor code
624d438 [zhichao.li] add positive identify function
---
 .../spark/sql/catalyst/analysis/FunctionRegistry.scala |  1 +
 .../spark/sql/catalyst/expressions/arithmetic.scala    |  9 +++++++++
 .../spark/sql/catalyst/optimizer/Optimizer.scala       | 10 ++++++++++
 .../org/apache/spark/sql/MathExpressionsSuite.scala    |  7 +++++++
 4 files changed, 27 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 04e306da23e4c..97b123ec2f6d9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -120,6 +120,7 @@ object FunctionRegistry {
     expression[Log2]("log2"),
     expression[Pow]("pow"),
     expression[Pow]("power"),
+    expression[UnaryPositive]("positive"),
     expression[Rint]("rint"),
     expression[Signum]("sign"),
     expression[Signum]("signum"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 8b78c50000166..167e460d5a93e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -58,6 +58,15 @@ case class UnaryMinus(child: Expression) extends UnaryArithmetic {
   protected override def evalInternal(evalE: Any) = numeric.negate(evalE)
 }
 
+case class UnaryPositive(child: Expression) extends UnaryArithmetic {
+  override def toString: String = s"positive($child)"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
+    defineCodeGen(ctx, ev, c => c)
+
+  protected override def evalInternal(evalE: Any) = evalE
+}
+
 case class Sqrt(child: Expression) extends UnaryArithmetic {
   override def dataType: DataType = DoubleType
   override def nullable: Boolean = true
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index f8f1efcc7e990..9132a786f77a7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -52,6 +52,7 @@ object DefaultOptimizer extends Optimizer {
       LikeSimplification,
       BooleanSimplification,
       PushPredicateThroughJoin,
+      RemovePositive,
       SimplifyFilters,
       SimplifyCasts,
       SimplifyCaseConversionExpressions) ::
@@ -632,6 +633,15 @@ object SimplifyCasts extends Rule[LogicalPlan] {
   }
 }
 
+/**
+ * Removes [[UnaryPositive]] identify function
+ */
+object RemovePositive extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case UnaryPositive(child) => child
+  }
+}
+
 /**
  * Combines two adjacent [[Limit]] operators into one, merging the
  * expressions into one single expression.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index faa1d1193b509..e2daaf6b730c5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -262,4 +262,11 @@ class MathExpressionsSuite extends QueryTest {
       ctx.sql("SELECT negative(1), negative(0), negative(-1)"),
       Row(-1, 0, 1))
   }
+
+  test("positive") {
+    val df = Seq((1, -1, "abc")).toDF("a", "b", "c")
+    checkAnswer(df.selectExpr("positive(a)"), Row(1))
+    checkAnswer(df.selectExpr("positive(b)"), Row(-1))
+    checkAnswer(df.selectExpr("positive(c)"), Row("abc"))
+  }
 }

From e3de14d3b20bff92a4d82ac99825fcb5180fdccc Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 17 Jun 2015 00:28:40 -0700
Subject: [PATCH 500/525] Closes #6850.


From c13da20a55b80b8632d547240d2c8f97539969a1 Mon Sep 17 00:00:00 2001
From: Vyacheslav Baranov <slavik.baranov@gmail.com>
Date: Wed, 17 Jun 2015 09:42:29 +0100
Subject: [PATCH 501/525] [SPARK-8309] [CORE] Support for more than 12M items
 in OpenHashMap

The problem occurs because the position mask `0xEFFFFFF` is incorrect. It has zero 25th bit, so when capacity grows beyond 2^24, `OpenHashMap` calculates incorrect index of value in `_values` array.

I've also added a size check in `rehash()`, so that it fails instead of reporting invalid item indices.

Author: Vyacheslav Baranov <slavik.baranov@gmail.com>

Closes #6763 from SlavikBaranov/SPARK-8309 and squashes the following commits:

8557445 [Vyacheslav Baranov] Resolved review comments
4d5b954 [Vyacheslav Baranov] Resolved review comments
eaf1e68 [Vyacheslav Baranov] Fixed failing test
f9284fd [Vyacheslav Baranov] Resolved review comments
3920656 [Vyacheslav Baranov] SPARK-8309: Support for more than 12M items in OpenHashMap
---
 .../apache/spark/util/collection/OpenHashSet.scala   | 10 +++++++---
 .../spark/util/collection/OpenHashMapSuite.scala     | 12 +++++++++++-
 .../collection/PrimitiveKeyOpenHashMapSuite.scala    |  2 +-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
index 64e7102e3654c..60bf4dd7469f1 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
@@ -45,7 +45,8 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
     loadFactor: Double)
   extends Serializable {
 
-  require(initialCapacity <= (1 << 29), "Can't make capacity bigger than 2^29 elements")
+  require(initialCapacity <= OpenHashSet.MAX_CAPACITY,
+    s"Can't make capacity bigger than ${OpenHashSet.MAX_CAPACITY} elements")
   require(initialCapacity >= 1, "Invalid initial capacity")
   require(loadFactor < 1.0, "Load factor must be less than 1.0")
   require(loadFactor > 0.0, "Load factor must be greater than 0.0")
@@ -223,6 +224,8 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
    */
   private def rehash(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) {
     val newCapacity = _capacity * 2
+    require(newCapacity > 0 && newCapacity <= OpenHashSet.MAX_CAPACITY,
+      s"Can't contain more than ${(loadFactor * OpenHashSet.MAX_CAPACITY).toInt} elements")
     allocateFunc(newCapacity)
     val newBitset = new BitSet(newCapacity)
     val newData = new Array[T](newCapacity)
@@ -276,9 +279,10 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
 private[spark]
 object OpenHashSet {
 
+  val MAX_CAPACITY = 1 << 30
   val INVALID_POS = -1
-  val NONEXISTENCE_MASK = 0x80000000
-  val POSITION_MASK = 0xEFFFFFF
+  val NONEXISTENCE_MASK = 1 << 31
+  val POSITION_MASK = (1 << 31) - 1
 
   /**
    * A set of specialized hash function implementation to avoid boxing hash code computation
diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
index 94e011799921b..3066e9996abda 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
@@ -44,7 +44,7 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
     val goodMap3 = new OpenHashMap[String, String](256)
     assert(goodMap3.size === 0)
     intercept[IllegalArgumentException] {
-      new OpenHashMap[String, Int](1 << 30) // Invalid map size: bigger than 2^29
+      new OpenHashMap[String, Int](1 << 30 + 1) // Invalid map size: bigger than 2^30
     }
     intercept[IllegalArgumentException] {
       new OpenHashMap[String, Int](-1)
@@ -186,4 +186,14 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
     map(null) = 0
     assert(map.contains(null))
   }
+
+  test("support for more than 12M items") {
+    val cnt = 12000000 // 12M
+    val map = new OpenHashMap[Int, Int](cnt)
+    for (i <- 0 until cnt) {
+      map(i) = 1
+    }
+    val numInvalidValues = map.iterator.count(_._2 == 0)
+    assertResult(0)(numInvalidValues)
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMapSuite.scala
index 462bc2f29f9f8..508e737b725bc 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMapSuite.scala
@@ -44,7 +44,7 @@ class PrimitiveKeyOpenHashMapSuite extends SparkFunSuite with Matchers {
     val goodMap3 = new PrimitiveKeyOpenHashMap[Int, Int](256)
     assert(goodMap3.size === 0)
     intercept[IllegalArgumentException] {
-      new PrimitiveKeyOpenHashMap[Int, Int](1 << 30) // Invalid map size: bigger than 2^29
+      new PrimitiveKeyOpenHashMap[Int, Int](1 << 30 + 1) // Invalid map size: bigger than 2^30
     }
     intercept[IllegalArgumentException] {
       new PrimitiveKeyOpenHashMap[Int, Int](-1)

From 104f30c36f3d44b7567f6f77adb92e0a96494541 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Wed, 17 Jun 2015 09:00:37 -0700
Subject: [PATCH 502/525] [SPARK-7199] [SQL] Add date and timestamp support to
 UnsafeRow

JIRA: https://issues.apache.org/jira/browse/SPARK-7199

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #5984 from viirya/add_date_timestamp and squashes the following commits:

7f21ce9 [Liang-Chi Hsieh] For comment.
0b89698 [Liang-Chi Hsieh] Add timestamp to settableFieldTypes.
c30d490 [Liang-Chi Hsieh] Use default IntUnsafeColumnWriter and LongUnsafeColumnWriter.
672ef17 [Liang-Chi Hsieh] Remove getter/setter for Date and Timestamp and use Int and Long for them.
9f3e577 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into add_date_timestamp
281e844 [Liang-Chi Hsieh] Fix scala style.
fb532b5 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into add_date_timestamp
80af342 [Liang-Chi Hsieh] Fix compiling error.
f4f5de6 [Liang-Chi Hsieh] Fix scala style.
a463e83 [Liang-Chi Hsieh] Use Long to store timestamp for rows.
635388a [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into add_date_timestamp
46946c6 [Liang-Chi Hsieh] Adapt for moved DateUtils.
b16994e [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into add_date_timestamp
752251f [Liang-Chi Hsieh] Support setDate. Fix failed test.
fcf8db9 [Liang-Chi Hsieh] Add functions for Date and Timestamp to SpecificRow.
e42a809 [Liang-Chi Hsieh] Fix style.
4c07b57 [Liang-Chi Hsieh] Add date and timestamp support to UnsafeRow.
---
 .../sql/catalyst/expressions/UnsafeRow.java   |  6 ++--
 .../expressions/UnsafeRowConverter.scala      |  4 +++
 .../spark/sql/catalyst/expressions/rows.scala |  3 +-
 .../expressions/UnsafeRowConverterSuite.scala | 30 +++++++++++++++++++
 4 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index aec88c9241d92..c4b7f8490a05b 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -103,7 +103,9 @@ public static int calculateBitSetWidthInBytes(int numFields) {
           IntegerType,
           LongType,
           FloatType,
-          DoubleType
+          DoubleType,
+          DateType,
+          TimestampType
     })));
 
     // We support get() on a superset of the types for which we support set():
@@ -331,8 +333,6 @@ public String getString(int i) {
     return getUTF8String(i).toString();
   }
 
-
-
   @Override
   public InternalRow copy() {
     throw new UnsupportedOperationException();
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
index 5c92f41c639fa..72f740ecaead3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.util.DateUtils
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.array.ByteArrayMethods
@@ -120,6 +122,8 @@ private object UnsafeColumnWriter {
       case FloatType => FloatUnsafeColumnWriter
       case DoubleType => DoubleUnsafeColumnWriter
       case StringType => StringUnsafeColumnWriter
+      case DateType => IntUnsafeColumnWriter
+      case TimestampType => LongUnsafeColumnWriter
       case t =>
         throw new UnsupportedOperationException(s"Do not know how to write columns of type $t")
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index 534dac1f92e89..1098962ddc018 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -197,9 +197,10 @@ class GenericMutableRow(v: Array[Any]) extends GenericRow(v) with MutableRow {
   override def setFloat(ordinal: Int, value: Float): Unit = { values(ordinal) = value }
   override def setInt(ordinal: Int, value: Int): Unit = { values(ordinal) = value }
   override def setLong(ordinal: Int, value: Long): Unit = { values(ordinal) = value }
-  override def setString(ordinal: Int, value: String) {
+  override def setString(ordinal: Int, value: String): Unit = {
     values(ordinal) = UTF8String.fromString(value)
   }
+
   override def setNullAt(i: Int): Unit = { values(i) = null }
 
   override def setShort(ordinal: Int, value: Short): Unit = { values(ordinal) = value }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index 577c7a0de0160..721ef8a22608c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -17,12 +17,14 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.sql.{Date, Timestamp}
 import java.util.Arrays
 
 import org.scalatest.Matchers
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.array.ByteArrayMethods
 
@@ -74,6 +76,34 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     unsafeRow.getString(2) should be ("World")
   }
 
+  test("basic conversion with primitive, string, date and timestamp types") {
+    val fieldTypes: Array[DataType] = Array(LongType, StringType, DateType, TimestampType)
+    val converter = new UnsafeRowConverter(fieldTypes)
+
+    val row = new SpecificMutableRow(fieldTypes)
+    row.setLong(0, 0)
+    row.setString(1, "Hello")
+    row.update(2, DateUtils.fromJavaDate(Date.valueOf("1970-01-01")))
+    row.update(3, DateUtils.fromJavaTimestamp(Timestamp.valueOf("2015-05-08 08:10:25")))
+
+    val sizeRequired: Int = converter.getSizeRequirement(row)
+    sizeRequired should be (8 + (8 * 4) +
+      ByteArrayMethods.roundNumberOfBytesToNearestWord("Hello".getBytes.length + 8))
+    val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
+    val numBytesWritten = converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET)
+    numBytesWritten should be (sizeRequired)
+
+    val unsafeRow = new UnsafeRow()
+    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, null)
+    unsafeRow.getLong(0) should be (0)
+    unsafeRow.getString(1) should be ("Hello")
+    // Date is represented as Int in unsafeRow
+    DateUtils.toJavaDate(unsafeRow.getInt(2)) should be (Date.valueOf("1970-01-01"))
+    // Timestamp is represented as Long in unsafeRow
+    DateUtils.toJavaTimestamp(unsafeRow.getLong(3)) should be
+      (Timestamp.valueOf("2015-05-08 08:10:25"))
+  }
+
   test("null handling") {
     val fieldTypes: Array[DataType] = Array(
       NullType,

From 6765ef98dff070768bbcd585d341ee7664fbe76c Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Wed, 17 Jun 2015 11:10:16 -0700
Subject: [PATCH 503/525] [SPARK-6390] [SQL] [MLlib] Port MatrixUDT to PySpark

MatrixUDT was recently coded in scala. This has been ported to PySpark

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6354 from MechCoder/spark-6390 and squashes the following commits:

fc4dc1e [MechCoder] Better error message
c940a44 [MechCoder] Added test
aa9c391 [MechCoder] Add pyUDT to MatrixUDT
62a2a7d [MechCoder] [SPARK-6390] Port MatrixUDT to PySpark
---
 .../apache/spark/mllib/linalg/Matrices.scala  |  2 +
 python/pyspark/mllib/linalg.py                | 59 ++++++++++++++++++-
 python/pyspark/mllib/tests.py                 | 34 ++++++++++-
 python/pyspark/sql/dataframe.py               |  6 +-
 4 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 9584da8e3a0f9..85e63b1382b5e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -197,6 +197,8 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
 
   override def typeName: String = "matrix"
 
+  override def pyUDT: String = "pyspark.mllib.linalg.MatrixUDT"
+
   private[spark] override def asNullable: MatrixUDT = this
 }
 
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 23d1a79ffe511..e96c5ef87df86 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -36,7 +36,7 @@
 import numpy as np
 
 from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
-    IntegerType, ByteType
+    IntegerType, ByteType, BooleanType
 
 
 __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
@@ -163,6 +163,59 @@ def simpleString(self):
         return "vector"
 
 
+class MatrixUDT(UserDefinedType):
+    """
+    SQL user-defined type (UDT) for Matrix.
+    """
+
+    @classmethod
+    def sqlType(cls):
+        return StructType([
+            StructField("type", ByteType(), False),
+            StructField("numRows", IntegerType(), False),
+            StructField("numCols", IntegerType(), False),
+            StructField("colPtrs", ArrayType(IntegerType(), False), True),
+            StructField("rowIndices", ArrayType(IntegerType(), False), True),
+            StructField("values", ArrayType(DoubleType(), False), True),
+            StructField("isTransposed", BooleanType(), False)])
+
+    @classmethod
+    def module(cls):
+        return "pyspark.mllib.linalg"
+
+    @classmethod
+    def scalaUDT(cls):
+        return "org.apache.spark.mllib.linalg.MatrixUDT"
+
+    def serialize(self, obj):
+        if isinstance(obj, SparseMatrix):
+            colPtrs = [int(i) for i in obj.colPtrs]
+            rowIndices = [int(i) for i in obj.rowIndices]
+            values = [float(v) for v in obj.values]
+            return (0, obj.numRows, obj.numCols, colPtrs,
+                    rowIndices, values, bool(obj.isTransposed))
+        elif isinstance(obj, DenseMatrix):
+            values = [float(v) for v in obj.values]
+            return (1, obj.numRows, obj.numCols, None, None, values,
+                    bool(obj.isTransposed))
+        else:
+            raise TypeError("cannot serialize type %r" % (type(obj)))
+
+    def deserialize(self, datum):
+        assert len(datum) == 7, \
+            "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum)
+        tpe = datum[0]
+        if tpe == 0:
+            return SparseMatrix(*datum[1:])
+        elif tpe == 1:
+            return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
+        else:
+            raise ValueError("do not recognize type %r" % tpe)
+
+    def simpleString(self):
+        return "matrix"
+
+
 class Vector(object):
 
     __UDT__ = VectorUDT()
@@ -781,10 +834,12 @@ def zeros(size):
 
 
 class Matrix(object):
+
+    __UDT__ = MatrixUDT()
+
     """
     Represents a local matrix.
     """
-
     def __init__(self, numRows, numCols, isTransposed=False):
         self.numRows = numRows
         self.numCols = numCols
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 36a4c7a5408c6..f4c997261ef4e 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -39,7 +39,7 @@
 from pyspark import SparkContext
 from pyspark.mllib.common import _to_java_object_rdd
 from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\
-    DenseMatrix, SparseMatrix, Vectors, Matrices
+    DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.random import RandomRDDs
 from pyspark.mllib.stat import Statistics
@@ -507,6 +507,38 @@ def test_infer_schema(self):
                 raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
 
 
+class MatrixUDTTests(MLlibTestCase):
+
+    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
+    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
+    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
+    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
+    udt = MatrixUDT()
+
+    def test_json_schema(self):
+        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)
+
+    def test_serialization(self):
+        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
+            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))
+
+    def test_infer_schema(self):
+        sqlCtx = SQLContext(self.sc)
+        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
+        df = rdd.toDF()
+        schema = df.schema
+        self.assertTrue(schema.fields[1].dataType, self.udt)
+        matrices = df.map(lambda x: x._2).collect()
+        self.assertEqual(len(matrices), 2)
+        for m in matrices:
+            if isinstance(m, DenseMatrix):
+                self.assertTrue(m, self.dm1)
+            elif isinstance(m, SparseMatrix):
+                self.assertTrue(m, self.sm1)
+            else:
+                raise ValueError("Expected a matrix but got type %r" % type(m))
+
+
 @unittest.skipIf(not _have_scipy, "SciPy not installed")
 class SciPyTests(MLlibTestCase):
 
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 9615e576497cd..152b87351db31 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -194,7 +194,11 @@ def schema(self):
         StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
         """
         if self._schema is None:
-            self._schema = _parse_datatype_json_string(self._jdf.schema().json())
+            try:
+                self._schema = _parse_datatype_json_string(self._jdf.schema().json())
+            except AttributeError as e:
+                raise Exception(
+                    "Unable to parse datatype from schema. %s" % e)
         return self._schema
 
     @since(1.3)

From 50a0496a43f09d70593419efc38587c8441843bf Mon Sep 17 00:00:00 2001
From: Brennon York <brennon.york@capitalone.com>
Date: Wed, 17 Jun 2015 12:00:34 -0700
Subject: [PATCH 504/525] [SPARK-7017] [BUILD] [PROJECT INFRA] Refactor
 dev/run-tests into Python

All, this is a first attempt at refactoring `dev/run-tests` into Python. Initially I merely converted all Bash calls over to Python, then moved to a much more modular approach (more functions, moved the calls around, etc.). What is here is the initial culmination and should provide a great base to various downstream issues (e.g. SPARK-7016, modularize / parallelize testing, etc.). Would love comments / suggestions for this initial first step!

/cc srowen pwendell nchammas

Author: Brennon York <brennon.york@capitalone.com>

Closes #5694 from brennonyork/SPARK-7017 and squashes the following commits:

154ed73 [Brennon York] updated finding java binary if JAVA_HOME not set
3922a85 [Brennon York] removed necessary passed in variable
f9fbe54 [Brennon York] reverted doc test change
8135518 [Brennon York] removed the test check for documentation changes until jenkins can get updated
05d435b [Brennon York] added check for jekyll install
22edb78 [Brennon York] add check if jekyll isn't installed on the path
2dff136 [Brennon York] fixed pep8 whitespace errors
767a668 [Brennon York] fixed path joining issues, ensured docs actually build on doc changes
c42cf9a [Brennon York] unpack set operations with splat (*)
fb85a41 [Brennon York] fixed minor set bug
0379833 [Brennon York] minor doc addition to print the changed modules
aa03d9e [Brennon York] added documentation builds as a top level test component, altered high level project changes to properly execute core tests only when necessary, changed variable names for simplicity
ec1ae78 [Brennon York] minor name changes, bug fixes
b7c72b9 [Brennon York] reverting streaming context
03fdd7b [Brennon York] fixed the tuple () wraps around example lambda
705d12e [Brennon York] changed example to comply with pep3113 supporting python3
60b3d51 [Brennon York] prepend rather than append onto PATH
7d2f5e2 [Brennon York] updated python tests to remove unused variable
2898717 [Brennon York] added a change to streaming test to check if it only runs streaming tests
eb684b6 [Brennon York] fixed sbt_test_goals reference error
db7ae6f [Brennon York] reverted SPARK_HOME from start of command
1ecca26 [Brennon York] fixed merge conflicts
2fcdfc0 [Brennon York] testing targte branch dump on jenkins
1f607b1 [Brennon York] finalizing revisions to modular tests
8afbe93 [Brennon York] made error codes a global
0629de8 [Brennon York] updated to refactor and remove various small bugs, removed pep8 complaints
d90ab2d [Brennon York] fixed merge conflicts, ensured that for regular builds both core and sql tests always run
b1248dc [Brennon York] exec python rather than running python and exiting with return code
f9deba1 [Brennon York] python to python2 and removed newline
6d0a052 [Brennon York] incorporated merge conflicts with SPARK-7249
f950010 [Brennon York] removed building hive-0.12.0 per SPARK-6908
703f095 [Brennon York] fixed merge conflicts
b1ca593 [Brennon York] reverted the sparkR test
afeb093 [Brennon York] updated to make sparkR test fail
1dada6b [Brennon York] reverted pyspark test failure
9a592ec [Brennon York] reverted mima exclude issue, added pyspark test failure
d825aa4 [Brennon York] revert build break, add mima break
f041d8a [Brennon York] added space from commented import to now test build breaking
983f2a2 [Brennon York] comment out import to fail build test
2386785 [Brennon York] Merge remote-tracking branch 'upstream/master' into SPARK-7017
76335fb [Brennon York] reverted rat license issue for sparkconf
e4a96cc [Brennon York] removed the import error and added license error, fixed the way run-tests and run-tests.py report their error codes
56d3cb9 [Brennon York] changed test back and commented out import to break compile
b37328c [Brennon York] fixed typo and added default return is no error block was found in the environment
7613558 [Brennon York] updated to return the proper env variable for return codes
a5bd445 [Brennon York] reverted license, changed test in shuffle to fail
803143a [Brennon York] removed license file for SparkContext
b0b2604 [Brennon York] comment out import to see if build fails and returns properly
83e80ef [Brennon York] attempt at better python output when called from bash
c095fa6 [Brennon York] removed another wait() call
26e18e8 [Brennon York] removed unnecessary wait()
07210a9 [Brennon York] minor doc string change for java version with namedtuple update
ec03bf3 [Brennon York] added namedtuple for java version to add readability
2cb413b [Brennon York] upcased global variables, changes various calling methods from check_output to check_call
639f1e9 [Brennon York] updated with pep8 rules, fixed minor bugs, added run-tests file in bash to call the run-tests.py script
3c53a1a [Brennon York] uncomment the scala tests :)
6126c4f [Brennon York] refactored run-tests into python
---
 dev/run-tests                      | 219 +-----------
 dev/run-tests-codes.sh             |  11 +-
 dev/run-tests-jenkins              |   2 +
 dev/run-tests.py                   | 536 +++++++++++++++++++++++++++++
 examples/src/main/python/kmeans.py |   2 +-
 5 files changed, 546 insertions(+), 224 deletions(-)
 create mode 100755 dev/run-tests.py

diff --git a/dev/run-tests b/dev/run-tests
index d178e2a4601ea..a00d9f0c27639 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -17,224 +17,7 @@
 # limitations under the License.
 #
 
-# Go to the Spark project root directory
 FWDIR="$(cd "`dirname $0`"/..; pwd)"
 cd "$FWDIR"
 
-# Clean up work directory and caches
-rm -rf ./work
-rm -rf ~/.ivy2/local/org.apache.spark
-rm -rf ~/.ivy2/cache/org.apache.spark
-
-source "$FWDIR/dev/run-tests-codes.sh"
-
-CURRENT_BLOCK=$BLOCK_GENERAL
-
-function handle_error () {
-  echo "[error] Got a return code of $? on line $1 of the run-tests script."
-  exit $CURRENT_BLOCK
-}
-
-
-# Build against the right version of Hadoop.
-{
-  if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then
-    if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then
-      export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=1.2.1"
-    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then
-      export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=2.0.0-mr1-cdh4.1.1"
-    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then
-      export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.2"
-    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then
-      export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
-    fi
-  fi
-
-  if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then
-    export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
-  fi
-}
-
-export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl"
-
-# Determine Java path and version.
-{
-  if test -x "$JAVA_HOME/bin/java"; then
-      declare java_cmd="$JAVA_HOME/bin/java"
-  else
-      declare java_cmd=java
-  fi
-
-  # We can't use sed -r -e due to OS X / BSD compatibility; hence, all the parentheses.
-  JAVA_VERSION=$(
-    $java_cmd -version 2>&1 \
-    | grep -e "^java version" --max-count=1 \
-    | sed "s/java version \"\(.*\)\.\(.*\)\.\(.*\)\"/\1\2/"
-  )
-
-  if [ "$JAVA_VERSION" -lt 18 ]; then
-    echo "[warn] Java 8 tests will not run because JDK version is < 1.8."
-  fi
-}
-
-# Only run Hive tests if there are SQL changes.
-# Partial solution for SPARK-1455.
-if [ -n "$AMPLAB_JENKINS" ]; then
-  target_branch="$ghprbTargetBranch"
-  git fetch origin "$target_branch":"$target_branch"
-
-  # AMP_JENKINS_PRB indicates if the current build is a pull request build.
-  if [ -n "$AMP_JENKINS_PRB" ]; then
-    # It is a pull request build.
-    sql_diffs=$(
-      git diff --name-only "$target_branch" \
-      | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
-    )
-
-    non_sql_diffs=$(
-      git diff --name-only "$target_branch" \
-      | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
-    )
-
-    if [ -n "$sql_diffs" ]; then
-      echo "[info] Detected changes in SQL. Will run Hive test suite."
-      _RUN_SQL_TESTS=true
-
-      if [ -z "$non_sql_diffs" ]; then
-        echo "[info] Detected no changes except in SQL. Will only run SQL tests."
-        _SQL_TESTS_ONLY=true
-      fi
-    fi
-  else
-    # It is a regular build. We should run SQL tests.
-    _RUN_SQL_TESTS=true
-  fi
-fi
-
-set -o pipefail
-trap 'handle_error $LINENO' ERR
-
-echo ""
-echo "========================================================================="
-echo "Running Apache RAT checks"
-echo "========================================================================="
-
-CURRENT_BLOCK=$BLOCK_RAT
-
-./dev/check-license
-
-echo ""
-echo "========================================================================="
-echo "Running Scala style checks"
-echo "========================================================================="
-
-CURRENT_BLOCK=$BLOCK_SCALA_STYLE
-
-./dev/lint-scala
-
-echo ""
-echo "========================================================================="
-echo "Running Python style checks"
-echo "========================================================================="
-
-CURRENT_BLOCK=$BLOCK_PYTHON_STYLE
-
-./dev/lint-python
-
-echo ""
-echo "========================================================================="
-echo "Building Spark"
-echo "========================================================================="
-
-CURRENT_BLOCK=$BLOCK_BUILD
-
-{
-  HIVE_BUILD_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver"
-  echo "[info] Compile with Hive 0.13.1"
-  [ -d "lib_managed" ] && rm -rf lib_managed
-  echo "[info] Building Spark with these arguments: $HIVE_BUILD_ARGS"
-
-  if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then
-    build/mvn $HIVE_BUILD_ARGS clean package -DskipTests
-  else
-    echo -e "q\n" \
-      | build/sbt $HIVE_BUILD_ARGS package assembly/assembly streaming-kafka-assembly/assembly \
-      | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
-  fi
-}
-
-echo ""
-echo "========================================================================="
-echo "Detecting binary incompatibilities with MiMa"
-echo "========================================================================="
-
-CURRENT_BLOCK=$BLOCK_MIMA
-
-./dev/mima
-
-echo ""
-echo "========================================================================="
-echo "Running Spark unit tests"
-echo "========================================================================="
-
-CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS
-
-{
-  # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled.
-  # This must be a single argument, as it is.
-  if [ -n "$_RUN_SQL_TESTS" ]; then
-    SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver"
-  fi
-
-  if [ -n "$_SQL_TESTS_ONLY" ]; then
-    # This must be an array of individual arguments. Otherwise, having one long string
-    # will be interpreted as a single test, which doesn't work.
-    SBT_MAVEN_TEST_ARGS=("catalyst/test" "sql/test" "hive/test" "hive-thriftserver/test" "mllib/test")
-  else
-    SBT_MAVEN_TEST_ARGS=("test")
-  fi
-
-  echo "[info] Running Spark tests with these arguments: $SBT_MAVEN_PROFILES_ARGS ${SBT_MAVEN_TEST_ARGS[@]}"
-
-  if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then
-    build/mvn test $SBT_MAVEN_PROFILES_ARGS --fail-at-end
-  else
-    # NOTE: echo "q" is needed because sbt on encountering a build file with failure
-    # (either resolution or compilation) prompts the user for input either q, r, etc
-    # to quit or retry. This echo is there to make it not block.
-    # NOTE: Do not quote $SBT_MAVEN_PROFILES_ARGS or else it will be interpreted as a
-    # single argument!
-    # "${SBT_MAVEN_TEST_ARGS[@]}" is cool because it's an array.
-    # QUESTION: Why doesn't 'yes "q"' work?
-    # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work?
-    echo -e "q\n" \
-      | build/sbt $SBT_MAVEN_PROFILES_ARGS "${SBT_MAVEN_TEST_ARGS[@]}" \
-      | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
-  fi
-}
-
-echo ""
-echo "========================================================================="
-echo "Running PySpark tests"
-echo "========================================================================="
-
-CURRENT_BLOCK=$BLOCK_PYSPARK_UNIT_TESTS
-
-# add path for python 3 in jenkins
-export PATH="${PATH}:/home/anaconda/envs/py3k/bin"
-./python/run-tests
-
-echo ""
-echo "========================================================================="
-echo "Running SparkR tests"
-echo "========================================================================="
-
-CURRENT_BLOCK=$BLOCK_SPARKR_UNIT_TESTS
-
-if [ $(command -v R) ]; then
-  ./R/install-dev.sh
-  ./R/run-tests.sh
-else
-  echo "Ignoring SparkR tests as R was not found in PATH"
-fi
-
+exec python -u ./dev/run-tests.py
diff --git a/dev/run-tests-codes.sh b/dev/run-tests-codes.sh
index 154e01255b2ef..f4b238e1b78a7 100644
--- a/dev/run-tests-codes.sh
+++ b/dev/run-tests-codes.sh
@@ -21,8 +21,9 @@ readonly BLOCK_GENERAL=10
 readonly BLOCK_RAT=11
 readonly BLOCK_SCALA_STYLE=12
 readonly BLOCK_PYTHON_STYLE=13
-readonly BLOCK_BUILD=14
-readonly BLOCK_MIMA=15
-readonly BLOCK_SPARK_UNIT_TESTS=16
-readonly BLOCK_PYSPARK_UNIT_TESTS=17
-readonly BLOCK_SPARKR_UNIT_TESTS=18
+readonly BLOCK_DOCUMENTATION=14
+readonly BLOCK_BUILD=15
+readonly BLOCK_MIMA=16
+readonly BLOCK_SPARK_UNIT_TESTS=17
+readonly BLOCK_PYSPARK_UNIT_TESTS=18
+readonly BLOCK_SPARKR_UNIT_TESTS=19
diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins
index 641b0ff3c4be4..c4d39d95d5890 100755
--- a/dev/run-tests-jenkins
+++ b/dev/run-tests-jenkins
@@ -210,6 +210,8 @@ done
       failing_test="Scala style tests"
     elif [ "$test_result" -eq "$BLOCK_PYTHON_STYLE" ]; then
       failing_test="Python style tests"
+    elif [ "$test_result" -eq "$BLOCK_DOCUMENTATION" ]; then
+      failing_test="to generate documentation"
     elif [ "$test_result" -eq "$BLOCK_BUILD" ]; then
       failing_test="to build"
     elif [ "$test_result" -eq "$BLOCK_MIMA" ]; then
diff --git a/dev/run-tests.py b/dev/run-tests.py
new file mode 100755
index 0000000000000..04a7b45741963
--- /dev/null
+++ b/dev/run-tests.py
@@ -0,0 +1,536 @@
+#!/usr/bin/env python2
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import re
+import sys
+import shutil
+import subprocess
+from collections import namedtuple
+
+SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
+USER_HOME = os.environ.get("HOME")
+
+
+def get_error_codes(err_code_file):
+    """Function to retrieve all block numbers from the `run-tests-codes.sh`
+    file to maintain backwards compatibility with the `run-tests-jenkins`
+    script"""
+
+    with open(err_code_file, 'r') as f:
+        err_codes = [e.split()[1].strip().split('=')
+                     for e in f if e.startswith("readonly")]
+        return dict(err_codes)
+
+
+ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh"))
+
+
+def exit_from_command_with_retcode(cmd, retcode):
+    print "[error] running", cmd, "; received return code", retcode
+    sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
+
+
+def rm_r(path):
+    """Given an arbitrary path properly remove it with the correct python
+    construct if it exists
+    - from: http://stackoverflow.com/a/9559881"""
+
+    if os.path.isdir(path):
+        shutil.rmtree(path)
+    elif os.path.exists(path):
+        os.remove(path)
+
+
+def run_cmd(cmd):
+    """Given a command as a list of arguments will attempt to execute the
+    command from the determined SPARK_HOME directory and, on failure, print
+    an error message"""
+
+    if not isinstance(cmd, list):
+        cmd = cmd.split()
+    try:
+        subprocess.check_call(cmd)
+    except subprocess.CalledProcessError as e:
+        exit_from_command_with_retcode(e.cmd, e.returncode)
+
+
+def is_exe(path):
+    """Check if a given path is an executable file
+    - from: http://stackoverflow.com/a/377028"""
+
+    return os.path.isfile(path) and os.access(path, os.X_OK)
+
+
+def which(program):
+    """Find and return the given program by its absolute path or 'None'
+    - from: http://stackoverflow.com/a/377028"""
+
+    fpath, fname = os.path.split(program)
+
+    if fpath:
+        if is_exe(program):
+            return program
+    else:
+        for path in os.environ.get("PATH").split(os.pathsep):
+            path = path.strip('"')
+            exe_file = os.path.join(path, program)
+            if is_exe(exe_file):
+                return exe_file
+    return None
+
+
+def determine_java_executable():
+    """Will return the path of the java executable that will be used by Spark's
+    tests or `None`"""
+
+    # Any changes in the way that Spark's build detects java must be reflected
+    # here. Currently the build looks for $JAVA_HOME/bin/java then falls back to
+    # the `java` executable on the path
+
+    java_home = os.environ.get("JAVA_HOME")
+
+    # check if there is an executable at $JAVA_HOME/bin/java
+    java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
+    # if the java_exe wasn't set, check for a `java` version on the $PATH
+    return java_exe if java_exe else which("java")
+
+
+JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update'])
+
+
+def determine_java_version(java_exe):
+    """Given a valid java executable will return its version in named tuple format
+    with accessors '.major', '.minor', '.patch', '.update'"""
+
+    raw_output = subprocess.check_output([java_exe, "-version"],
+                                         stderr=subprocess.STDOUT)
+    raw_version_str = raw_output.split('\n')[0]  # eg 'java version "1.8.0_25"'
+    version_str = raw_version_str.split()[-1].strip('"')  # eg '1.8.0_25'
+    version, update = version_str.split('_')  # eg ['1.8.0', '25']
+
+    # map over the values and convert them to integers
+    version_info = [int(x) for x in version.split('.') + [update]]
+
+    return JavaVersion(major=version_info[0],
+                       minor=version_info[1],
+                       patch=version_info[2],
+                       update=version_info[3])
+
+
+def set_title_and_block(title, err_block):
+    os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block]
+    line_str = '=' * 72
+
+    print
+    print line_str
+    print title
+    print line_str
+
+
+def run_apache_rat_checks():
+    set_title_and_block("Running Apache RAT checks", "BLOCK_RAT")
+    run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")])
+
+
+def run_scala_style_checks():
+    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
+    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")])
+
+
+def run_python_style_checks():
+    set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
+    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
+
+
+def build_spark_documentation():
+    set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
+    os.environ["PRODUCTION"] = "1 jekyll build"
+
+    os.chdir(os.path.join(SPARK_HOME, "docs"))
+
+    jekyll_bin = which("jekyll")
+
+    if not jekyll_bin:
+        print "[error] Cannot find a version of `jekyll` on the system; please",
+        print "install one and retry to build documentation."
+        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
+    else:
+        run_cmd([jekyll_bin, "build"])
+
+    os.chdir(SPARK_HOME)
+
+
+def exec_maven(mvn_args=[]):
+    """Will call Maven in the current directory with the list of mvn_args passed
+    in and returns the subprocess for any further processing"""
+
+    run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args)
+
+
+def exec_sbt(sbt_args=[]):
+    """Will call SBT in the current directory with the list of mvn_args passed
+    in and returns the subprocess for any further processing"""
+
+    sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args
+
+    sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" +
+                                   "^.*[warn].*Merging" + "|" +
+                                   "^.*[info].*Including")
+
+    # NOTE: echo "q" is needed because sbt on encountering a build file
+    # with failure (either resolution or compilation) prompts the user for
+    # input either q, r, etc to quit or retry. This echo is there to make it
+    # not block.
+    echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE)
+    sbt_proc = subprocess.Popen(sbt_cmd,
+                                stdin=echo_proc.stdout,
+                                stdout=subprocess.PIPE)
+    echo_proc.wait()
+    for line in iter(sbt_proc.stdout.readline, ''):
+        if not sbt_output_filter.match(line):
+            print line,
+    retcode = sbt_proc.wait()
+
+    if retcode > 0:
+        exit_from_command_with_retcode(sbt_cmd, retcode)
+
+
+def get_hadoop_profiles(hadoop_version):
+    """Return a list of profiles indicating which Hadoop version to use from
+    a Hadoop version tag."""
+
+    sbt_maven_hadoop_profiles = {
+        "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"],
+        "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"],
+        "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"],
+        "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"],
+    }
+
+    if hadoop_version in sbt_maven_hadoop_profiles:
+        return sbt_maven_hadoop_profiles[hadoop_version]
+    else:
+        print "[error] Could not find", hadoop_version, "in the list. Valid options",
+        print "are", sbt_maven_hadoop_profiles.keys()
+        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
+
+
+def get_build_profiles(hadoop_version="hadoop2.3",
+                       enable_base_profiles=True,
+                       enable_hive_profiles=False,
+                       enable_doc_profiles=False):
+    """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile
+    key with the option of adding on the base and hive profiles."""
+
+    base_profiles = ["-Pkinesis-asl"]
+    hive_profiles = ["-Phive", "-Phive-thriftserver"]
+    doc_profiles = []
+    hadoop_profiles = get_hadoop_profiles(hadoop_version)
+
+    build_profiles = hadoop_profiles
+
+    if enable_base_profiles:
+        build_profiles += base_profiles
+
+    if enable_hive_profiles:
+        build_profiles += hive_profiles
+
+    if enable_doc_profiles:
+        build_profiles += doc_profiles
+
+    return build_profiles
+
+
+def build_spark_maven(hadoop_version):
+    # we always build with Hive support even if we skip Hive tests in most builds
+    build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True)
+    mvn_goals = ["clean", "package", "-DskipTests"]
+    profiles_and_goals = build_profiles + mvn_goals
+
+    print "[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:",
+    print " ".join(profiles_and_goals)
+
+    exec_maven(profiles_and_goals)
+
+
+def build_spark_sbt(hadoop_version):
+    build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True)
+    sbt_goals = ["package",
+                 "assembly/assembly",
+                 "streaming-kafka-assembly/assembly"]
+    profiles_and_goals = build_profiles + sbt_goals
+
+    print "[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:",
+    print " ".join(profiles_and_goals)
+
+    exec_sbt(profiles_and_goals)
+
+
+def build_apache_spark(build_tool, hadoop_version):
+    """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or
+    `maven`). Defaults to using `sbt`."""
+
+    set_title_and_block("Building Spark", "BLOCK_BUILD")
+
+    rm_r("lib_managed")
+
+    if build_tool == "maven":
+        build_spark_maven(hadoop_version)
+    else:
+        build_spark_sbt(hadoop_version)
+
+
+def detect_binary_inop_with_mima():
+    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
+    run_cmd([os.path.join(SPARK_HOME, "dev", "mima")])
+
+
+def identify_changed_modules(test_env):
+    """Given the passed in environment will determine the changed modules and
+    return them as a set. If the environment is local, will simply run all tests.
+    If run under the `amplab_jenkins` environment will determine the changed files
+    as compared to the `ghprbTargetBranch` and execute the necessary set of tests
+    to provide coverage for the changed code."""
+    changed_modules = set()
+
+    if test_env == "amplab_jenkins":
+        target_branch = os.environ["ghprbTargetBranch"]
+
+        run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
+
+        raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch])
+        # remove any empty strings
+        changed_files = [f for f in raw_output.split('\n') if f]
+
+        sql_files = [f for f in changed_files
+                     if any(f.startswith(p) for p in
+                            ["sql/",
+                             "bin/spark-sql",
+                             "sbin/start-thriftserver.sh",
+                             "examples/src/main/java/org/apache/spark/examples/sql/",
+                             "examples/src/main/scala/org/apache/spark/examples/sql/"])]
+        mllib_files = [f for f in changed_files
+                       if any(f.startswith(p) for p in
+                              ["examples/src/main/java/org/apache/spark/examples/mllib/",
+                               "examples/src/main/scala/org/apache/spark/examples/mllib",
+                               "data/mllib/",
+                               "mllib/"])]
+        streaming_files = [f for f in changed_files
+                           if any(f.startswith(p) for p in
+                                  ["examples/scala-2.10/",
+                                   "examples/src/main/java/org/apache/spark/examples/streaming/",
+                                   "examples/src/main/scala/org/apache/spark/examples/streaming/",
+                                   "external/",
+                                   "extras/java8-tests/",
+                                   "extras/kinesis-asl/",
+                                   "streaming/"])]
+        graphx_files = [f for f in changed_files
+                        if any(f.startswith(p) for p in
+                               ["examples/src/main/scala/org/apache/spark/examples/graphx/",
+                                "graphx/"])]
+        doc_files = [f for f in changed_files if f.startswith("docs/")]
+
+        # union together all changed top level project files
+        top_level_project_files = set().union(*[set(f) for f in [sql_files,
+                                                                 mllib_files,
+                                                                 streaming_files,
+                                                                 graphx_files,
+                                                                 doc_files]])
+        changed_core_files = set(changed_files).difference(top_level_project_files)
+
+        if changed_core_files:
+            changed_modules.add("CORE")
+        if sql_files:
+            print "[info] Detected changes in SQL. Will run Hive test suite."
+            changed_modules.add("SQL")
+        if mllib_files:
+            print "[info] Detected changes in MLlib. Will run MLlib test suite."
+            changed_modules.add("MLLIB")
+        if streaming_files:
+            print "[info] Detected changes in Streaming. Will run Streaming test suite."
+            changed_modules.add("STREAMING")
+        if graphx_files:
+            print "[info] Detected changes in GraphX. Will run GraphX test suite."
+            changed_modules.add("GRAPHX")
+        if doc_files:
+            print "[info] Detected changes in documentation. Will build spark with documentation."
+            changed_modules.add("DOCS")
+
+        return changed_modules
+    else:
+        # we aren't in the Amplab environment so simply run all tests
+        changed_modules.add("ALL")
+        return changed_modules
+
+
+def run_scala_tests_maven(test_profiles):
+    mvn_test_goals = ["test", "--fail-at-end"]
+    profiles_and_goals = test_profiles + mvn_test_goals
+
+    print "[info] Running Spark tests using Maven with these arguments:",
+    print " ".join(profiles_and_goals)
+
+    exec_maven(profiles_and_goals)
+
+
+def run_scala_tests_sbt(test_modules, test_profiles):
+    # declare the variable for reference
+    sbt_test_goals = None
+
+    if "ALL" in test_modules:
+        sbt_test_goals = ["test"]
+    else:
+        # if we only have changes in SQL, MLlib, Streaming, or GraphX then build
+        # a custom test list
+        if "SQL" in test_modules and "CORE" not in test_modules:
+            sbt_test_goals = ["catalyst/test",
+                              "sql/test",
+                              "hive/test",
+                              "hive-thriftserver/test",
+                              "mllib/test",
+                              "examples/test"]
+        if "MLLIB" in test_modules and "CORE" not in test_modules:
+            sbt_test_goals += ["mllib/test", "examples/test"]
+        if "STREAMING" in test_modules and "CORE" not in test_modules:
+            sbt_test_goals += ["streaming/test",
+                               "streaming-flume/test",
+                               "streaming-flume-sink/test",
+                               "streaming-kafka/test",
+                               "streaming-mqtt/test",
+                               "streaming-twitter/test",
+                               "streaming-zeromq/test",
+                               "examples/test"]
+        if "GRAPHX" in test_modules and "CORE" not in test_modules:
+            sbt_test_goals += ["graphx/test", "examples/test"]
+        if not sbt_test_goals:
+            sbt_test_goals = ["test"]
+
+    profiles_and_goals = test_profiles + sbt_test_goals
+
+    print "[info] Running Spark tests using SBT with these arguments:",
+    print " ".join(profiles_and_goals)
+
+    exec_sbt(profiles_and_goals)
+
+
+def run_scala_tests(build_tool, hadoop_version, test_modules):
+    """Function to properly execute all tests passed in as a set from the
+    `determine_test_suites` function"""
+    set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS")
+
+    test_modules = set(test_modules)
+
+    hive_profiles = ("SQL" in test_modules)
+    test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles)
+
+    if build_tool == "maven":
+        run_scala_tests_maven(test_profiles)
+    else:
+        run_scala_tests_sbt(test_modules, test_profiles)
+
+
+def run_python_tests():
+    set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
+
+    run_cmd([os.path.join(SPARK_HOME, "python", "run-tests")])
+
+
+def run_sparkr_tests():
+    set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")
+
+    if which("R"):
+        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
+        run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
+    else:
+        print "Ignoring SparkR tests as R was not found in PATH"
+
+
+def main():
+    # Ensure the user home directory (HOME) is valid and is an absolute directory
+    if not USER_HOME or not os.path.isabs(USER_HOME):
+        print "[error] Cannot determine your home directory as an absolute path;",
+        print "ensure the $HOME environment variable is set properly."
+        sys.exit(1)
+
+    os.chdir(SPARK_HOME)
+
+    rm_r(os.path.join(SPARK_HOME, "work"))
+    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
+    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))
+
+    os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"]
+
+    java_exe = determine_java_executable()
+
+    if not java_exe:
+        print "[error] Cannot find a version of `java` on the system; please",
+        print "install one and retry."
+        sys.exit(2)
+
+    java_version = determine_java_version(java_exe)
+
+    if java_version.minor < 8:
+        print "[warn] Java 8 tests will not run because JDK version is < 1.8."
+
+    if os.environ.get("AMPLAB_JENKINS"):
+        # if we're on the Amplab Jenkins build servers setup variables
+        # to reflect the environment settings
+        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
+        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3")
+        test_env = "amplab_jenkins"
+        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
+        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
+    else:
+        # else we're running locally and can use local settings
+        build_tool = "sbt"
+        hadoop_version = "hadoop2.3"
+        test_env = "local"
+
+    print "[info] Using build tool", build_tool, "with profile", hadoop_version,
+    print "under environment", test_env
+
+    # determine high level changes
+    changed_modules = identify_changed_modules(test_env)
+    print "[info] Found the following changed modules:", ", ".join(changed_modules)
+
+    # license checks
+    run_apache_rat_checks()
+
+    # style checks
+    run_scala_style_checks()
+    run_python_style_checks()
+
+    # determine if docs were changed and if we're inside the amplab environment
+    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
+    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
+    #    build_spark_documentation()
+
+    # spark build
+    build_apache_spark(build_tool, hadoop_version)
+
+    # backwards compatibility checks
+    detect_binary_inop_with_mima()
+
+    # run the test suites
+    run_scala_tests(build_tool, hadoop_version, changed_modules)
+    run_python_tests()
+    run_sparkr_tests()
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py
index 1456c87312841..0ea7cfb7025a0 100755
--- a/examples/src/main/python/kmeans.py
+++ b/examples/src/main/python/kmeans.py
@@ -68,7 +68,7 @@ def closestPoint(p, centers):
         closest = data.map(
             lambda p: (closestPoint(p, kPoints), (p, 1)))
         pointStats = closest.reduceByKey(
-            lambda (p1, c1), (p2, c2): (p1 + p2, c1 + c2))
+            lambda p1_c1, p2_c2: (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1]))
         newPoints = pointStats.map(
             lambda st: (st[0], st[1][0] / st[1][1])).collect()
 

From 0c1b2df043fde9ac9f28a5f348ee96ce124f2c6b Mon Sep 17 00:00:00 2001
From: Michael Davies <Michael.BellDavies@gmail.com>
Date: Wed, 17 Jun 2015 12:56:55 -0700
Subject: [PATCH 505/525] [SPARK-8077] [SQL] Optimization for TreeNodes with
 large numbers of children

For example large IN clauses

Large IN clauses are parsed very slowly. For example SQL below (10K items in IN) takes 45-50s.

s"""SELECT * FROM Person WHERE ForeName IN ('${(1 to 10000).map("n" + _).mkString("','")}')"""

This is principally due to TreeNode which repeatedly call contains on children, where children in this case is a List that is 10K long. In effect parsing for large IN clauses is O(N squared).
A lazily initialised Set based on children for contains reduces parse time to around 2.5s

Author: Michael Davies <Michael.BellDavies@gmail.com>

Closes #6673 from MickDavies/SPARK-8077 and squashes the following commits:

38cd425 [Michael Davies] SPARK-8077: Optimization for  TreeNodes with large numbers of children
d80103b [Michael Davies] SPARK-8077: Optimization for  TreeNodes with large numbers of children
e6be8be [Michael Davies] SPARK-8077: Optimization for  TreeNodes with large numbers of children
---
 .../catalyst/plans/logical/LogicalPlan.scala  |  2 +-
 .../spark/sql/catalyst/trees/TreeNode.scala   | 27 +++++++++++--------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index dba69659afc80..c8c6676f24c17 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -90,7 +90,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
     val input = children.flatMap(_.output)
     productIterator.map {
       // Children are checked using sameResult above.
-      case tn: TreeNode[_] if children contains tn => null
+      case tn: TreeNode[_] if containsChild(tn) => null
       case e: Expression => BindReferences.bindReference(e, input, allowFailures = true)
       case s: Option[_] => s.map {
         case e: Expression => BindReferences.bindReference(e, input, allowFailures = true)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 5964e3dc3d77e..f304597bc978e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -59,9 +59,14 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
 
   val origin: Origin = CurrentOrigin.get
 
-  /** Returns a Seq of the children of this node */
+  /**
+   * Returns a Seq of the children of this node.
+   * Children should not change. Immutability required for containsChild optimization
+   */
   def children: Seq[BaseType]
 
+  lazy val containsChild: Set[TreeNode[_]] = children.toSet
+
   /**
    * Faster version of equality which short-circuits when two treeNodes are the same instance.
    * We don't just override Object.equals, as doing so prevents the scala compiler from
@@ -147,7 +152,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
   def mapChildren(f: BaseType => BaseType): this.type = {
     var changed = false
     val newArgs = productIterator.map {
-      case arg: TreeNode[_] if children contains arg =>
+      case arg: TreeNode[_] if containsChild(arg) =>
         val newChild = f(arg.asInstanceOf[BaseType])
         if (newChild fastEquals arg) {
           arg
@@ -173,7 +178,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
     val newArgs = productIterator.map {
       // Handle Seq[TreeNode] in TreeNode parameters.
       case s: Seq[_] => s.map {
-        case arg: TreeNode[_] if children contains arg =>
+        case arg: TreeNode[_] if containsChild(arg) =>
           val newChild = remainingNewChildren.remove(0)
           val oldChild = remainingOldChildren.remove(0)
           if (newChild fastEquals oldChild) {
@@ -185,7 +190,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
         case nonChild: AnyRef => nonChild
         case null => null
       }
-      case arg: TreeNode[_] if children contains arg =>
+      case arg: TreeNode[_] if containsChild(arg) =>
         val newChild = remainingNewChildren.remove(0)
         val oldChild = remainingOldChildren.remove(0)
         if (newChild fastEquals oldChild) {
@@ -238,7 +243,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
   def transformChildrenDown(rule: PartialFunction[BaseType, BaseType]): this.type = {
     var changed = false
     val newArgs = productIterator.map {
-      case arg: TreeNode[_] if children contains arg =>
+      case arg: TreeNode[_] if containsChild(arg) =>
         val newChild = arg.asInstanceOf[BaseType].transformDown(rule)
         if (!(newChild fastEquals arg)) {
           changed = true
@@ -246,7 +251,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
         } else {
           arg
         }
-      case Some(arg: TreeNode[_]) if children contains arg =>
+      case Some(arg: TreeNode[_]) if containsChild(arg) =>
         val newChild = arg.asInstanceOf[BaseType].transformDown(rule)
         if (!(newChild fastEquals arg)) {
           changed = true
@@ -257,7 +262,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
       case m: Map[_, _] => m
       case d: DataType => d // Avoid unpacking Structs
       case args: Traversable[_] => args.map {
-        case arg: TreeNode[_] if children contains arg =>
+        case arg: TreeNode[_] if containsChild(arg) =>
           val newChild = arg.asInstanceOf[BaseType].transformDown(rule)
           if (!(newChild fastEquals arg)) {
             changed = true
@@ -295,7 +300,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
   def transformChildrenUp(rule: PartialFunction[BaseType, BaseType]): this.type = {
     var changed = false
     val newArgs = productIterator.map {
-      case arg: TreeNode[_] if children contains arg =>
+      case arg: TreeNode[_] if containsChild(arg) =>
         val newChild = arg.asInstanceOf[BaseType].transformUp(rule)
         if (!(newChild fastEquals arg)) {
           changed = true
@@ -303,7 +308,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
         } else {
           arg
         }
-      case Some(arg: TreeNode[_]) if children contains arg =>
+      case Some(arg: TreeNode[_]) if containsChild(arg) =>
         val newChild = arg.asInstanceOf[BaseType].transformUp(rule)
         if (!(newChild fastEquals arg)) {
           changed = true
@@ -314,7 +319,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
       case m: Map[_, _] => m
       case d: DataType => d // Avoid unpacking Structs
       case args: Traversable[_] => args.map {
-        case arg: TreeNode[_] if children contains arg =>
+        case arg: TreeNode[_] if containsChild(arg) =>
           val newChild = arg.asInstanceOf[BaseType].transformUp(rule)
           if (!(newChild fastEquals arg)) {
             changed = true
@@ -383,7 +388,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
 
   /** Returns a string representing the arguments to this node, minus any children */
   def argString: String = productIterator.flatMap {
-    case tn: TreeNode[_] if children contains tn => Nil
+    case tn: TreeNode[_] if containsChild(tn) => Nil
     case tn: TreeNode[_] if tn.toString contains "\n" => s"(${tn.simpleString})" :: Nil
     case seq: Seq[BaseType] if seq.toSet.subsetOf(children.toSet) => Nil
     case seq: Seq[_] => seq.mkString("[", ",", "]") :: Nil

From f005be02730db315e2a6d4dbecedfd2562b9ef1f Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Wed, 17 Jun 2015 13:31:10 -0700
Subject: [PATCH 506/525] [SPARK-8395] [DOCS] start-slave.sh docs incorrect

start-slave.sh no longer takes a worker # param in 1.4+

Author: Sean Owen <sowen@cloudera.com>

Closes #6855 from srowen/SPARK-8395 and squashes the following commits:

300278e [Sean Owen] start-slave.sh no longer takes a worker # param in 1.4+
---
 docs/spark-standalone.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index 12d7d6e159bea..4f71fbc086cd0 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -24,7 +24,7 @@ the master's web UI, which is [http://localhost:8080](http://localhost:8080) by
 
 Similarly, you can start one or more workers and connect them to the master via:
 
-    ./sbin/start-slave.sh <worker#> <master-spark-URL>
+    ./sbin/start-slave.sh <master-spark-URL>
 
 Once you have started a worker, look at the master's web UI ([http://localhost:8080](http://localhost:8080) by default).
 You should see the new node listed there, along with its number of CPUs and memory (minus one gigabyte left for the OS).

From a46594435e0dd6cf86ac562bc11fc13d82b63c71 Mon Sep 17 00:00:00 2001
From: Imran Rashid <irashid@cloudera.com>
Date: Wed, 17 Jun 2015 13:34:26 -0700
Subject: [PATCH 507/525] [SPARK-6782] add sbt-revolver plugin

to make it easier to start & stop http servers in sbt
https://issues.apache.org/jira/browse/SPARK-6782

Author: Imran Rashid <irashid@cloudera.com>

Closes #5426 from squito/SPARK-6782 and squashes the following commits:

dc4fb19 [Imran Rashid] add sbt-revolved plugin, to make it easier to start & stop http servers in sbt
---
 project/SparkBuild.scala | 4 +++-
 project/plugins.sbt      | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index b7a3490787d44..e01720296fed0 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -27,6 +27,8 @@ import sbtunidoc.Plugin.UnidocKeys.unidocGenjavadocVersion
 import com.typesafe.sbt.pom.{loadEffectivePom, PomBuild, SbtPomKeys}
 import net.virtualvoid.sbt.graph.Plugin.graphSettings
 
+import spray.revolver.RevolverPlugin._
+
 object BuildCommons {
 
   private val buildLocation = file(".").getAbsoluteFile.getParentFile
@@ -159,7 +161,7 @@ object SparkBuild extends PomBuild {
   // Note ordering of these settings matter.
   /* Enable shared settings on all projects */
   (allProjects ++ optionallyEnabledProjects ++ assemblyProjects ++ Seq(spark, tools))
-    .foreach(enable(sharedSettings ++ ExludedDependencies.settings))
+    .foreach(enable(sharedSettings ++ ExludedDependencies.settings ++ Revolver.settings))
 
   /* Enable tests settings for all projects except examples, assembly and tools */
   (allProjects ++ optionallyEnabledProjects).foreach(enable(TestSettings.settings))
diff --git a/project/plugins.sbt b/project/plugins.sbt
index 75bd604a1b857..51820460ca1a0 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -29,6 +29,8 @@ addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.3.3")
 
 addSbtPlugin("com.cavorite" % "sbt-avro" % "0.3.2")
 
+addSbtPlugin("io.spray" % "sbt-revolver" % "0.7.2")
+
 libraryDependencies += "org.ow2.asm"  % "asm" % "5.0.3"
 
 libraryDependencies += "org.ow2.asm"  % "asm-commons" % "5.0.3"

From 98ee3512b26e87eeb22693a4a61b2c4981f38ca4 Mon Sep 17 00:00:00 2001
From: OopsOutOfMemory <victorshengli@126.com>
Date: Wed, 17 Jun 2015 13:37:59 -0700
Subject: [PATCH 508/525] [SPARK-8010] [SQL] Promote types to StringType as
 implicit conversion in non-binary expression of HiveTypeCoercion

1. Given a query
`select coalesce(null, 1, '1') from dual` will cause exception:
java.lang.RuntimeException: Could not determine return type of Coalesce for IntegerType,StringType
2. Given a query:
`select case when true then 1 else '1' end from dual` will cause exception:
java.lang.RuntimeException: Types in CASE WHEN must be the same or coercible to a common type: StringType != IntegerType
I checked the code, the main cause is the HiveTypeCoercion doesn't do implicit convert when there is a IntegerType and StringType.

Numeric types can be promoted to string type

Hive will always do this implicit conversion.

Author: OopsOutOfMemory <victorshengli@126.com>

Closes #6551 from OopsOutOfMemory/pnts and squashes the following commits:

7a209d7 [OopsOutOfMemory] rebase master
6018613 [OopsOutOfMemory] convert function to method
4cd5618 [OopsOutOfMemory] limit the data type to primitive type
df365d2 [OopsOutOfMemory] refine
95cbd58 [OopsOutOfMemory] fix style
403809c [OopsOutOfMemory] promote non-string to string when can not found tighestCommonTypeOfTwo
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 20 ++++++++++++++++---
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 10 ++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index e7bf7cc1f1313..189451d0d9ad7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -67,6 +67,19 @@ object HiveTypeCoercion {
     })
   }
 
+  /**
+   * Similar to [[findTightestCommonType]], if can not find the TightestCommonType, try to use
+   * [[findTightestCommonTypeToString]] to find the TightestCommonType.
+   */
+  private def findTightestCommonTypeAndPromoteToString(types: Seq[DataType]): Option[DataType] = {
+    types.foldLeft[Option[DataType]](Some(NullType))((r, c) => r match {
+      case None => None
+      case Some(d) =>
+        findTightestCommonTypeOfTwo(d, c).orElse(findTightestCommonTypeToString(d, c))
+    })
+  }
+
+
   /**
    * Find the tightest common type of a set of types by continuously applying
    * `findTightestCommonTypeOfTwo` on these types.
@@ -599,7 +612,7 @@ trait HiveTypeCoercion {
       // compatible with every child column.
       case Coalesce(es) if es.map(_.dataType).distinct.size > 1 =>
         val types = es.map(_.dataType)
-        findTightestCommonType(types) match {
+        findTightestCommonTypeAndPromoteToString(types) match {
           case Some(finalDataType) => Coalesce(es.map(Cast(_, finalDataType)))
           case None =>
             sys.error(s"Could not determine return type of Coalesce for ${types.mkString(",")}")
@@ -634,7 +647,7 @@ trait HiveTypeCoercion {
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
       case c: CaseWhenLike if c.childrenResolved && !c.valueTypesEqual =>
         logDebug(s"Input values for null casting ${c.valueTypes.mkString(",")}")
-        val maybeCommonType = findTightestCommonType(c.valueTypes)
+        val maybeCommonType = findTightestCommonTypeAndPromoteToString(c.valueTypes)
         maybeCommonType.map { commonType =>
           val castedBranches = c.branches.grouped(2).map {
             case Seq(when, value) if value.dataType != commonType =>
@@ -650,7 +663,8 @@ trait HiveTypeCoercion {
         }.getOrElse(c)
 
       case c: CaseKeyWhen if c.childrenResolved && !c.resolved =>
-        val maybeCommonType = findTightestCommonType((c.key +: c.whenList).map(_.dataType))
+        val maybeCommonType =
+          findTightestCommonTypeAndPromoteToString((c.key +: c.whenList).map(_.dataType))
         maybeCommonType.map { commonType =>
           val castedBranches = c.branches.grouped(2).map {
             case Seq(when, then) if when.dataType != commonType =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index a47cc30e92e27..1a6ee8169c38d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -45,6 +45,16 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Row("one", 6) :: Row("three", 3) :: Nil)
   }
 
+  test("SPARK-8010: promote numeric to string") {
+    val df = Seq((1, 1)).toDF("key", "value")
+    df.registerTempTable("src")
+    val queryCaseWhen = sql("select case when true then 1.0 else '1' end from src ")
+    val queryCoalesce = sql("select coalesce(null, 1, '1') from src ")
+
+    checkAnswer(queryCaseWhen, Row("1.0") :: Nil)
+    checkAnswer(queryCoalesce, Row("1") :: Nil)
+  }
+
   test("SPARK-6743: no columns from cache") {
     Seq(
       (83, 0, 38),

From 7ad8c5d869555b1bf4b50eafdf80e057a0175941 Mon Sep 17 00:00:00 2001
From: Mingfei <mingfei.shi@intel.com>
Date: Wed, 17 Jun 2015 13:40:07 -0700
Subject: [PATCH 509/525] [SPARK-8161] Set externalBlockStoreInitialized to be
 true, after ExternalBlockStore is initialized

externalBlockStoreInitialized is never set to be true, which causes the blocks stored in ExternalBlockStore can not be removed.

Author: Mingfei <mingfei.shi@intel.com>

Closes #6702 from shimingfei/SetTrue and squashes the following commits:

add61d8 [Mingfei] Set externalBlockStoreInitialized to be true, after ExternalBlockStore is initialized
---
 .../main/scala/org/apache/spark/storage/BlockManager.scala    | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
index 5048c7dab240b..1beafa1771448 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -83,8 +83,10 @@ private[spark] class BlockManager(
   private var externalBlockStoreInitialized = false
   private[spark] val memoryStore = new MemoryStore(this, maxMemory)
   private[spark] val diskStore = new DiskStore(this, diskBlockManager)
-  private[spark] lazy val externalBlockStore: ExternalBlockStore =
+  private[spark] lazy val externalBlockStore: ExternalBlockStore = {
+    externalBlockStoreInitialized = true
     new ExternalBlockStore(this, executorId)
+  }
 
   private[spark]
   val externalShuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false)

From 2837e067099921dd4ab6639ac5f6e89f789d4ff4 Mon Sep 17 00:00:00 2001
From: Carson Wang <carson.wang@intel.com>
Date: Wed, 17 Jun 2015 13:41:36 -0700
Subject: [PATCH 510/525] [SPARK-8372] History server shows incorrect
 information for application not started

The history server may show an incorrect App ID for an incomplete application like <App ID>.inprogress. This app info will never disappear even after the app is completed.
![incorrectappinfo](https://cloud.githubusercontent.com/assets/9278199/8156147/2a10fdbe-137d-11e5-9620-c5b61d93e3c1.png)

The cause of the issue is that a log path name is used as the app id when app id cannot be got during replay.

Author: Carson Wang <carson.wang@intel.com>

Closes #6827 from carsonwang/SPARK-8372 and squashes the following commits:

cdbb089 [Carson Wang] Fix code style
3e46b35 [Carson Wang] Update code style
90f5dde [Carson Wang] Add a unit test
d8c9cd0 [Carson Wang] Replaying events only return information when app is started
---
 .../deploy/history/FsHistoryProvider.scala    | 38 +++++++++-------
 .../history/FsHistoryProviderSuite.scala      | 43 +++++++++++++------
 2 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index 5427a88f32ffd..db383b9823d3c 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -160,7 +160,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
           replayBus.addListener(appListener)
           val appInfo = replay(fs.getFileStatus(new Path(logDir, attempt.logPath)), replayBus)
 
-          ui.setAppName(s"${appInfo.name} ($appId)")
+          appInfo.foreach { app => ui.setAppName(s"${app.name} ($appId)") }
 
           val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false)
           ui.getSecurityManager.setAcls(uiAclsEnabled)
@@ -282,8 +282,12 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
     val newAttempts = logs.flatMap { fileStatus =>
       try {
         val res = replay(fileStatus, bus)
-        logInfo(s"Application log ${res.logPath} loaded successfully.")
-        Some(res)
+        res match {
+          case Some(r) => logDebug(s"Application log ${r.logPath} loaded successfully.")
+          case None => logWarning(s"Failed to load application log ${fileStatus.getPath}. " +
+            "The application may have not started.")
+        }
+        res
       } catch {
         case e: Exception =>
           logError(
@@ -429,9 +433,11 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
 
   /**
    * Replays the events in the specified log file and returns information about the associated
-   * application.
+   * application. Return `None` if the application ID cannot be located.
    */
-  private def replay(eventLog: FileStatus, bus: ReplayListenerBus): FsApplicationAttemptInfo = {
+  private def replay(
+      eventLog: FileStatus,
+      bus: ReplayListenerBus): Option[FsApplicationAttemptInfo] = {
     val logPath = eventLog.getPath()
     logInfo(s"Replaying log path: $logPath")
     val logInput =
@@ -445,16 +451,18 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
       val appCompleted = isApplicationCompleted(eventLog)
       bus.addListener(appListener)
       bus.replay(logInput, logPath.toString, !appCompleted)
-      new FsApplicationAttemptInfo(
-        logPath.getName(),
-        appListener.appName.getOrElse(NOT_STARTED),
-        appListener.appId.getOrElse(logPath.getName()),
-        appListener.appAttemptId,
-        appListener.startTime.getOrElse(-1L),
-        appListener.endTime.getOrElse(-1L),
-        getModificationTime(eventLog).get,
-        appListener.sparkUser.getOrElse(NOT_STARTED),
-        appCompleted)
+      appListener.appId.map { appId =>
+        new FsApplicationAttemptInfo(
+          logPath.getName(),
+          appListener.appName.getOrElse(NOT_STARTED),
+          appId,
+          appListener.appAttemptId,
+          appListener.startTime.getOrElse(-1L),
+          appListener.endTime.getOrElse(-1L),
+          getModificationTime(eventLog).get,
+          appListener.sparkUser.getOrElse(NOT_STARTED),
+          appCompleted)
+      }
     } finally {
       logInput.close()
     }
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
index 09075eeb539aa..d3a6db5f260d6 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
@@ -67,7 +67,8 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
     // Write a new-style application log.
     val newAppComplete = newLogFile("new1", None, inProgress = false)
     writeFile(newAppComplete, true, None,
-      SparkListenerApplicationStart("new-app-complete", None, 1L, "test", None),
+      SparkListenerApplicationStart(
+        "new-app-complete", Some("new-app-complete"), 1L, "test", None),
       SparkListenerApplicationEnd(5L)
       )
 
@@ -75,13 +76,15 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
     val newAppCompressedComplete = newLogFile("new1compressed", None, inProgress = false,
       Some("lzf"))
     writeFile(newAppCompressedComplete, true, None,
-      SparkListenerApplicationStart("new-app-compressed-complete", None, 1L, "test", None),
+      SparkListenerApplicationStart(
+        "new-app-compressed-complete", Some("new-app-compressed-complete"), 1L, "test", None),
       SparkListenerApplicationEnd(4L))
 
     // Write an unfinished app, new-style.
     val newAppIncomplete = newLogFile("new2", None, inProgress = true)
     writeFile(newAppIncomplete, true, None,
-      SparkListenerApplicationStart("new-app-incomplete", None, 1L, "test", None)
+      SparkListenerApplicationStart(
+        "new-app-incomplete", Some("new-app-incomplete"), 1L, "test", None)
       )
 
     // Write an old-style application log.
@@ -89,7 +92,8 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
     oldAppComplete.mkdir()
     createEmptyFile(new File(oldAppComplete, provider.SPARK_VERSION_PREFIX + "1.0"))
     writeFile(new File(oldAppComplete, provider.LOG_PREFIX + "1"), false, None,
-      SparkListenerApplicationStart("old-app-complete", None, 2L, "test", None),
+      SparkListenerApplicationStart(
+        "old-app-complete", Some("old-app-complete"), 2L, "test", None),
       SparkListenerApplicationEnd(3L)
       )
     createEmptyFile(new File(oldAppComplete, provider.APPLICATION_COMPLETE))
@@ -103,7 +107,8 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
     oldAppIncomplete.mkdir()
     createEmptyFile(new File(oldAppIncomplete, provider.SPARK_VERSION_PREFIX + "1.0"))
     writeFile(new File(oldAppIncomplete, provider.LOG_PREFIX + "1"), false, None,
-      SparkListenerApplicationStart("old-app-incomplete", None, 2L, "test", None)
+      SparkListenerApplicationStart(
+        "old-app-incomplete", Some("old-app-incomplete"), 2L, "test", None)
       )
 
     // Force a reload of data from the log directory, and check that both logs are loaded.
@@ -124,16 +129,16 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
           List(ApplicationAttemptInfo(None, start, end, lastMod, user, completed)))
       }
 
-      list(0) should be (makeAppInfo(newAppComplete.getName(), "new-app-complete", 1L, 5L,
+      list(0) should be (makeAppInfo("new-app-complete", "new-app-complete", 1L, 5L,
         newAppComplete.lastModified(), "test", true))
-      list(1) should be (makeAppInfo(newAppCompressedComplete.getName(),
+      list(1) should be (makeAppInfo("new-app-compressed-complete",
         "new-app-compressed-complete", 1L, 4L, newAppCompressedComplete.lastModified(), "test",
         true))
-      list(2) should be (makeAppInfo(oldAppComplete.getName(), "old-app-complete", 2L, 3L,
+      list(2) should be (makeAppInfo("old-app-complete", "old-app-complete", 2L, 3L,
         oldAppComplete.lastModified(), "test", true))
-      list(3) should be (makeAppInfo(oldAppIncomplete.getName(), "old-app-incomplete", 2L, -1L,
+      list(3) should be (makeAppInfo("old-app-incomplete", "old-app-incomplete", 2L, -1L,
         oldAppIncomplete.lastModified(), "test", false))
-      list(4) should be (makeAppInfo(newAppIncomplete.getName(), "new-app-incomplete", 1L, -1L,
+      list(4) should be (makeAppInfo("new-app-incomplete", "new-app-incomplete", 1L, -1L,
         newAppIncomplete.lastModified(), "test", false))
 
       // Make sure the UI can be rendered.
@@ -157,7 +162,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
       logDir.mkdir()
       createEmptyFile(new File(logDir, provider.SPARK_VERSION_PREFIX + "1.0"))
       writeFile(new File(logDir, provider.LOG_PREFIX + "1"), false, Option(codec),
-        SparkListenerApplicationStart("app2", None, 2L, "test", None),
+        SparkListenerApplicationStart("app2", Some("app2"), 2L, "test", None),
         SparkListenerApplicationEnd(3L)
         )
       createEmptyFile(new File(logDir, provider.COMPRESSION_CODEC_PREFIX + codecName))
@@ -180,12 +185,12 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
   test("SPARK-3697: ignore directories that cannot be read.") {
     val logFile1 = newLogFile("new1", None, inProgress = false)
     writeFile(logFile1, true, None,
-      SparkListenerApplicationStart("app1-1", None, 1L, "test", None),
+      SparkListenerApplicationStart("app1-1", Some("app1-1"), 1L, "test", None),
       SparkListenerApplicationEnd(2L)
       )
     val logFile2 = newLogFile("new2", None, inProgress = false)
     writeFile(logFile2, true, None,
-      SparkListenerApplicationStart("app1-2", None, 1L, "test", None),
+      SparkListenerApplicationStart("app1-2", Some("app1-2"), 1L, "test", None),
       SparkListenerApplicationEnd(2L)
       )
     logFile2.setReadable(false, false)
@@ -218,6 +223,18 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
     }
   }
 
+  test("Parse logs that application is not started") {
+    val provider = new FsHistoryProvider((createTestConf()))
+
+    val logFile1 = newLogFile("app1", None, inProgress = true)
+    writeFile(logFile1, true, None,
+      SparkListenerLogStart("1.4")
+    )
+    updateAndCheck(provider) { list =>
+      list.size should be (0)
+    }
+  }
+
   test("SPARK-5582: empty log directory") {
     val provider = new FsHistoryProvider(createTestConf())
 

From 0fc4b96f3e3bf81724ac133a6acc97c1b77271b4 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 17 Jun 2015 13:59:39 -0700
Subject: [PATCH 511/525] [SPARK-8373] [PYSPARK] Add emptyRDD to pyspark and
 fix the issue when calling sum on an empty RDD

This PR fixes the sum issue and also adds `emptyRDD` so that it's easy to create a test case.

Author: zsxwing <zsxwing@gmail.com>

Closes #6826 from zsxwing/python-emptyRDD and squashes the following commits:

b36993f [zsxwing] Update the return type to JavaRDD[T]
71df047 [zsxwing] Add emptyRDD to pyspark and fix the issue when calling sum on an empty RDD
---
 .../scala/org/apache/spark/api/python/PythonRDD.scala     | 5 +++++
 python/pyspark/context.py                                 | 6 ++++++
 python/pyspark/rdd.py                                     | 2 +-
 python/pyspark/tests.py                                   | 8 ++++++++
 4 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 55a37f8c944b2..0103f6c6ab678 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -425,6 +425,11 @@ private[spark] object PythonRDD extends Logging {
     iter.foreach(write)
   }
 
+  /** Create an RDD that has no partitions or elements. */
+  def emptyRDD[T](sc: JavaSparkContext): JavaRDD[T] = {
+    sc.emptyRDD[T]
+  }
+
   /**
    * Create an RDD from a path using [[org.apache.hadoop.mapred.SequenceFileInputFormat]],
    * key and value class.
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 44d90f1437bc9..90b2fffbb9c7c 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -324,6 +324,12 @@ def stop(self):
         with SparkContext._lock:
             SparkContext._active_spark_context = None
 
+    def emptyRDD(self):
+        """
+        Create an RDD that has no partitions or elements.
+        """
+        return RDD(self._jsc.emptyRDD(), self, NoOpSerializer())
+
     def range(self, start, end=None, step=1, numSlices=None):
         """
         Create a new RDD of int containing elements from `start` to `end`
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 98a8ff8606366..20c0bc93f413c 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -960,7 +960,7 @@ def sum(self):
         >>> sc.parallelize([1.0, 2.0, 3.0]).sum()
         6.0
         """
-        return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
+        return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
 
     def count(self):
         """
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index f9fb37f7fc139..11b402e6df6c1 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -458,6 +458,14 @@ def test_id(self):
         self.assertEqual(id + 1, id2)
         self.assertEqual(id2, rdd2.id())
 
+    def test_empty_rdd(self):
+        rdd = self.sc.emptyRDD()
+        self.assertTrue(rdd.isEmpty())
+
+    def test_sum(self):
+        self.assertEqual(0, self.sc.emptyRDD().sum())
+        self.assertEqual(6, self.sc.parallelize([1, 2, 3]).sum())
+
     def test_save_as_textfile_with_unicode(self):
         # Regression test for SPARK-970
         x = u"\u00A1Hola, mundo!"

From a411a40de2209c56e898e3fb4af955d7b55af11c Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 17 Jun 2015 14:03:15 -0700
Subject: [PATCH 512/525] [SPARK-7913] [CORE] Increase the maximum capacity of
 PartitionedPairBuffe, PartitionedSerializedPairBuffer and AppendOnlyMap

The previous growing strategy is alway doubling the capacity.

This PR adjusts the growing strategy: doubling the capacity but if overflow, use the maximum capacity as the new capacity. It increases the maximum capacity of PartitionedPairBuffer from `2 ^ 29` to `2 ^ 30 - 1`, the maximum capacity of PartitionedSerializedPairBuffer from `2 ^ 28` to `(2 ^ 29) - 1`, and the maximum capacity of AppendOnlyMap from `0.7 * (2 ^ 29)` to `(2 ^ 29)`.

Author: zsxwing <zsxwing@gmail.com>

Closes #6456 from zsxwing/SPARK-7913 and squashes the following commits:

abcb932 [zsxwing] Address comments
e30b61b [zsxwing] Increase the maximum capacity of AppendOnlyMap
05b6420 [zsxwing] Update the exception message
64fe227 [zsxwing] Increase the maximum capacity of PartitionedPairBuffer and PartitionedSerializedPairBuffer
---
 .../spark/util/collection/AppendOnlyMap.scala | 25 +++++++++++++------
 .../collection/PartitionedPairBuffer.scala    | 23 +++++++++++++----
 .../PartitionedSerializedPairBuffer.scala     | 23 +++++++++++++----
 3 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala
index 290282c9c2e28..d215ee43cb539 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala
@@ -32,12 +32,18 @@ import org.apache.spark.annotation.DeveloperApi
  * size, which is guaranteed to explore all spaces for each key (see
  * http://en.wikipedia.org/wiki/Quadratic_probing).
  *
+ * The map can support up to `536870912 (2 ^ 29)` elements.
+ *
  * TODO: Cache the hash values of each key? java.util.HashMap does that.
  */
 @DeveloperApi
 class AppendOnlyMap[K, V](initialCapacity: Int = 64)
   extends Iterable[(K, V)] with Serializable {
-  require(initialCapacity <= (1 << 29), "Can't make capacity bigger than 2^29 elements")
+
+  import AppendOnlyMap._
+
+  require(initialCapacity <= MAXIMUM_CAPACITY,
+    s"Can't make capacity bigger than ${MAXIMUM_CAPACITY} elements")
   require(initialCapacity >= 1, "Invalid initial capacity")
 
   private val LOAD_FACTOR = 0.7
@@ -193,8 +199,11 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64)
 
   /** Increase table size by 1, rehashing if necessary */
   private def incrementSize() {
+    if (curSize == MAXIMUM_CAPACITY) {
+      throw new IllegalStateException(s"Can't put more that ${MAXIMUM_CAPACITY} elements")
+    }
     curSize += 1
-    if (curSize > growThreshold) {
+    if (curSize > growThreshold && capacity < MAXIMUM_CAPACITY) {
       growTable()
     }
   }
@@ -206,12 +215,8 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64)
 
   /** Double the table's size and re-hash everything */
   protected def growTable() {
-    val newCapacity = capacity * 2
-    if (newCapacity >= (1 << 30)) {
-      // We can't make the table this big because we want an array of 2x
-      // that size for our data, but array sizes are at most Int.MaxValue
-      throw new Exception("Can't make capacity bigger than 2^29 elements")
-    }
+    // capacity < MAXIMUM_CAPACITY (2 ^ 29) so capacity * 2 won't overflow
+    val newCapacity = (capacity * 2).min(MAXIMUM_CAPACITY)
     val newData = new Array[AnyRef](2 * newCapacity)
     val newMask = newCapacity - 1
     // Insert all our old values into the new array. Note that because our old keys are
@@ -292,3 +297,7 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64)
    */
   def atGrowThreshold: Boolean = curSize == growThreshold
 }
+
+private object AppendOnlyMap {
+  val MAXIMUM_CAPACITY = (1 << 29)
+}
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
index 5a6e9a9580e9b..04bb7fc78c13b 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
@@ -25,11 +25,16 @@ import org.apache.spark.util.collection.WritablePartitionedPairCollection._
 /**
  * Append-only buffer of key-value pairs, each with a corresponding partition ID, that keeps track
  * of its estimated size in bytes.
+ *
+ * The buffer can support up to `1073741823 (2 ^ 30 - 1)` elements.
  */
 private[spark] class PartitionedPairBuffer[K, V](initialCapacity: Int = 64)
   extends WritablePartitionedPairCollection[K, V] with SizeTracker
 {
-  require(initialCapacity <= (1 << 29), "Can't make capacity bigger than 2^29 elements")
+  import PartitionedPairBuffer._
+
+  require(initialCapacity <= MAXIMUM_CAPACITY,
+    s"Can't make capacity bigger than ${MAXIMUM_CAPACITY} elements")
   require(initialCapacity >= 1, "Invalid initial capacity")
 
   // Basic growable array data structure. We use a single array of AnyRef to hold both the keys
@@ -51,11 +56,15 @@ private[spark] class PartitionedPairBuffer[K, V](initialCapacity: Int = 64)
 
   /** Double the size of the array because we've reached capacity */
   private def growArray(): Unit = {
-    if (capacity == (1 << 29)) {
-      // Doubling the capacity would create an array bigger than Int.MaxValue, so don't
-      throw new Exception("Can't grow buffer beyond 2^29 elements")
+    if (capacity >= MAXIMUM_CAPACITY) {
+      throw new IllegalStateException(s"Can't insert more than ${MAXIMUM_CAPACITY} elements")
     }
-    val newCapacity = capacity * 2
+    val newCapacity =
+      if (capacity * 2 < 0 || capacity * 2 > MAXIMUM_CAPACITY) { // Overflow
+        MAXIMUM_CAPACITY
+      } else {
+        capacity * 2
+      }
     val newArray = new Array[AnyRef](2 * newCapacity)
     System.arraycopy(data, 0, newArray, 0, 2 * capacity)
     data = newArray
@@ -86,3 +95,7 @@ private[spark] class PartitionedPairBuffer[K, V](initialCapacity: Int = 64)
     }
   }
 }
+
+private object PartitionedPairBuffer {
+  val MAXIMUM_CAPACITY = Int.MaxValue / 2 // 2 ^ 30 - 1
+}
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
index 862408b7a4d21..ae9a48729e201 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
@@ -48,6 +48,8 @@ import org.apache.spark.util.collection.PartitionedSerializedPairBuffer._
  *   |         keyStart         | keyValLen  | partitionId |
  *   +-------------+------------+------------+-------------+
  *
+ * The buffer can support up to `536870911 (2 ^ 29 - 1)` records.
+ *
  * @param metaInitialRecords The initial number of entries in the metadata buffer.
  * @param kvBlockSize The size of each byte buffer in the ChainedBuffer used to store the records.
  * @param serializerInstance the serializer used for serializing inserted records.
@@ -63,6 +65,8 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
       " Java-serialized objects.")
   }
 
+  require(metaInitialRecords <= MAXIMUM_RECORDS,
+    s"Can't make capacity bigger than ${MAXIMUM_RECORDS} records")
   private var metaBuffer = IntBuffer.allocate(metaInitialRecords * RECORD_SIZE)
 
   private val kvBuffer: ChainedBuffer = new ChainedBuffer(kvBlockSize)
@@ -89,11 +93,17 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
 
   /** Double the size of the array because we've reached capacity */
   private def growMetaBuffer(): Unit = {
-    if (metaBuffer.capacity.toLong * 2 > Int.MaxValue) {
-      // Doubling the capacity would create an array bigger than Int.MaxValue, so don't
-      throw new Exception(s"Can't grow buffer beyond ${Int.MaxValue} bytes")
+    if (metaBuffer.capacity >= MAXIMUM_META_BUFFER_CAPACITY) {
+      throw new IllegalStateException(s"Can't insert more than ${MAXIMUM_RECORDS} records")
     }
-    val newMetaBuffer = IntBuffer.allocate(metaBuffer.capacity * 2)
+    val newCapacity =
+      if (metaBuffer.capacity * 2 < 0 || metaBuffer.capacity * 2 > MAXIMUM_META_BUFFER_CAPACITY) {
+        // Overflow
+        MAXIMUM_META_BUFFER_CAPACITY
+      } else {
+        metaBuffer.capacity * 2
+      }
+    val newMetaBuffer = IntBuffer.allocate(newCapacity)
     newMetaBuffer.put(metaBuffer.array)
     metaBuffer = newMetaBuffer
   }
@@ -247,12 +257,15 @@ private[spark] class SerializedSortDataFormat extends SortDataFormat[Int, IntBuf
   }
 }
 
-private[spark] object PartitionedSerializedPairBuffer {
+private object PartitionedSerializedPairBuffer {
   val KEY_START = 0 // keyStart, a long, gets split across two ints
   val KEY_VAL_LEN = 2
   val PARTITION = 3
   val RECORD_SIZE = PARTITION + 1 // num ints of metadata
 
+  val MAXIMUM_RECORDS = Int.MaxValue / RECORD_SIZE // (2 ^ 29) - 1
+  val MAXIMUM_META_BUFFER_CAPACITY = MAXIMUM_RECORDS * RECORD_SIZE // (2 ^ 31) - 4
+
   def getKeyStartPos(metaBuffer: IntBuffer, metaBufferPos: Int): Long = {
     val lower32 = metaBuffer.get(metaBufferPos + KEY_START)
     val upper32 = metaBuffer.get(metaBufferPos + KEY_START + 1)

From 7f05b1fe696daa28fee514c9aef805be5913cfcd Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Wed, 17 Jun 2015 14:46:00 -0700
Subject: [PATCH 513/525] [SPARK-7067] [SQL] fix bug when use complex nested
 fields in ORDER BY

This PR is a improvement for https://github.com/apache/spark/pull/5189.

The resolution rule for ORDER BY is: first resolve based on what comes from the select clause and then fall back on its child only when this fails.

There are 2 steps. First, try to resolve `Sort` in `ResolveReferences` based on select clause, and ignore exceptions. Second, try to resolve `Sort` in `ResolveSortReferences` and add missing projection.

However, the way we resolve `SortOrder` is wrong. We just resolve `UnresolvedAttribute` and use the result to indicate if we can resolve `SortOrder`. But `UnresolvedAttribute` is only part of `GetField` chain(broken by `GetItem`), so we need to go through the whole chain to indicate if we can resolve `SortOrder`.

With this change, we can also avoid re-throw GetField exception in `CheckAnalysis` which is little ugly.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #5659 from cloud-fan/order-by and squashes the following commits:

cfa79f8 [Wenchen Fan] update test
3245d28 [Wenchen Fan] minor improve
465ee07 [Wenchen Fan] address comment
1fc41a2 [Wenchen Fan] fix SPARK-7067
---
 .../sql/catalyst/analysis/Analyzer.scala      | 75 +++++++++++--------
 .../sql/catalyst/analysis/CheckAnalysis.scala |  8 --
 .../catalyst/plans/logical/LogicalPlan.scala  | 43 +++++------
 .../spark/sql/catalyst/trees/TreeNode.scala   |  2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  8 ++
 5 files changed, 70 insertions(+), 66 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index badf903478303..21b05760256b4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -336,9 +336,15 @@ class Analyzer(
         }
         j.copy(right = newRight)
 
+      // When resolve `SortOrder`s in Sort based on child, don't report errors as
+      // we still have chance to resolve it based on grandchild
+      case s @ Sort(ordering, global, child) if child.resolved && !s.resolved =>
+        val newOrdering = resolveSortOrders(ordering, child, throws = false)
+        Sort(newOrdering, global, child)
+
       case q: LogicalPlan =>
         logTrace(s"Attempting to resolve ${q.simpleString}")
-        q transformExpressionsUp  {
+        q transformExpressionsUp {
           case u @ UnresolvedAttribute(nameParts) if nameParts.length == 1 &&
             resolver(nameParts(0), VirtualColumn.groupingIdName) &&
             q.isInstanceOf[GroupingAnalytics] =>
@@ -373,6 +379,26 @@ class Analyzer(
       exprs.exists(_.collect { case _: Star => true }.nonEmpty)
   }
 
+  private def resolveSortOrders(ordering: Seq[SortOrder], plan: LogicalPlan, throws: Boolean) = {
+    ordering.map { order =>
+      // Resolve SortOrder in one round.
+      // If throws == false or the desired attribute doesn't exist
+      // (like try to resolve `a.b` but `a` doesn't exist), fail and return the origin one.
+      // Else, throw exception.
+      try {
+        val newOrder = order transformUp {
+          case u @ UnresolvedAttribute(nameParts) =>
+            plan.resolve(nameParts, resolver).getOrElse(u)
+          case UnresolvedExtractValue(child, fieldName) if child.resolved =>
+            ExtractValue(child, fieldName, resolver)
+        }
+        newOrder.asInstanceOf[SortOrder]
+      } catch {
+        case a: AnalysisException if !throws => order
+      }
+    }
+  }
+
   /**
    * In many dialects of SQL it is valid to sort by attributes that are not present in the SELECT
    * clause.  This rule detects such queries and adds the required attributes to the original
@@ -383,13 +409,13 @@ class Analyzer(
     def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
       case s @ Sort(ordering, global, p @ Project(projectList, child))
           if !s.resolved && p.resolved =>
-        val (resolvedOrdering, missing) = resolveAndFindMissing(ordering, p, child)
+        val (newOrdering, missing) = resolveAndFindMissing(ordering, p, child)
 
         // If this rule was not a no-op, return the transformed plan, otherwise return the original.
         if (missing.nonEmpty) {
           // Add missing attributes and then project them away after the sort.
           Project(p.output,
-            Sort(resolvedOrdering, global,
+            Sort(newOrdering, global,
               Project(projectList ++ missing, child)))
         } else {
           logDebug(s"Failed to find $missing in ${p.output.mkString(", ")}")
@@ -404,19 +430,19 @@ class Analyzer(
         )
 
         // Find sort attributes that are projected away so we can temporarily add them back in.
-        val (resolvedOrdering, unresolved) = resolveAndFindMissing(ordering, a, groupingRelation)
+        val (newOrdering, missingAttr) = resolveAndFindMissing(ordering, a, groupingRelation)
 
         // Find aggregate expressions and evaluate them early, since they can't be evaluated in a
         // Sort.
-        val (withAggsRemoved, aliasedAggregateList) = resolvedOrdering.map {
+        val (withAggsRemoved, aliasedAggregateList) = newOrdering.map {
           case aggOrdering if aggOrdering.collect { case a: AggregateExpression => a }.nonEmpty =>
             val aliased = Alias(aggOrdering.child, "_aggOrdering")()
-            (aggOrdering.copy(child = aliased.toAttribute), aliased :: Nil)
+            (aggOrdering.copy(child = aliased.toAttribute), Some(aliased))
 
-          case other => (other, Nil)
+          case other => (other, None)
         }.unzip
 
-        val missing = unresolved ++ aliasedAggregateList.flatten
+        val missing = missingAttr ++ aliasedAggregateList.flatten
 
         if (missing.nonEmpty) {
           // Add missing grouping exprs and then project them away after the sort.
@@ -429,40 +455,25 @@ class Analyzer(
     }
 
     /**
-     * Given a child and a grandchild that are present beneath a sort operator, returns
-     * a resolved sort ordering and a list of attributes that are missing from the child
-     * but are present in the grandchild.
+     * Given a child and a grandchild that are present beneath a sort operator, try to resolve
+     * the sort ordering and returns it with a list of attributes that are missing from the
+     * child but are present in the grandchild.
      */
     def resolveAndFindMissing(
         ordering: Seq[SortOrder],
         child: LogicalPlan,
         grandchild: LogicalPlan): (Seq[SortOrder], Seq[Attribute]) = {
-      // Find any attributes that remain unresolved in the sort.
-      val unresolved: Seq[Seq[String]] =
-        ordering.flatMap(_.collect { case UnresolvedAttribute(nameParts) => nameParts })
-
-      // Create a map from name, to resolved attributes, when the desired name can be found
-      // prior to the projection.
-      val resolved: Map[Seq[String], NamedExpression] =
-        unresolved.flatMap(u => grandchild.resolve(u, resolver).map(a => u -> a)).toMap
-
+      val newOrdering = resolveSortOrders(ordering, grandchild, throws = true)
       // Construct a set that contains all of the attributes that we need to evaluate the
       // ordering.
-      val requiredAttributes = AttributeSet(resolved.values)
-
+      val requiredAttributes = AttributeSet(newOrdering.filter(_.resolved))
       // Figure out which ones are missing from the projection, so that we can add them and
       // remove them after the sort.
       val missingInProject = requiredAttributes -- child.output
-
-      // Now that we have all the attributes we need, reconstruct a resolved ordering.
-      // It is important to do it here, instead of waiting for the standard resolved as adding
-      // attributes to the project below can actually introduce ambiquity that was not present
-      // before.
-      val resolvedOrdering = ordering.map(_ transform {
-        case u @ UnresolvedAttribute(name) => resolved.getOrElse(name, u)
-      }).asInstanceOf[Seq[SortOrder]]
-
-      (resolvedOrdering, missingInProject.toSeq)
+      // It is important to return the new SortOrders here, instead of waiting for the standard
+      // resolving process as adding attributes to the project below can actually introduce
+      // ambiguity that was not present before.
+      (newOrdering, missingInProject.toSeq)
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index c0695ae369421..7fabd2bfc80ab 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -51,14 +51,6 @@ trait CheckAnalysis {
       case operator: LogicalPlan =>
         operator transformExpressionsUp {
           case a: Attribute if !a.resolved =>
-            if (operator.childrenResolved) {
-              a match {
-                case UnresolvedAttribute(nameParts) =>
-                  // Throw errors for specific problems with get field.
-                  operator.resolveChildren(nameParts, resolver, throwErrors = true)
-              }
-            }
-
             val from = operator.inputSet.map(_.name).mkString(", ")
             a.failAnalysis(s"cannot resolve '${a.prettyString}' given input columns $from")
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index c8c6676f24c17..a853e27c1212d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -50,19 +50,19 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
    * [[org.apache.spark.sql.catalyst.analysis.UnresolvedRelation UnresolvedRelation]]
    * should return `false`).
    */
-  lazy val resolved: Boolean = !expressions.exists(!_.resolved) && childrenResolved
+  lazy val resolved: Boolean = expressions.forall(_.resolved) && childrenResolved
 
   override protected def statePrefix = if (!resolved) "'" else super.statePrefix
 
   /**
    * Returns true if all its children of this query plan have been resolved.
    */
-  def childrenResolved: Boolean = !children.exists(!_.resolved)
+  def childrenResolved: Boolean = children.forall(_.resolved)
 
   /**
    * Returns true when the given logical plan will return the same results as this logical plan.
    *
-   * Since its likely undecideable to generally determine if two given plans will produce the same
+   * Since its likely undecidable to generally determine if two given plans will produce the same
    * results, it is okay for this function to return false, even if the results are actually
    * the same.  Such behavior will not affect correctness, only the application of performance
    * enhancements like caching.  However, it is not acceptable to return true if the results could
@@ -111,9 +111,8 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
    */
   def resolveChildren(
       nameParts: Seq[String],
-      resolver: Resolver,
-      throwErrors: Boolean = false): Option[NamedExpression] =
-    resolve(nameParts, children.flatMap(_.output), resolver, throwErrors)
+      resolver: Resolver): Option[NamedExpression] =
+    resolve(nameParts, children.flatMap(_.output), resolver)
 
   /**
    * Optionally resolves the given strings to a [[NamedExpression]] based on the output of this
@@ -122,9 +121,8 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
    */
   def resolve(
       nameParts: Seq[String],
-      resolver: Resolver,
-      throwErrors: Boolean = false): Option[NamedExpression] =
-    resolve(nameParts, output, resolver, throwErrors)
+      resolver: Resolver): Option[NamedExpression] =
+    resolve(nameParts, output, resolver)
 
   /**
    * Given an attribute name, split it to name parts by dot, but
@@ -134,7 +132,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
   def resolveQuoted(
       name: String,
       resolver: Resolver): Option[NamedExpression] = {
-    resolve(parseAttributeName(name), resolver, true)
+    resolve(parseAttributeName(name), output, resolver)
   }
 
   /**
@@ -219,8 +217,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
   protected def resolve(
       nameParts: Seq[String],
       input: Seq[Attribute],
-      resolver: Resolver,
-      throwErrors: Boolean): Option[NamedExpression] = {
+      resolver: Resolver): Option[NamedExpression] = {
 
     // A sequence of possible candidate matches.
     // Each candidate is a tuple. The first element is a resolved attribute, followed by a list
@@ -254,19 +251,15 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
 
       // One match, but we also need to extract the requested nested field.
       case Seq((a, nestedFields)) =>
-        try {
-          // The foldLeft adds GetFields for every remaining parts of the identifier,
-          // and aliases it with the last part of the identifier.
-          // For example, consider "a.b.c", where "a" is resolved to an existing attribute.
-          // Then this will add GetField("c", GetField("b", a)), and alias
-          // the final expression as "c".
-          val fieldExprs = nestedFields.foldLeft(a: Expression)((expr, fieldName) =>
-            ExtractValue(expr, Literal(fieldName), resolver))
-          val aliasName = nestedFields.last
-          Some(Alias(fieldExprs, aliasName)())
-        } catch {
-          case a: AnalysisException if !throwErrors => None
-        }
+        // The foldLeft adds ExtractValues for every remaining parts of the identifier,
+        // and aliases it with the last part of the identifier.
+        // For example, consider "a.b.c", where "a" is resolved to an existing attribute.
+        // Then this will add ExtractValue("c", ExtractValue("b", a)), and alias
+        // the final expression as "c".
+        val fieldExprs = nestedFields.foldLeft(a: Expression)((expr, fieldName) =>
+          ExtractValue(expr, Literal(fieldName), resolver))
+        val aliasName = nestedFields.last
+        Some(Alias(fieldExprs, aliasName)())
 
       // No matches.
       case Seq() =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index f304597bc978e..09f6c6b0ec423 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -285,7 +285,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
    * @param rule the function use to transform this nodes children
    */
   def transformUp(rule: PartialFunction[BaseType, BaseType]): BaseType = {
-    val afterRuleOnChildren = transformChildrenUp(rule);
+    val afterRuleOnChildren = transformChildrenUp(rule)
     if (this fastEquals afterRuleOnChildren) {
       CurrentOrigin.withOrigin(origin) {
         rule.applyOrElse(this, identity[BaseType])
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 1a6ee8169c38d..30db840166ca6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1440,4 +1440,12 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       checkAnswer(sql("select i <=> b from t"), sql("select r2 from t"))
     }
   }
+
+  test("SPARK-7067: order by queries for complex ExtractValue chain") {
+    withTempTable("t") {
+      sqlContext.read.json(sqlContext.sparkContext.makeRDD(
+        """{"a": {"b": [{"c": 1}]}, "b": [{"d": 1}]}""" :: Nil)).registerTempTable("t")
+      checkAnswer(sql("SELECT a.b FROM t ORDER BY b[0].d"), Row(Seq(Row(1))))
+    }
+  }
 }

From 302556ff999ba9a1960281de6932e0d904197204 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Wed, 17 Jun 2015 14:52:43 -0700
Subject: [PATCH 514/525] [SPARK-8306] [SQL] AddJar command needs to set the
 new class loader to the HiveConf inside executionHive.state.

https://issues.apache.org/jira/browse/SPARK-8306

I will try to add a test later.

marmbrus aarondav

Author: Yin Huai <yhuai@databricks.com>

Closes #6758 from yhuai/SPARK-8306 and squashes the following commits:

1292346 [Yin Huai] [SPARK-8306] AddJar command needs to set the new class loader to the HiveConf inside executionHive.state.
---
 .../spark/sql/hive/client/ClientWrapper.scala |   8 +++++
 .../spark/sql/hive/execution/commands.scala   |  12 ++++++--
 .../test/resources/hive-contrib-0.13.1.jar    | Bin 0 -> 114878 bytes
 .../sql/hive/execution/SQLQuerySuite.scala    |  28 ++++++++++++++++++
 4 files changed, 45 insertions(+), 3 deletions(-)
 create mode 100644 sql/hive/src/test/resources/hive-contrib-0.13.1.jar

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index 0fcba65ca6129..982ed63874a5f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -95,6 +95,7 @@ private[hive] class ClientWrapper(
     case hive.v14 => new Shim_v0_14()
   }
 
+  // Create an internal session state for this ClientWrapper.
   val state = {
     val original = Thread.currentThread().getContextClassLoader
     Thread.currentThread().setContextClassLoader(getClass.getClassLoader)
@@ -131,8 +132,15 @@ private[hive] class ClientWrapper(
    */
   private def withHiveState[A](f: => A): A = synchronized {
     val original = Thread.currentThread().getContextClassLoader
+    // This setContextClassLoader is used for Hive 0.12's metastore since Hive 0.12 will not
+    // internally override the context class loader of the current thread with the class loader
+    // associated with the HiveConf in `state`.
     Thread.currentThread().setContextClassLoader(getClass.getClassLoader)
+    // Set the thread local metastore client to the client associated with this ClientWrapper.
     Hive.set(client)
+    // Starting from Hive 0.13.0, setCurrentSessionState will use the classLoader associated
+    // with the HiveConf in `state` to override the context class loader of the current
+    // thread.
     shim.setCurrentSessionState(state)
     val ret = try f finally {
       Thread.currentThread().setContextClassLoader(original)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
index 195e5752c3ec0..aad58bfa2e6e0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
@@ -91,9 +91,15 @@ case class AddJar(path: String) extends RunnableCommand {
     val jarURL = new java.io.File(path).toURL
     val newClassLoader = new java.net.URLClassLoader(Array(jarURL), currentClassLoader)
     Thread.currentThread.setContextClassLoader(newClassLoader)
-    org.apache.hadoop.hive.ql.metadata.Hive.get().getConf().setClassLoader(newClassLoader)
-
-    // Add jar to isolated hive classloader
+    // We need to explicitly set the class loader associated with the conf in executionHive's
+    // state because this class loader will be used as the context class loader of the current
+    // thread to execute any Hive command.
+    // We cannot use `org.apache.hadoop.hive.ql.metadata.Hive.get().getConf()` because Hive.get()
+    // returns the value of a thread local variable and its HiveConf may not be the HiveConf
+    // associated with `executionHive.state` (for example, HiveContext is created in one thread
+    // and then add jar is called from another thread).
+    hiveContext.executionHive.state.getConf.setClassLoader(newClassLoader)
+    // Add jar to isolated hive (metadataHive) class loader.
     hiveContext.runSqlHive(s"ADD JAR $path")
 
     // Add jar to executors
diff --git a/sql/hive/src/test/resources/hive-contrib-0.13.1.jar b/sql/hive/src/test/resources/hive-contrib-0.13.1.jar
new file mode 100644
index 0000000000000000000000000000000000000000..ce0740d9245a795b3ae8009df99292a877ce09ac
GIT binary patch
literal 114878
zcmb@u19)WXwk{mowvCQ$+h)f}#kSS4ZL6b>?R0G0ww=Dc_uA+Dd!PT_we~&d*7Fpe
zsxjX=zA=aA_)1X*6buIFZ;uqSKGDB__{R(M?{_&dRUvvQd2t5Ce~>`|CI2SFH|T^L
z{{3@0Fc1*hKa<G`$xDfgsi=OD6Ay`-fazmI3cd>H=<gB~j@%~ZKp%hs&)-!us&DMg
zNFsYUy0AGT)STa&^d5)HlrDt553rf`u!;6hlSq=))jvgTYwoDHo^^M}NgBr+^^cIN
z0ikFdmkCzQnO0=BaYpyt;OOeuilAS}hb6qT;EA9fu^e~FcfwwEZ(7V3299LMF<S&H
zb;}*Lk{Z}UCM2_-raA+wT*Hs$N~iDB;tp=Li8^JHPB7NnL6&<~dhdnHfb`XvKq2pG
z<4g77x!Sx3|L9Ig!2OSv0|5;i|Eqw&ey`ll(fmJU^Z%!S{Rf4iy`iy%>7Njy{41e_
zp^2TH{h!cd{%`3m0Iq*_62kxULB@8r&W-@1KRF}mUmxsZV)iEhwErG}sk@<#z4f0h
zgXXV+%mCJ=W_FG?hR%P&5Yzt^j*+30DLd<*!2KI+nVZ_0Is%OUTu#h?&DKBR{I$5q
z{u<B48DRY<D`WUK5u82jO-+nEoK2nn1dRHx!E8*O4V|3r9RExS{tce~KNt%;JF7qA
z>E98NiQ%6qH{D;a%H?kee|@<q{u;^2)X~J0`F|$ke_%MV{0YpzB^KxZu~>d1{Iys}
z{#q<Hj(@V2f5SR|AOBzZfcm@7he_`id;|ponuP=c68&pF{$}3)0i-4>F7}5*lQXmz
zv9mQcbpB#&ZRq4wtfpg!D~82qY}XOf*-7p&Yi~1$mPJvX-%v~ju*jsCpLYOihNfQn
z!Pm&Kao%?3GVectio(!w>+deqao;FOMXV~^L3|_fjGQdZRqr~7)s7M<%5yitvBh<_
zVfH@swfh}r8|@StZm%PJg=}*m1dODmtXM{Be8&;xPIJ0nhoU-dm?Ey{g>!T9Zh_1*
zp5+IYgnp!qLr3Amm1WEm3e|Q-ity+eMT%S{)PR}w5`r0c!O0I(VB<+z+5k|lHMN(Y
zo=PuECNudZ3`faD^>jWuH9FFyWvea$uv}9$CMI$z6*MZzrc>WdD;`5flXy9<FN{i*
zy3&>0_B|(*($7*c4>NZQDknT_r6A@&92$wUL*XPn>9Tbaahd0!tSLn@PktP(#@mvs
z5asF+=SjqSCf<2FHPuyrf9H}|Jd5~JyvijmLfc33Sk}{%Ghv<QjqT3=6g<H&rf>?@
z@FrNbHR(C?bYM1FZFNQZP|#!lEi)CTNku)wvzE^*vvCMJ>+<kO*1RV>Lu)n%Ao0s;
zU9*|9EwAySOPENed6>X+yovROA6Fr-*+4@Vo7FosUhWsALYq3f(lw5+6#X){z_<2>
zF)8@EYq%Wr<13V~a5Y$t>j3tb={||)<pD|z{*a<Gj0h?%Dv`zjHzx+h`y}Q%@L3Go
z0;Bwj8OUD%i6Cm&rcG31j79sgz#tdYYH&n0*k+-Gm0xZeVB$H#_G`rQ$*f^ZjrILn
zqfPD}K4*{SG#xK=rSX%ekVFKV-vf7rzRh~Q1y`J~M@&=LUezHsrC+sbn5iw^oysHS
zI0sR-?idkuHowaZvZ>8d4wzJxtk?CJwAJ%);~tLSN2;mXtNrkIfEn|a!_wxxCRi%@
zXxXrh9A%!8F6PmjsV>O@V?E+RSZF$0bUEbBCBDD1dAp{crjAl~Hd7uqr=*VAWz2Q^
zzzmq?8^7VIX3hdX%0XR25(r(Ja*7HW#e7E8lmqPuBh3OY7q<|rEhClYyZ(5-I!1oF
zB(3A_3M%Vij}nvS1K|mS-ludkfbou`3u)=GJI-dNxFr427HyK-7~<3a6XEeJ8dUcl
z-x`2$Vfo5wo$m(=O6x}W!Ft1@Oz+I^xhRQ^s)O|oERu*yZ44_SH$Ivg+VXZhCDnI1
zBS9K<MwWmV>&VuJCBfvs#~GdKt}t7+2gX0d+0_TuweyZ&<?^m!Fnfp>Gc##`kaEpj
zRrnkjWjuhz4O(?>0myP+#Q$oX0Eg^IV4m@ncvtO`aA)0wXF)h-uTk%9u0cTLg3;3m
zk;!_uGJJ`U=oiwPTox)~+9O9FiQG*lLNWGikU+CH_KOoA&e_L*=8}K-6_v`nbyW}`
zAPO)bAd&yBUlDS2H1tq$vH6EvQJGRe6+-(Ay=*Gh6P&?NLbHIiEw6(L#fro-CI=`V
zC|>xq;ji}EG~2p`yFW7A?4mQ!5JXYRb$Hy_G2IMh(&JD+>{HwwPI%pQzHG*Ryzgyz
z0e#n$A&Pk+V;-i7k2E9(hL4oijp>ado<iD)!#}ZO79IDF7HU6Z2kRFVa~%lE)5F{p
z(RUkW)^f19oNu9Aco{Bxp<gj>(R1i58Ejv-h5kN>TfNq(?WmyHw06U%)g1UUhCJui
zbqRT~aCWh?%X8@N*_X`%6MNSvmIwHW+HA7&{Fcuayvw;V;{0XMdu#hzevYOUxKkDi
z!x85&u3@x+v%kDo5w7j(b)DEPoY;rA^yBiX!x6M4IuX%Ex$;NF)Wy(~#}gF0cJc(m
z6KL!`ZV&)2nu)bhbOD}M$65;A>C~h%1^s6a6$%p_Hq~}Sz<13OXUHJz+FGM+9Cd8l
z4~<wX=3hMm4p0=<A%<82IV^IL3jC-$_DV7&azZ6ZTAw@@7q8W|YUT;iaZb??`LTHj
zgY%kU7x*DG@YEQG$c4ljTTp2<&?98obgS4Gud}$d8u@EP*znOTSfqe^1_M0Q29*c<
z3|s1J%vYg;229x8y!xOUM-x36K!`{bV~1c<+$;hzb}ys_#{=MlG~uw}6Q@njxB5q_
zKzD{6G9OV327jw$rN0*UgwizJj|q}!yb<uUeLZ3u{Fk^{mx4rc8x(U%8D*;}5;4gF
zM^P03Ko)s}bI~4QPW0tH*Cgo*1i+Fn7U}y9!?4v3?aW9sP-Z_9zOnV|e-uD}C~esc
z(|66^WuMJ&#TEMRDy{h6bMt?wY`mO8pCD4`R`RUTfeey>r0hIunFFd_$*I`>{4wys
zE<h*6RSNI}V|T%?h%y?|&tG7)MTZk>AdzqUe7s|7Kl}df>;tri>6C)CGQG@jqbD>#
z4&*~Mn1qXRm^DKW<bveXAuxen*q4ofo43nQo`?sX$5CtA^oQa@WjT45I}vAn;c}t%
zfNriz?n8lvs1MW5d~_cf1_J)mYmz|S4%PO3(?VAk|C$o7Nw`?t9x6tZ770fV>p5^H
zBTnsfAwKlC9G9m;mkYR-OwEZc8V_j!y|RukX_j=4MN6lFHehCyI746TU|rN~Hh1Gm
zZij_AYLg|9!9mUlDN)Zqgtj12cB@ICM(;EXFtJ}i7n;Ee-iIxQBSvLSCNH}KJBs$Q
zqC8hCjkSKhs{*0YG7F5fF$U=<XDgis^>fhiP0g%Pce1FBTRhMdBu%;`{m57?FEV-f
zFTO@vThb=-_az(r?Q6vUyO&MH+0n(=S<w+-`yW}1>XyQsBHHJr`Xbj)L`o4M)ii&6
z#)_~!e?dhiGEiBhsICo{r3)i~`4aNOO7{Tf*BetXxhO;micdEFH~*|PIb_J60~CBu
zzFVzF_dI;1A8o%rN&T6|ho&J!B(#SV;S8}RpbR-9kinTpqr?qqZYw~R3b0hpT=4g}
zZ5c()iby}SLTlqU4W=(!JbI^7l4CT>tq+2q;*PA1P_&kIA0eKK`dpOnBJ0L$<6Jre
zM`YG%VCskOs&$wn9d*)D&Yp9|)05v-R64eG$aT`r+%T^enyk7se!N5GIKKlz%a#va
zL*9~Hr-yBCZERQ9snfo<${D`QoorFTB*_ODr4@OF+g5|I55xt{H!k6!e_b_GPeI40
zIl^kgVg908l#NM%f6S|WF*Z9D@%BrSeeQW)E5c6euJyq>JvFda+YQUe%671Rpg=Xf
zE!GFQkVX&hf^%Zp^Ii$gTmcDh2#lg!lQbr?rFa}Iznrs#sc_mx_=MApX^8Pxca@Q3
zcHC~OTzsFhAuEpQi!_*u*hl!avJ|IBXw=N|&!3Y7kNHEIubDi$NJW$$_tQ0p=AVEl
zgnBWbW^pEGtf6t-*J)|O%r;?z3JDX`U|DVKDmjsMgDa3H?7DdWb0NqT_Y0te<Dz8k
zaHofDyXi2<95_EA)HUqS;OSe)V8Sb59cTp$DI1em_l{nZ5_w7zwPUHO2#Z~`3-N?i
z0wZG8nXE9>>VlB-!X+H9w>_uOX#~?H5HV`e5q*+*nNwv8ae*69)ny`efZjLcGJHz0
z>@*R~wi7zQ_cQhOJ5}VuVKv>J!MxeGq#voD)IJZ~R$XW>Ur^^_>`e%5xL;l1YOh#B
z?gJpNjz@w2)k~2G*19$PR`-?P>dyb)Rd*p1lRsQkwu+1bsuJ3#?UCBrEu9#|sLZr{
zxoO_QPc$Ybb)&=x*raG&hKrCGglum4a2-@%Xx!#Dq20jZ@Sk7v{rZyKFQ@YiHkM5$
z`JOtuuhxBEUT&}ifs3)=#)s_Dmx_i542Wq50|T-G&fs2cN)N!?u+V7RNAfoBZ~_hU
z4jTqM&D~_4V_#BaQg$*ljxH&l@I9_@7V=IyjHs4){e}*0^zC5UM{skNnldMIYR~j(
zue0gf=a!oWt-97+Lq3Y?Tz{!{u31zviVhOE@$lw3YMlWtxVHA1#&mdG8_~n?cWO+$
zma*V=BJ^fpuoCz_Xr4YiLgV$(N6sQJJhYODcO^quECMj?Q}gXew;wPOJ$S&dZ&~}|
zxHxnM&*HAuE%XviD!wtTkY=u`!^yG?uy@3K#k7)_yyptUFjWUp(q)cQFhF(%k<nyV
zl*|jmy>(37S^RZ7Wq&|e&Y(An<bplPc#rw5fuj`NdiGHC**Q#7<Wh-u)eNh#qBE7<
zJsvdFomACVqk^%hQX0{W2{v25{+-$&0VRQ{ZN-_HyD<dO;7Tf-NM0E+dQUP=HVYyT
z@x!^R7YL$#B$rwnLr+=V>Fi}Mp(3dvDMcAEZnCsNSc~I9A*@vH11aA8`x}mz*Daq9
z*nu9WIPZC0Vpb8i3RXWWfaP8zQy9U?@Cs7ZFNzz{FpWgbByIreKzt;(k4kGd$swIp
zZgM7D$<i220x_w{Hi1JpTQI{>7`iP08Wcbw5NP(9X98_i2YW`F96hUbUUN8FQBIJ`
zT3l3mpXBN<r6P9{QfG)0JkA^~Quc7#B|qGyB{BVlWK7W<7Ik<w3ARX!RpAqEMKsJh
zf;E~XA^2-C@S%uF!qo~=dZh}lh&r4{!f*=o)OhajYum6H`YTzTLwHmUpk?vDM(x2N
zV|&uy_R#ygjOF<E?BRFms%+<`V(Q>xYHMuzhan`Ztl6S6A@Rwx%i%PkX-n%=XX+;^
zYN8VeNygD2swI8X4?LDRuwEaqFfP@%gY^SPX+V+m1|kf>ssRCkrs8$yt<T{)a_4<J
zI2)P*A}}h9LZK0(OO6*YMG>Rpu^UWZAX$viG*4qBkun~uU7sx>yhV<MAi=xNSu9jM
ztu0pj6%wD~W_*)2n=46vp%UudxM+Bd(?*F^qN5z{%6$$~e}sZ<rp>+U@D2B5_eNq0
z!KCl{m_ruSgQQQA*2;uK)=OgNQ{71-_cCHnwQQIvbpSe6#TMGUQTvVZ$AMmUh9qk-
zY3>L5O)sr)A2x=Yvgv`Mvk6K0ak}!VBW)u|Hm={S2i~{c#;`v6cv0~fUG5(q5s2YK
z9Iy9`*!7HPS(dB0gzQgi(fyY6ErjP#nK;Q{yvMNQa2w^AifY%LG>TfZK?9D)#ix)G
z;)p?Vp%2|uI2<-t6T^Bw6_k~XBA%54qf`oaK$gV7+73xSznJlV=`}CYU6vwDq<SkW
zI4q3ug3J8%;R`2&d7p3#C_@oSJXUMous(SElE<|s9%9y<q(^+z5SL1tyPud{P!`SW
z(IfnUSNRFuntC+s=da<1Xm<-2;-)odTE}udTpg#^$Ewa0%eER%ti)HnZTXWWXoblY
zH00cRGN9EZG@O}I@u>R)_}rH!PaNnW7wG7xKbP*<IbQXg@3#&QZCGdw2hv`-^@8I^
zVElDH!E5+`^}`hm>_FGxKtOzOKtKk6t#|*_xBW-IRrv31S5;GYX9X8$dl%<FGDosM
zzE(ChwsSPma0ED;I{u-cjp}PosA_nh@(C9uBa0~*(lX^lXtHy&Wx65Sd4fNtmGckO
z1((UzoQXH!Gv#16mC$M!Udb3Hulu4%5zEk!OHLyZKS^IX81sCOtZ_#bOF1pf9BsSr
z+I_mma<)F-9bkH3pXowNnb7)$@G3-E5XN@GVhWiouo5y2@rR7Uc3be|hZ8kivZ8w9
zbTfDt1NT&z(j>=hC}K>9M|&fT$U^bN3u}!zqgl9bnOKcvzo7<22FBEL^Wt~KBnS=$
zo4=({8sOD3SEusv)?c{bjjhFVM2dZ<nqD~aY)#T)WGPg)oF8wqC1$gX)hVxkNphJS
zE%H3?V6)&chk!{jf303()>+30sJQpoq8Y79pVR@NcY<ozP<=YrG5Jon$~AN^L!c!*
z)CRCyrPIb;geEhC9d0o;u2$L4Dy9dlF?HUR+qxtQpK2Fj=Gfhh%C}RAnS&277Q3v*
zT)077EAXZVReb{^;VIM0YnyVd^eO()Z~zb9wfNYI&EhwFZ{3>9AN5?w9#92e>5cWi
z_sc}~r;GQLe5ts_c-UEVx-U(S@#+fkS(bp}Ei8Grgpv+HvOF&<x>EqU6W0V$(oL#9
zph?65#DVQeOMa4&+`aRC?Og$iE>j}HpMNJkBoM5#Pi>Kj5W<JWsbq_eiqn}4%&2|R
zv@$RvA;!USa(|;hl!mDij%lu1Vp2Mg(dbB3x-w%dh^c;CYh|8*qceyj8Fk@+6^G1M
zUjFBP1D9>dZAqdGxOsz$;{K_>AS43!pmM%owz_L-|M+fpT}THW|3yjo&O5|$ngKkF
zC=biai8Cx^k=hTT=6zGTbS)4$Cy81SZDRxY$1_gzZC-tr>4>5L^J(FWJIwe1HBI_y
z&Lg?<iJmj}{+>+tF+to_a2x`ndY=+oSC5=CY<CQ3d^=osNKCkn$S1hUAqY^OS``D+
zxNv-aQ&%v<ODM;N!l<s8NQliL$f(2K{2aDSsAyHc@Q-Z=MUHFYN7(bv*b0#$2?+k5
zlpy`TzIH;-^Y%Cq`K*pAUX_AmE6mz0e;&&4Xb}sQmS+Yux#dOei>Ob`i%y~DO`jzR
zZ4Z&|X_eTyC^Vm%6g=Eyy0DtR&;*$7G>@iy9Z5{@>hCi=()w_pUACo57aqtXaj4XR
zhQ*}EDRWv|qqVYs@#ah*r_@rXB)Gt;a$<{rZT%v>umE1|v+xwk%R(xEj`*d5fj+%n
zeWv8map$J(D-F%iOQ-b5Sdn3>#i23jfLpI)7Yb23#C|MafUzV7>yh?~vVR7|3Q)@u
z(Xs<KXGsfB2%{Z{Od<zyf&h^-=#I%6yl6smK7MC-1nzVVEnv!_Qp2+ish4VZIReL8
z1hr|;Lrlf-8A7|XQnTXRa4Xp+nAH;XYg~O`gOyD;(JT4sny<JsO7cf9-dS73Ky%nv
zar%>0(*e8wJa{#M_-eV`@=t^6OKIQOjPgSjw}MR49r#rexr-VDC1Wy8hS@~~z4?KK
zZeNZ4Lt2Y8V!3Qf29;miLM=?p;ozC%a(455(W~=hmaXB#K{{oyLT!ZVmrn-3L;A<d
zL6+x(DED(!KFeXiflR}5#CV5u4E-yIqy5Ob+uqPaH(c`Dv{?fj!Mc5Hfmo&kBbh{;
zk>!S{`uocY+w4xo#0nNc=)ko7p71);Qi>c#-WhJRS%6Kx`FE5llKI5}M)KiywGiR2
zb=UkAF4m||O5~;O7H!{<O3C2}hl!^r1TuY)XsU&2@*Kj&=IH6*{Go1%N+h)=7b};C
z#>9XAh-{@GK-mb|nB#h4lY@7XZ&>(}A?oAzR#PU`wDcuMv>$$6SBwt33svpAh2I-l
ziY_H#+5zqJEIi1MKs$4Tr~v!2F)`4|1pQqPWZE5;yP;$~Da$1(pL8MEcobq$ujH!m
zx+>^<WaoVU3ZZ=h9Ov`)$Ov0}P~=k&@VC+n{hD(Fv4wm9iVTBDtgaZ@JN_<WT)Cnf
zJ2e1DSZ(dOVGvl1pa{4Q&$JigM}aW^ZDuV-vLEQbMjLNqmxS5h(S{2V2uSvS7Hy<#
z|1;S95m_cQ)--U%@IE1wQ>>c>Wdihz?=ymD)pXXhYqe{alN4$4Q^Oi<*+uGDSE4iF
z={HXs^|!ng3FjgCUh@zajPvPBF&`$nd%Fks>?i;zM1s7fIiy_YljFX3?Z3KKKJKp8
z+<<vw6x14aq5|t_PVIlj+Hg{SIT~c8$my)b*>hLw1qWxKZYdKU9HV(>93ZDKZ~!j}
z3?{@wg-M`UBq{Du2Jn~fqRhfD(`W}{=mZoRvDCrnD{WV}gK>f91a<^chd%m}4j~eR
zX|fX@l<ZN6x5jUO3BqWyxkM<6){QyB!lA(9Fm{K_58I<~A+<`Qt}cxV-ZOcKig)o4
zCfw8LsLh`63PM>MQxrHEz;0sbJSajXv_FRqWWU#HI>0aH3)t3z*z!-@hMVZCRe6!k
zYBv_=m7_f)i2)Q%;Z&0!4vOWb)(sD%s-YEEQdeijT(qj|=GM-3Y=;q~`ksM5Vd`9#
z(xa*~UnSh@1tjAmv2#$iatMUw($}k;Mw-o~EVb7?7b6$)p0yCTc?FFbj*7?IX(y;y
z(yP!=K3Ueq=g-hr7%eLW@g*rYV}(W{QZw*;$D}P+j{VByu~?6n8S%zk%wS1w)1P?q
zcJ?bZxQm8yY=$%@oA*dNhw9XF6>3)C;zk$RNmZ3bE&JXd$Dk}!8Xj`UO15Qd(wsoj
z%&-DC5#{to7$poE<gSA)TQ;;$;R%f6p~V#U(wQlmY|PvfK~31)WHycYzQK^vB~_uQ
z_2k4vZPMuuL{Ly1`>aAR+p)ldg>AeuDUpi2RU=Ls8<he$^g=7ty@`w}`%sG1XPnjJ
ziw5**xsl3p4oL>GKU04(kjC!X5zDmmBQR`8s#|r<qD6RRb+~7!2Vi+vdPVdoVFWyg
z<y|x1oQz+}GKG2~;?cuht=T9U`Y=h;a-uBJO^)9p9Lc6)b?Gfw04ETN3CxmDpY^=Y
z7TybK`kIPh&K`UYMt|WNN8h~VI+qzi#9oR8)3})-99<|LpCZdoM0a!E9!-Jeyt+#L
zc$#`Ckhc<Dci|di)2s~lYUvtb)6_yslL2S5{|%-Z@C-z#tWiB}3%90j$u4b!&74ts
zgstyMh4*IB6<0AaJpI9-t<*pC9Bg>KW0GNLPS4k%8}OyfCsh3e{z1XLsBcJoDm=l|
zHNlWi$zo-%kET&}DFPBc-YLdntB^EixRt(EGs2x1yV?~m3=_B0q;$Wbc&$}6PkFP;
zG$vK-TitSPQ=K($4qE8jPovV<xx(SVKJxBNs!sIaky-)9E+`D$8z`lcDCeA5StK>c
zzTROHT&Ox1Eo`{&HomQj#}o$+Q@kt@Nw~Tl1GV|gsK(Swt#n$70|iQ)=KD@$HOBg@
zV~ox-Zi2X-TW*Ftdc;@=yRnin)!%!-BxV~k%|6OXk*85Iz+oryzpzKX;$DBqTDdIC
zEKvvp<@SGLn2utY5n-vSg?!7K%Z5Ir8|9H|51uShe1flp`GEQQ1c!qm{T5OAjAL~j
zowTUGtdv9{-lxnxaKd6PuoJRwWs%1xnD=d;x_Mgj8`rb3lcKR#`<3wnU4vF7*Y4W!
z8c$G3bk`Yq^x3fEw$AJO3qtmDPQgvkyk{O!O%a<Qt%77Xh_YWc9{vDvU0x!8I>Me*
zf^)FCPw=T&?))`cbhssetddtqb0?Pue~UTjunr$y`B6dnsR^W_NQd5&PJC(`>w@YG
zP=p7MRxrjsd=7l1GgEhpu=w-TueZ%&d?{k_7;X_U#Nr3N`d2^Hi-u8)sy%+~YQ%~B
zklQvTIQAi`Ih{*-yoo#&S(8f7;cEC>U#QIe53HuqHzc2@mm&`0I<%H>gmsd8o7+8v
z{43?8OXOGnm3Q2HKM63uq=F5hZgYn$Gj0#Iup<lSG!h<32h?OLaYG!jvMz}71VI#-
zM)hUv{1toR!g~y-)8N4P+Q7N{U%Z3$aj{CG2R?8-rmW$+RI5DOzaalLcDR37jd(!-
z0dfD19mfC9V#hxU6<L6-=^w?4vZ<lTe-<km04FptEME#QS!P+;C<v%BLI2tnyzl<N
zU_vNR6u}}Wf<l8U8zW|YW@hQx2D<@aR5Xt6TM^#*qrx&3Ci(LP@B5H`LEQr=HzzrU
zQhmbeuqw%yIn8gKxlb3HTfaU}VEl_YzZ()H;3!LK>;R`3Va><&#%XF9vC)No(A42n
z49viZ4;V!D&cyWtGqvE{Ub9B+=52fH;}LuxXbHu<HX_%by=F!zcNgoo##?mXRbx^-
zM`r3S*)5A$QtQayO$#iu!U4>wP0t>+qzA{dD2tdG4NLOKe_=A(*Ja5*uD5pjmhQNI
zkKp0n7+7l_;{kJ+Vo;Yg0L0-@Mq_7c-H+=c2afK=Q%(oipNKhtF+zP&o*WJ{KH@5!
zP$dDZg!1GZCs9(9wf(H4x2p<#T<)%|^}w__FAxN0GjcjRmYj8lj50|{YU8IgHF3kP
zChr0@TE?xrg)w`LD1}kYds=QfaW#$h-rR5XA&<hYLx8cfo@z`87oxR7;JysoA@{BU
zO%=8FJYO$cqUXR&UX&OjEw4;0HOx=57hmH{VRp(H6fAd7{2|BWkkh(?r%}tfsLPG$
z6O+Aip`CLbhhqNB+a7gJb-=z6@!e^i<qmbZ0DErja=)0oGl_&>k8&@=BX@CPdj$od
zvsjsRA;vL@{VM+(h*5$pYF$b8O(+@_A6y7xJbb8moWV@!2MG-9ScK4i-qlCJt#{4k
zQ`Gz?y|{dV=pc#~pM+)s33v>jg(+DF<`zU&r+i%`B{_vv1o@0Y(lR#iPs6>u_+cDZ
zik-PKb>x>D4vTqt9)XN8u4?lVj@3$(9O{CAZzBEj%0heh5M}O>HbT9f<L~grYYF{k
z(n)6t%M<gN)Ff=^#)$Il9aM!z=fn^tzr;6k8Hn#vLgg#{+1#L{t?UXJprVEi<Ls)o
zuw7Kln#r~MvJ%kOzTnLw4>_XSopHvvIS))%79zu{WljHpp>(0^Gr}LM)g6MVG;u9y
zRvogeoVlS|Jk|WBLL-0Xj(K(BPDohxtT-g`tT{-@%|B?)olz?~bo3lp@bNhNWwdw$
z#kJ|pr?fMS4gKrsGt5`iPaep&HFXpy;YaKgAug=Bn`nCzbONPXgVjo?GeVW?@n5Ti
z==1V}=#jN_1~-q?m`E^}tpvF_HiP!Evp$_s?N?Fl9p>*88`$efHzRtlk8Y+;E0{%r
zZ*~ixL1SJYepd&|fyb!u&FkulBUWU`mphvE?)a+lvzYfD`m}sYd%UMwD+8WOY(EF(
z;v60K)*#T!espH@CDfF6JjbQ7zL13_Xw-$?mSBdRB42oXIu8@ch}W_X+S;y76m!3f
zD?^5yX5cO5bm&9qlA0O;6Pn*C2E6Eoycn7A1Uws4?r0Lz<Tzy?vt^n4FAKsBsr&Nv
zf51$_-^eKLrYbog=ev)I1@F;=i+&;T6pFCe<3vS{!7bZ`3b7WYZ+nqn4|Z*(3?b@0
zqS=?m>OGTmX1;`0!@tEXi~1F24}1zglCi%Dhwh`CahVHS#5ewnkvcG!o{7=>!~~Em
zfL1I<?&Ae56>%&o_6g&6Wv<HS0==sGnN9w(VgYIw@!s``y(PD^#S%?<z9=S@$UD=W
zO9I$pO;~32k&@<MDRMkYi<|Ew))ssy#d^|kvw1kmh_Ux+xforVUOnwkRiYivZvwgW
zB||w4Ta>^h(U>gw!7uHIuThUn@-Sn%8#ITliXTmLRQ-W$ohu|_B_jG|gyP5<_$hcQ
zx80S>Ij1DJh!nr<FijjDf+<o!>d-hXZCznVJZ%~j?t3Izjs%7==Qts$#5Ick5b??L
zo-+Vrs+v+FEKAC{Aab%sl%cmJ_)8516h{IEdNn4od(`zhS&(ZKvnN)8el_Opyjnx~
za;0v_my!}Ix*VkZQlrR-Dw0Rk!uDv%TjV?Xg>Rfg+NYQFdA>4dc4g7cixYGPi(>QH
zK^TZ1%G4UkI@+)nEkwyv#^mD&ebdA5&`!??=HQ)TuMU^FBp+b#JW}TJGO2s>)s9cl
z=Xq?fFAK52TBJV;yqHLwG1;I>Es;NA>($ObQD@}Q6+*3+P)`)VT`57l#dg8R(_!u!
zQG3vDLJ~d1qz+pTm}*ELeN5~SNYwG|UeKQ!KO-tYE(N|S4}1SInCbjoYR)-!XYie#
zNJ&BOXi*yz<|Nh!d^15+T830{M^gI*^sljR)z=+{`FCd+`S-}L-2W{0{d1S@kHTqC
z?U%BO1Qs8zaCJB`Et+LSRW$g4hL%tyyHq-|IDH5;4O~prFY5%kI`>wucDJBw1@lvu
zd!bbUZL@P(LVxoU7U$OsImL3Og5o(7$EWtpjVte?^OsA$=d%|=AhR16h*HLj0I!zQ
zzH~U0G?!E>f@-hlgnU0CQOC<yP_3Dg*J&zRcoA1>s3NU<F|?5|Vm0M}x;r>N<2_us
z5iz}FICBkEqHZhO)M716(_FJx4QH`u+&(I|tsL3a&;mRe+SU11D6wK<4-wTy1P_r5
zN&T4swu*79(Ns2@rjUb!7P8F>YrV_Wg%bG8NBMWVc*A3=NqzS)mtRGK(r06X57E24
z0r2z1AEAyN6>;Bi$!3)%5Pov|+eJ<f)BI9T#dJ}mb=6dgeC9L69ZaVGLE6F)$Q|p#
zVB2Jtq5kroGn3D%K2oL<(NFF|S~v`?gy)xtu=S>UISSJ)!d+RuJwj8AuAUjM(Ta33
zpdPX_y9ON(!-fu(+u%fp722+7GJFozO>OK8ccfn^yQ11q)+{0+s*b%(u`<pr&46w{
zw55a{s$!@E7CK{mBD(0i%HK6aX__6sgd1UB{gG%5r~}QLUB3vka@rhQ->e#*rok5_
zuw^b<QsqNx0t}z|<y3CPC{6u39nO^=3lV-o$#x13-(lZuYU11~Cel9Q_AMu{AWc2c
zPc0QOljD#RYdalz&&FrEFd5rrt^3`~&K@f><WbgNu4m@n&mR2tJiKtLh5?Qg&`zm}
zB(n1X{-9_?^@00s+KHQ;l9;}#>li0gC%bJQKk8`H4KQ0!a_X=YO;C8YPVgC1Hr!u`
z+3-fzlcjO&8G30uH*9g9y$tyl5=d}Av(VWefKBFrYEC`cgs}sfRrurA@HkNPcs9?4
z3$>pZcj)M^n1RCZk#gXBI0~oE)_MMfw_|#DF1uTfBFD`shV54iFy9`J_b`PUx=`<F
z>Df@{8IOIVX<q)H_PPcHnAcJr913q=lv}^NM)`I+I78=>ioMNZ^-@u9!w_0tS)*@`
zI;1=5=5tED3^5XXXBBd)vOrO}c?KWZ9i}`!Q<`LjVEUG?`fB&D#>ow=#H<ek1SF01
z`}x<#`M;SBmiS}r`1cJRQQ<%A^B>J}Eh}f$1@upOGiEpDR%Cc6G75JoA<B_$pxIT5
z4pfp}G9qXqqQT7c%>7-HOZzK)sHO)sZJkDb+K4J`r5<%1bnvK#vc`qQuFA%%)n>hq
zg~rp|YSYGh9yfg{7GM=;`ooS#{<rh`r{<>>j*i=0^H|Wd*zS;JxMbo8J(y~k-fs%S
zBb26h<&L`1R-^Xx)hhNrT%*?~%;!JN(Y)1r;fYlZsKTa3KJOCleiGRxBxZ^z-h>d@
zQukN2$n{<vXa@0&GQqn^mw!!~(5qWP?b~RP=+)<H<p^U)-WzLCIJNiVBIwcB4@XQf
zCL~Ch=&xI0fPO>AgW5GfiUqq_AnL*xD1;$^-8p*}hl<^U23LT2=2rhom*=AC>Zhu3
z@&lL@ryc^bst8q1^W(#;PGHsWfde+!xm|$If^1{q0z`%*cAYVAh@KW5pV19VX{dJZ
z=X=>g`$Sl)*{sx$`txH;yUm<T84o7f^dw$&<?Ztrn^y$BQ)>lLo!c>!+v1C0LDSHT
z>2ld-D;3IwcL2fQmtvk6!5;Q3=RWi@+JZEI*jnNx#G3QNW^iw3K#19_Gp34fi!YyP
zgQkOsf%3O>*0$N8&d?1~?1Nr)S%u0a8<olWNME;}{cxK-E&r#2n9dnrQoynq9sX4$
zj>xM4jiD&RBwdx2jg?*(_eIHUHEZzv;5_P~8k(=(vopPp-)hcHF`bIR0oXz1j|rho
z<8Msrd3l^d^GOFpqIYV}?VjhO@_fmSmKKAG<C0R%iCC<z$Kef^I4}s%Su>XR*iAMA
zED7NAuEGPhQqy-3txK_XEW1*x0%sA(kGX8x6jkc}nyylL16U|&>i%X=#rPid<XnUO
zRxtZ)P4TnT2^>!F9utSWwS29lPcZr(FDKt8it@Ip#Tx_VqFlihynZ3TX06z@RFx!3
zAGb3TliKrNuu{aGMh>bovKvtBHuVl?>6#$cT;9W|M(5KhIAt2DRN+#Y7}CYV*DE$B
zHyMj(QuFcISfl9@fh%pVI?u8}Mn9Aoqo9o?NJW;63z9gZ=wOS8)2&)^GOdlTEalEa
zdZ;sQT6s`HB}q#<oJ3UFoV95;g*Fd*8OM~mh?=2{5XQHfl$!pna-hx5q9x6|Ti#Hv
znV7*N#dd3GxjfCV0<=*S0~|78R#3V`zMP2bLXkJiWqwbOIYhu(tynyu*Cu`eV3XgT
zsAZ-aCs_iNYd~y#V^ZcC1ltr7?u*3ZEZ62jJ7s7V&!obw%x2fvAo4>lcEFty9<&kD
zt&J5+w-P?IKSx&$Rvn6KRlf{-C}-VAk?{JPqw@)l6@??sxWUw-BJi~<GxKN+z%SK8
zmIv)+FcEHj<xi5SRUH(2rrN4dA$K3~79Av^Q@=6v7VT9+pW4;}A5!&3-7-4$^<&PA
z*mKwKg{A3=#svSu4KC=>YpNAA8hKa`Olp+HiPECTO?E~4UExfIlBH}W3z#-LOr~_+
z6iF3x#&fP(y>yr7xjG{Cni;f4zMx=AnsSe(>sx9upv#Z>2YC7HLR*K6dS#%7>N}7C
z+HqFe%Wc|-zHJn`UoheKXJ~!38z$Cri<AjrR_f>Sh^1>5ikSjz&A|}O9?0&6NN#$=
z69;JRF#>GelD(prh>q&rqvw!_ue)J^U(27_d`V;9=@UyMwszpryN3u>-;rg~Zz4>r
z16Izkrd^Fvd4)sM;z<EF4IRlCGMWf;WA``D&+i8bN@*_weP25T&)$?osh$%f1olk9
z{giGrI;a`l3J@cE66U-k!77IQpe3{#&3l?$&GbJKkGU*b+Qw^Ul6A3mp!{0&7Of-Y
zzmFnQ9lb3JU%b+LVY0Xyw$ONIpwCxkRUSf{XWWn>^H?o5xivjC)O=;S89gU^mCHxJ
zJ*l!^)e>9$l3T0yb1bv|r(`Kbs*hrw)`i<srOnl@#$*ZNEeJ4Q@@^pmBjvk(MWt?e
zl2Cy+Qd>uasdS6G*r9eSv#n)I=P)o_IQ%a=eS)hTn>K4e?`if*X>!eZn&fGEo*L|t
zEtW>bJ50XT=Mvt_sQ0PldNm1OYS1KXtSN)#boq-(vWEcoL(wKf?mA35mbiC@D~7|7
zmsUC9QKIY5k?sbqGUjsWc6PK@X^D8c)0fR7tQRt#S4m$OlkY3YEAE%iSol{mhpH(P
zZzWL``|C_N-&^(OHR8#YR@pifuH`zMA78Rmw;#K%hNb*WgpD6Z=u;)E{CRj~zEF9N
zsk|41vLFw7FJ{41n1=MuJk&+qdt&6B%b4WvK1{6m<}YcCW+eJh2UlvWeA%-mf5!t>
z!Hp4S4eO@@a6)YTOenl26hBGa{5-Ip@~3l2P4EIAosPk%xqNUfNVeK;fyYeDR2E|a
z*#{%AwBU{Mw^NF)L$t!RIz$vf<_z8jraL_MNOGCL7MJ{zd?m5RREBF6lW(-Al%>RV
z0yTN>nnm7YXiA<*n&yqBlC!2qf{^2`tED>U?oF5yh4j|VJjovtg^6HzimEzyr02lT
z09IKV!0y-(4zVfgWY>%5jd#}4gbiVI-0ymC%rJy#FwN2ra+x;bN<Oh$XzW=FlRAy0
zL&e5YKwKxDP>9eL89#P|2DX`h`b+$1jmig7n6+L4lX{kVw&VLuHdDC*w|qXJ9Tz93
zL<-OWi00R~Y3pIEkhaHKAQ-$-dM~6pa!*o@RpyRRCZ%3@&+h*^!BR#JgkYq7M4Y8l
zE*8o(2Z<sr6Pp*~Ga}<I=c=sTlZb&uY=cEeBUD<~qTWVM;$c}FnfV^SD%df)rJ1>x
zP3~d-R=Xxa*fFUxxp&ud>Ai!0I7Yw@mr?-f5K<~DBAy=9GM~wg;d~csmdlaRSzYc7
zc2im6FHr^KzF#qWR1{2!R5ar<zyxZyOZZHA>JUw*$|NF-+$cOYS%|W)8~?#OLxz#B
zFUl2hNZbHfzj8_B910coo$A4Iv@<86hS;E53}z|5F0&pNDi^nWMsDC@!aZb9!vVG%
z$aWC7C0hGDpTMhpWWycyX<Khy#>N&C#0?7FErNQP)bdAa#s<CFE&(zzUqt%=Z#9R5
z5Asr~{Rc`+yL~R?9Di_Gze%4HKCuhb9AD4t4G~L3aCt0FBl0J%qz~-DemfjFuLE=b
z6h1Fyb5rniQlWpaF~TyQ3!?>d&!zZFH<CuJ$89?lZHsP4G_1U!NO&MUOurP>@~YVF
zqO9K_v?@_t@yL3K7alrgnY5HI7ba%s`=G_FEQnWp#F%P53GRJ%{3ukvx5(bj8*&Vz
z=>9-1@WH+00UlR0Qe@o|(DRez0r(~fiwH3`>$G+cKgk}kY+paz!be#z@OZ%#&JC2)
z74YGTL^?9zUFDh}x!HDBC3(3#|F*{TsgElA@cUXUuzva6ST^~UQ7A}aNjZg$4nOJC
zV;zkm6ZQ;g+8f*<Ux-ugJxZl2-FAHHO&o3KewG0Si6WP@<Ah__j|&8AP6XYekq%>2
zS0j3l3aPLCbZ!8_PrIH4EPZV;s+679--9~f|C+_(OE)=n{T{~a|DDN7{B;)l_Z?~f
zw4Y4%Z(GZR|F*U4Z_^%zM%MorNGevBQ(03-<C{u}Q%X`G>Lm+KLb7BOrG6oT9{^5>
zYhj9m04ctgVW&|yqSF{IT6xlNUI1XOB&5SJnucC-XWPD#zEZXw7x5`D{}iNv-MiuC
z>vX$nJ<{E9D;C)LY=bud`p&(|_~jcU4#8=dk*<jLw+IDiUSV+rR7VeG2|N`X0;^}$
zZvm7zB?q%nF~+k`<uQqA3rydp_8DvToak5`pM|x3%J*o!t9EeVQv~~KRh-|G1X*WT
zX!F)xFl_?%uuClzQc~-iIE+nUb-9uqoYV~$Z}B|7YN@_=)RWkA?0L$RsMR@otc~kL
z{+?RBo2;-oTrard(yBk_0iW~Js>7^j@bMbVil1ZR%D%(<Y6K5m79PSRB?NB`MgnUL
z@&kP>w-Lq^!b-{{N9%ivR&B&@r~H_nefJGyRBNo*L!#xZr38%Nf%oh{b%i1M1F%(J
zWeO4Ys-<Dj;9Eyb5FcEm5}lgU_RDAT%S((-lG6G)OL{e^E_jUmoBHKTuo}$l(2<0X
zjGE6xc#a#FQqK6zOVI7?pwB5@K>X_hxINgcq*OHwT!Iq5eL@2vkZ#S=#%;%!09@FI
zSIl#g53vljeP_V7Us~&<p4k}nmy0Qv7HI*WReCQjul4vrpF@LW{S<zP$=xhGZt&GD
z9Vd_~p0_8ut%{9=Pc3*Z(8yYN??_uvnPFV}o>PFQ1OU3>g4tL3%j;yF-bRxB6!>Ik
zjW`@yTxq4Yv{%!Vk(P)Ovm<;GW8MxAD1kcMvn2MjWOWjJ)Xo=$nS6x~!6Tt4m<OCf
z_X6!_ub6t?7dHft1P=?%GZ3eCCfD3KMZL!E8l^G@s$iO!XWNJPT3%GXx@IvA6Z>cz
zhU^qaZ7bY$*-fXfVA)ExBkW8VO@oVib&c?X+QkTNkKj3%a-z$~2=U*tQwrjC3wjo~
z{hvd%lRx4|?U-#I*n@SiwRwJYckhCBA*~Vl!6V%oI9VTUsOf{(SYAO4Sjm%cPAMIs
z9|s1?9B+?9McW4mLIH8p0#R#AuZ3>zLy){7dmhi>5@jJ&(Lg!6#y>MZlRInFej4y5
zFZI%k9&zN!S_@@gtqJy1G>m-ia~t`bS#%WN|4g>fUD5yc2_pe8kt{6i?nID$%tMrT
zz9EuwE{rb>$Ewathvb>8iAAuy?1k6X6A`F(V#h*~jaQD^=NAoB&72YbuD|Q7I66p#
z{vj$es{n&t34^`=YAVjowS+`%(K0)%;~F1P@fE5-(nyDseXBiY50^83rEkQ8;(VSW
zgMzBUqt1_>G=!`;PH@OWd~;?-+_s02u-~R30l;%<?-q<F4tlwi#ttf{>KffuwNLCK
zW`PO4EC#@KL}Wp8<mk8Drkq8uM%*<Ex&GRKxGPnczJC=)-s~4lv&cTfJky*Vv8kw?
z^q4!dxl{nRuDo}On<Xz^^BEl4()JicqW|&KtXb$c+?l8aP7t>v;@ep45ct(?W0+A0
z)#MGT`MM?Jm#wz<IC}ES)lVdJ3H#8M?uiYmdi#Jj6(Qztv^c7_2(VHWfPhzq7yuTs
z-AtLGYS^;?D{@`x?|fSohF{#ld~v2ty=pnsz^AEX(|EzN8&h-R7vaCU$gYL6LZ;tc
zIQ`#uq2&KxxX8cHFpx3-QS4VGDcQ}Lp@nWS$nBBIt-(QDXg2pUHf#WiXpLc2T4i#w
z_J@BTXvk<uH`a-cG?$2(>%~XWEtJ}Wl*K`5wpN`BMiLIBt&ZD<B(izKwKxJZc>?)T
z7PtH4L-=-gGct0->)~+)?2lc!PrRpy(C)w~0Z4F=?qlu8r2ATjXUH|`7$s)NHR%{=
z8hSuBra@Ov=Y%3J^(=0nZ~J*JuHJ6ux=*B=IuQeC#Ba4(6`GD;yDTBq2_OuymAMrA
z-Nn`5j@f8K;nG{lql#<OL?6y+<Xv(r^auzjKT!*Crmj8NEjxUb?|f1jgNCm^m`T^u
zHb~#B!2X?&E0ko7fuOF2wgPB!PHJc$@>c3s(U)$l+2De%qk8OFn5$&w&3+N!1PFFv
z-wThGx9A!NuKh@Dsc)Q)c2`=sx-T%+P3Bhe8G~)B?$&oIa5a$OB;BzF-ZJX)=Udk4
zanqzSK$Zb{+uW1{Cf8RtQ_`ZLJJL+is-WAd&oZCUw4`&vz=s7HV>LTiNK4i*R(BsC
zJ!7KfH6zEazfHL!pMnz1Q+)8Dr1LOqK63_-kUI0U>i<C!F#6_^(=&pPhB9oOp&zUB
z=4ts_&=iDnzlQkb!=OwqU<H4Z0X+ZWwPa~4yFqVGH&$+mja#D%_de>dZj(sC%JT!y
zP;3qpn#J^^FF`z#MXpfkfHqyN(c~Dut){6(SjVqQi4EgTCUo9YtA=$xVUtQF{}o)0
z_OevtNyIZ*%pt28o6>|S)ZsSPKPd)Sy2sB?gtsNPJ90kv9s3#fq`?>W<j!Xt34wwN
z*$ibIMH>1Lsvi0fI&w`Nk;zU+-(a)JH;pUx8p?@(N9iy>IC4BBSipcl-W~H!GWS;x
zlp|{l6WlBi!Y<<r*~i#K!qiN=28G6tK#JA?Lc{pA<2)IH9h&O+uR{#a$?p`t{a5?V
z_K<EP&DTh=!-RX(Iq?ERu<s)L{ywamvVOyv&uRA%E==<L0Sc0od#sq0Z1Un+=bT~=
zZ3Ly64sGbARh)b=b9bB{xTOjwnm-3Jf~OR{g66lxj^8dzJ*n|}j`jf=%=Q^EymAH{
zDok$z|7x7o@t#Q=@IXKql>b(X{IBzue{RGLYC^l~s-b@>FptQ5)n6?krv^}OC}&g^
z!7Jt^31_mivcj?xH`0eCnq<hPoAr$s!BUmc)m=#o+4q`)aYS6NDpziUz=YL>k(9O2
z?goZ|6g=lXZOSsw@g}4^uiR&P{my7U&0CMwy)M1>1)eTI6)Bt3hlgY+NZbK<UVgK=
z0-4XSpO)LE*>9Q9s5KT>R}lIOy#mhH3ZONBx3&O|hL6+$j>eDXfL|7GDa0ldm`r36
zYq}yJtB2SLHC2X^0Zn^wOpWxc1G7!TaW|{-j3xz5gQ2psu64xUEOd+{%bQf-xK!7e
zKhiA=Buh@F3{@3NGSl+<q47p{m_-0Aa%f|87ZdDET89=j>!RV~3!BIkB^EWZhbOgt
z1FRQvsnYoB>1o6ZtKXTJBo3{vBvI>&)V*M3$CE==zpLoQFf%E{EbcI^>Zno;jXH!l
zcF-;6qQY_gK(aC^C#D2=)rCtRR40j*&?O#FR21uDrefFi4M2353`?V$OM^B`$}^cJ
zvt~F}RDKa{XN?!i7?O%?IGqBtUFwZ?{7hm8EbW|xI;SlcF<ODQ&IAM-XsNI+s^Fd@
zx;}(jdIfFFSoqo4xyzI_(XOV3h45HGqU=c87<u`XwL5u{pw-wcT{>PgA&|X7qEJHJ
zqGVS^uoVMT>}6`%QO?iGZLXq20ypGlsttniUS#@wb_SGTs>;rhz)lM{h<k%e?7x<o
z8aSC#EfSnZ#W9a)O(REzxSS({hHu*IPP<=cR7Z!PMkFt~JMtzCvy0Xi?Ber?RE=Fg
zBcG2l0pW^7f@V*;=Bn*nDsgJJlcYg?B7)&FX94~Ec^$}x)rLQD+k_I{&gaG1?Vu(d
zS1;ZKV;0pzNo6Sm);?*A@aPp_QoX*2eyR{$=;I)=Iq~cyim*yz;SQOZl5b#JkVvx_
z3yibiI)Bfab+niBp*E}pKPu5LcjVS~%!Qi*`?L4}`qUdA1#z%V#D%&RZln>iMckm3
zq;N)(YMHX^o@Lf_7|C_uSQ8vPCh8~u&sAMQZ+WveMp?{Bf^?39dIZQ1D8l;~>*sa)
z2my1nb!uR_nJxl9VD8LvzX3vlATD$h{A!?XqO)ZM;medTza$qp+CpL!c#3yZ&bf4T
z>wC=j5qc_T(aJCcPH4Y-s)D713AAOo09ZtKz-Uj?rbQG40UOK?f>p^mrzr#HbaMH#
z1)@#ie!q|I_xj5lsS<!)e`1}979Ga=Ww+X-bLgXdj7PI8BL*$z_&Vb*6Mk6m>o9e6
zhW+_I*vAu(N|AZf<6fnr7CBCfK=KLi(+5j-B6h@4h&$2Td`Gga72D%Oh|xp50dMt@
zQMi`}*AQE33Xg73YPN`-Y_L(UD66>Rq}eYg<1!N=fLU+<-d6i}c|!T!1Y~cnzI$bi
ztTV8^4Jy<3;5K|pwKq=ciG=i9heq3Z8BHg=VCsjNiI3jp<0xs<j|@c(*2RfwZ29Ii
z7Ok_}7AYjuah!OTWG#Bocqm$!(r#DAVV<CwmK2imRq-n}O7RVlQu<yRcNQVX_GG(L
zMz*-2K1!fSFOfx93T|QbN0(+>*>o_WF{~tt>VeykOR0LB0D}jtK<1kiXR!|9E$c{x
zfi_`yLzg+{ZZ(Mt0lKW9w2+D8dL*%KVqCf`T5~9QY|;%n>weck-b4XT1^Zc^%5k+l
z*!ypkgLQ6|MIkZ{*i(`5v}~{%yEMJ;pfojfy0N%v=!MjYblzZu2l;%;gT{uCwq~zU
znys2G=*=%o2=s__-YB`ZPQnrtN9NaB0+UgmJ$*+GkYh8T9mDkA$X`*Ces5sLxa{o=
zBt<+=XtKw2eLcx`sL@)E=JRc5+_!GLMj|ld;%rM??n`u!FkgV8%n2df8S>>RDcvU8
zbjoO&+7n6XkeIC7??h=RQfY%bX(A)T#jcty1T2jqCDlH%YdJQFHVjp65&!->fy!S+
zqK!*a@QF?h66*}ox?awYddvnR7%P?|><+Wfx-ye<jfAQ*S1RRKM~aiFuy8N_<b+9x
zH#znD!su@CS%$-d-ZzL&kXnx61T1q~kAXk-`(Yf)VY<c>#tvYr-PiW??aQ{9rdUL(
z8}?~hSL3YcTGg9y9J5zlI<=hw$T^0!Lf0)uB=!qa)`NXc`|(rOqtb|*HE)?(7y4>?
zGrv2GPul2n2Ibml^KgW;p-}(6ch+1&I1woIroltKI4_a=zgT<6CQ*QB%d&3Swr$(C
zZQHhO+qP}nu3NTkYwqjUGtsYOdZHtGen5Uoo!q(hK4+@0rqmcOO7K*rk&~s?4&5Ao
zW1rn3p2ETcbL9uUxWFhWb6#mEWD0t7h3g)9Dj8JhYZp3~%Gc_<8x(HGB7#OawWR$A
zYt4UMcP5o{NKxCXAv4Bmm9SKlY1KF?20oswGEb{ztKE9tm6<&s%P#hm?dJ1?K<(-U
z!Kk_gvS2r|0l5%Nh(n$eQqLvQGHXK=rAbgeAIWE}I}L{iyVrb>9K#A5RJI1;-MHZ4
z@~|XV;-`y|d-^HzH=m}|93&aW+=$Q1(xCy@htA(9X7uqCMX-7X+@l`zA)Yv76hvX6
z9y$`Lgf_Q%ZooxB%hx7@V$omUq<o;dzkWa7ZOdsj10N(Iz=&c`t3<19!@RYqu%xS^
zEteojIA=We=tGzUCkPZTf~gVRf~7PAvxI!x)G1wM%M2V&nP%xFc)a;hNej=1lI>1|
z?yzYzyGk2;)*nh!0*zotUq4W<9ebrutt`222W!QdWd{w0)P!qtOG03^F5c*Hn!9w*
zca|XjL2BUs*gp*$CUeVyY1anD_28x%c&BINt<~=dYAsM4Tsa^RVH{z7Q3(AI==yVx
z8TBAqIJl@Cxc!P^!0~!2K6H*v2&s-U60m|5Q^y46aw~@278zFtEMDp(ZbbS-w||b<
zRFvq7K5Ch-+>ajIVD>lZ|6B{@M=iNxy5ohk&)>f0UvhInRk{}GvyH+Ko^U<w!81XH
zJM%sfILmHKgUYcaeTb)4)XHL=SL04z;S?riu&zE$t4u?2Pp4=XXp5+L8PrzogbeD`
zH}aqd@T!3lL@maPyaB8zi=&cd6B*AMtI2TGeh46MN8hWgz~zsRz7;FSGY%9<*E29s
zloTU-B^zQTB_iURG>T<dAI1~^h)PC?Q8XxI4nq}7-;*m@V`mN@7AJcoGh!(vwdW;U
z;F@H{3?;TFC7<J(bjA!NtqDv*$1x6#P7~7-n52qjm>9VtLMWcNCv&h)CY=$R)QV+T
z9flLj5}D+RWtbHaMwApQdnQX_sYooxD_$#9+AI?@q$Iz$(Kq6r#BQ)NPW10d*Ld=+
z-v2z4pW5*SonuQIb|x?M9cNdZ=u#ixW{>Nrk8qotgjbv8gJ6sxfa+!wxbgndp0Mgo
z;dDlHIdQ$7!fKBz{t#XrPYbmJ?+%6C8}*L*#!lT4(vD5F1KKqLP90ZSao46IY7;Rz
z)F>ZvRvZ>xsPm>RD&UWm6#Dl-MNkTZNYZ;KI-Kw^@$nnz7ejzV^8Rkg3lyRGjHtZg
zr?t|}=Y_CdG!of^ksh2x3rmUUGd%YO<_m$`t(-xy2jwmglat&iK>9w}VlP!MT3S<2
z%I0Zd%(X29#&IQ@c2X@RN;<IA5_lpYU1@5Mr3jfkESyhh@0U~@W4Khu95!CRBSUr&
z1-+<SP@tA<rJKOvRq(EjZK`L-d}|o`&7yslQd{#{`_#+THxOfe!W!<L2l+tpriRS4
z)x*6+4C(tttm(ZB{IK&Tgv^{eVREqnc~!(DH8!{*e%j()ZQh^%8{YnJI4<a?uHh!{
zTAmOiRF<4GULJK*12>izT*K|-KrfQs_}~fGj@Ydy7FA<IpXikj@0#AIT4Vit`s;#M
zNld8q6<EYK4Bl1O{R;J!*V)j1gIq%cCD!NWu<tn7=4dVT)%H_C)Hk_S(6!(JYo}R1
zFIL&#=IPxw^7t<DI^sHFdoIb7w4)W8**eOpwuX;FZi(LKgS`Om6gHMS9aQAFyK(pA
zf74fP*aZ;B1zA(*@A$Aac1WKm_E#FW<<;q$pxV7x)C`P1B<yW<eM(7WF_LZ=$Qfgb
zZh7A3RcRp#$AZJoL7+Fl4)ZNF!0xfHp4}ZL%ryNC9`{AsJKo0MCg`-66l=y>^Ce$p
zUb_jb3}UG#GRF-<q}HM$vk%N>!=|i-lbwe}y!UxoR!nrzq50wy80VLm@(ww6V5u5A
z*TK4$P!7c98}>VXYRKu&wb%4|dT6r`Ot&R&S?O+=)^7V3AAaA4MtC-xg5h2Sn|zkR
zS`so#w*gMAfz{Y(y;;SeT=}vS^OXnYOA`?NmBp7m-%u{bL_Oa9#th_}Ws5c2j*XFy
zYyWQ`f8P_tn^}dsX+PhX;J}D(axEK#jgb%QjU9ST-%L4&&Q^i>T-hY&SA=LqQ)Y<=
z9c%Sw2$SZ^!L-Yx+dj_>%T`pAb+;Q^;L29eEOz<$J8S2S=qnHS`r7zst5~L$YY`iS
zl><MQNkHAL^R!-|C*bPR<MX+4Rgq#p9{Z<Q<tHMkQozu?+4P8>`I_E4t$W6$?!d$w
zms(FCX?NxhOH!5Jq|WkZ7sqwAX`ST6fTb!oox6l;rG#pnHra%)!Tf^Zo*D>->|0Bo
z)X+MuPYkWk_q8@BLiMrlCCcs?*G4-W9!9XE9#B;1mAmdnIQJK9->LT>og`1b83%WC
zTd#(QPsbNE1bKQ4z}9GLI!?f!@gk^dQm1;+)V;3v!}<vg?YsXeJ|mRpht%j_d`A60
zGw%OR4#R&Wv;Dv7;QuM^F#M;uQ=%Xv1;haFW1(iX`k|uLH;bz&p!8fGP7WDT2_`jQ
z<jUquWToiZ6!JsG0|EI1;Frv<0TsClO3IYc-R+U|{^j`zU>}Tw=VY!t|D-S|3X+9l
z%`lxuS#yOU)Ukp~@191Q2u;OPa%}U1Jl;$f7|i&b3n9V?U*ULavDnlwiQX@NX-Ts*
zC2X8gJS{R;EJQ0x7)=r<Vmuu_W=ZW~-Ylci?^K0oOGbnNf<(o%<F+3H)#aJIxjlq*
z;5?a3m3gFGy<qu@JEDFkd`8d*g==CCalD(w4RhS$7>N1n4!Z?pc`6weF_DbBBaW3-
z9_pctK5L3}uHK3<Yi5&D@V=?WCtDI8IcM5#Ir9^D{fG0;{K+<DyxawI4NxSTTL#X_
zt4^P8#H9tG{fO6l!-@}p+A!mbR<)kf$2;25LCr_(JfNrW>nkq*F)W{xgxsixf2Bco
zhw6zs;XU{PG6ZV(zsdoeF^JUB|EBQg-<QV!1~K=)mIMArDxCPg&t{FX)xT5?bl+Ll
zzc&BaYaxD-imhnn+CqGYw16lRmd#=G-nAAot%g>q)dFuO1;41gm0@(Fw?9ARho7)p
z^aFIm#Xhvx9Cmq+FU@>@K70W5F~`ug`<|HF=#ojD123h42CSGvG@%Hg3Zehxolppn
zHWX<{YO9h}TjSN%>Q2;Pm6=K{R~bu|DTu&MRj1ctQ!roTPLvvqj$G91)e@!kIAbBI
zB;@exBM?QY*3%E$W@>mmtX)*`Y7p4%D+X;n)XX|cMrtjTxY{So02_}@)w4#vPV^)@
zV9*#HtGPv<sk)1h{N-jVtlFn+ZFJE6LVh=j#;Y{sCYCT%ig*f;nfX*K+Ew&D_E}0Y
zDoHNzoWf>o3|U77$izn*I`ej47b($}euaFh_N)=)(x$V3%n@kQru3n1{@*?)V~4mf
z8u~3W_x}3v`r$kHCcvy^vG4l04+A`xn<C$3v7FCxw~z8XZ0#&We}2~6Pe$#>YEdCp
z;AL6Xrbeo3ar{?3Trzs6NKU{Bo+&HG#*l`4oI_-ivlUX+yfi03r?TwYc|z^o2Td=!
zxnt=i{3N<$kWR_CX+noVdZ;X8HfX01YsYQxPRn_rq!LF4y*1^qGa^nYycGFQcfNW-
z$kTZ#@7!qNY~;Irtl@#UX|!S9M}*%<;lZ!);los!#On+75bH)GCF7J>nIst(>(FXX
z>nDttu~4&hyx3pig~fLNsWU;mPzFJ6$=@K31CMI`9QH6Wa)h*dK-PvJPRwD1odX<)
z>=U4gzlD|U;_AhfKEr9{wB<W^EW4^5t(0vkOR5T5J$6*JY=53J0;$$G|3LZym0Xlz
zFhJ|avj<cKyPZwT&)T|LUa<YwWuN<w`>P5I05DJRzn6sf|9yJ>cVSu`$~$=_<#*S^
zXXpqjgBW-s=uXl=yfDF^4p%`C5nV!j1_)Ynia6RBf7+M~B5>QLOY*Wx^IqYz)-dHd
z)$&fjup!dMrpwl9#kMVu=DBC{#?{BAYtxeIlkfHOBrC(1ncDiz!RfZ+HP>tQJ#RIw
zZ0GyL2;gF+6Tq7tv!4_@XLKEj(VNTvcCQOGH@WBbc>nV)Aiys@wCpa9@oT<6&iM2k
zK>ACad~W<<9Uyl|&iGxf@yi0>H#%?P;k5Bf1fXxYZ~Q^6ksGzY?!KE^Fa9{{Pj_&d
z$$KQV9?YKJy<Ov147DFtf8QaW@jI8sPua9?()uN%myoI5<P}ZEx31~0$sU~Vmm5IY
zp<nQu7{(9r&?aLy(NLbrTNL!|q<wZ!AhaZ?8h{){+u2Vwbl+Y+H>11B9mI7yG~MJ~
zC3GJQxzPLZ4YLpkpnBDVvB4y@p0o&6mSyKWq%rCdXzWRa;QDg~p%4n#JV6=fGPF<%
z@O(^BXlBJ;<OoWnR-2fs$c14<XX&^-F*Y7HhHk+g#tn?qBeRtgKH{$G>PlX;XlpC!
z2O$vmo)$*bSSS|qR>26Chp&vGm2yEo3u>G#`uaMSjH`$zm^j5E8mnj`gT|)toV}m5
z_<rDDVMi?;WBSEayqe26YZ+qD%MYWjLKmNiH+#mvT5OEiSmjtdOz7JiU1sc@%Bv(x
zq8X(}%wdH?v0IQ#LLAKq?^~ONTQ{>EJ{OAZUL%RP4RM3f^9RsXEw8q5FXv1P!^g<;
z2>Bg7B_3{i&i3r7;ZXyh&3{zX3HqGa7u$+x7|Ux$><!}QgR+-|LOEpy;#WBd)$T0K
z?@3}HeF?&?Ay0#4B`IRMOzr<p=*AHzsX&_PIX1q672Z=iOj81>DV7xAGzdtMbgr9*
zSQHkq^6<o)wZXhnxqHzBN>%G`m^Q$@E1@xi-JF_Sm9-*81VqF1EdGTV3wv&<|F$>t
zBeYQ9j0p45#e2{yK-evHqmy=OGXHR$;wLoRwv~voKw@bs9S)C$%{4DXTO(6c_9;#p
zMG9T#HC0e6zAcVD=fXPoiy$`Qa1q<|bz#I<U&6#fTvcY!WWvC~#K<Qn;eca!EM6~T
z|L_!}?c<)D=*G4uEAojr`r9e$V+^9yB}h**%Q<^gYQo;zFP%V=N0VvBe+xo(+m6(;
zNzx+?$s#nfxIPo$e+r{*V>DS`7^@-YLQ<rV7;+kWhWoKpJKit8JQ1y7mM*<CZ>9Ur
zx|%GC-@fn8ovF<PkAaDZXAPTU7+RrO=#*~tngJP8HXEg?x6Y<tcFS*ONW6|_6A=+N
zlWzY8&EOv)bx(kVRx~zjs%==Vh@>!Yf+;J5h~%?oS<a}GatPX+L&bnuIjG#!usICi
zN;#b|I3w+iedJY9$-IKy@eJX;DDNc0A~(af0$89O6^BtNNsdRqB3STGEQ0=zOiU#j
z)X6gYqJqj0MWag10&h$<tPAIYY=cV9f~`O;7?)D{)bvVS^0q^!P)*8YlTuD8^1P8#
zX_waNUVpmG3+#;TD4<eKHPa$?<I2Z+tpe)nAsnLn{k7S0BLL1Nac?nh2NCaDbwy__
zTf;hYj~M9H3Hv1IPvdr(P%h)QZfLg&`#k6{j&c;sk_y!b2h|Etr5w7_c7Mal71A37
z=t!sFUhZ}CLOceg8U>kBOHv?ch$Zq+3~SQSBisFAeFl{%%_&O1O4zBpJoF<9g>%~d
z7ic4yIa#FWr|fSj$&G2=ep$Kg`+~Jr_vKm)DLFw`()iKbTGLc;yR$^mZR`n7lI+xx
z1KTs<rP<be_T}`9c@wQfJtHqKD@j$Wpb-kp2&tkbj>|xiG-TAB`4pFsXNpc4#HDF@
z_Qyz>Qxv<?&KFo+%JFh?)sifPi|I-xL6i>7O}6Tp#W6~!kpd{DI4Fg+w&vElSyMZ;
z9*lz2>BmgU8WSZiHzO-2)T;_gbwta_Zo`CQSu-^;_235^Yu%-l9Rk5nuJ_6=n`bZy
z8?Ehz-VUJ~x!Nqi1UaDrV;|bAiX>~@YDzvv`hHA9_z!z^_R|tg=$Or~iR$y<ti%R`
z^XJZqbVlmQ?a~wSx7>K&A)NJ2mq1%rhUC%-Gz{$MQIGzb?XNBDF-Pre=|OJaS?hz<
zoTy)QSav9Jk-b}x;(E#Y7oE`+#UE3G;5JX_TA`1@uxA6bv5-4`gjRb~%`kX@I^H2i
zeJ1mHIe^}a%&`63@q$b@OXjeQzZ6+P)56q*1Ix}GGgUJuDsKSXVf#FgdR8#UjFzG}
ztxNkS?TJ@v1^#hm8fcuJm}{eA!gW$O3|iKI|13BKmZQ#zQg($NFq_;#vmbz3JERz^
z-RO{}dBm})6quPj3)%Al_6<$~IqS1%{r~{N5!}imV+X7&`JlfWrPD40%FGavu*V7(
zAaf)iI84J!MhIna#4ZIhP8B4_7D<yI!<HCgH-#vk%b!FW)yki6B)=c3<0n(f+$TN!
z(V%rsjWWh?OpP+pxTdx;)~HI;Wx=eb>%(H2g1QgtYFgn6T=I&XMHkvWWKN3WRpNK*
zY>B8F^NzhP))n?dbK`^;CX0Lv&<cnFU-*|9b#Tn=h=kLUQrnqurz0wMeabFQtA?0f
z1wLK|K4J6JPlvSSb<R`tlotl^e#rf<6Zpv7dPuewU}P<<J1F}F>(Ctvyfcu`7{7cs
zTcRx>*_3mt4QMdgq!Q3fm&`Z2f(;y}q?{H2oWMqZMoH1^Tlz?5hx!v4#?765<d>@P
z?-8Aps<-E{>h&3t{yg0*H0~&|fn}_b=L(`hD^o-}WnwXI;-QLZr;(cj^TM&NpO*#m
z;<2tzmj~?{0MbhG_?hpGat-<7LOWYrK-Is}{GeO-B~j=NleE|}4Z6B7{W0XY<^LFA
zg3e|Lv{@Ln-tGg=hBW3QyoY)<Sn~$%@B^IHa(Rz2d((fqR=PcNf4j!W(=TcRtv8JH
z_K)n<a|hKMkXsiy=0}_3^0-t7?Mxw0_rH01lG8OnZ6`_=3yNZU-z1|jXKX)IUr_y2
z5Bjv0i!gifO<qlPzew|v>S;E4&$&zS6npht4mhl<T#!huG7bC#o9Kym9@g%}$AVsR
z9FB}TjL02ql1*1OTe#0j+R+?Ju+st9m0@L82<e4RL7Tj|fmcv#tJ#a_H<KYoHk(Y6
zYNl?;+KohgPg{!dhE*c_Nt9uVc~JD0ZWGR111ztfVNfziIya#DDxgF0K&~L!M)>{E
zM&PHwj8X9J{tsh+Z-sR%>rbVrtqQU%!Pb!T<3`bs_DX}TXS_UD5zhZo@XM;U#1jaD
zYB>)y==p)x^Gl8R;g~kG6@LfkShE8)pXpn9NteZ_E8OjD+}XM8{>FzPWb0b8`<!aK
zd)912+5GIZ<OyNd#m#m(7UOum@Q|PL@w{1e5<1B?FZe!=+w+zyU4ydVo!2?GD;D6i
z$I!*U79pKTQ-9VS)$AU8dbbRv+>WKZv{4B(pDlpwYqC~JU2oyr!*`$e*e!F@og@FL
z&`h7W#2lkeOC;Tq-}hPG)jVu{$Sl!`87pe3$^f6tWABv->Jz&D$yRWox=P~)mO4_?
zu+zPC-vfT9ODFDokRkYWxux-2NPeio=cdM9QbCwhzK58avs>GGsLQ1Oy`nfeUA42-
zts;BWWCp3rBHbLc+bkp=LKU^uqAsAhu!t&Y9%)2s1?gMCdDqE!V59Ul3jBgm8t<D+
z_+`}kncQR63V4793H;f@tTyU^D&p4(hiu^$3X=S!p;NjrhOsrsn2hMh98u1#5qL$$
z+<MsoSC)<G4~ytWC4?sc>>-QpzoxF?KrmON7Fd6PlP37Zxee;?M&j2;Lud~Kzaj<n
zl0^@gYb2=Gt!YEJn_@aN$IZe2UBP(xbQ=GCN0NF_`R$X?-46Hfgb;YP0CZzQt@J5Z
zYs(kdj%>MLb*|i$Bb@LH$Vn)fyzVBOa3#oUOHC20Z!PnNzTDV?X_|U1Do<U$Gth3g
zsLRTkxYT5=vP_dmD=G%-j*@M`Be}3uesD|a6q{0#sTG*ke{E5cYMuXMk*g$O)<E|O
zc^n@(EBQ;C!fapWi8Z1zxO2SreQIqbyjJe58g1ZH1>RE`sxW`p%DLvF!(;dcId#XQ
z%d;AKdw<@+>K%vP8<;d?UQ&wksh}>TYNKr_vRlp(4P&80^-jI~ZG9}7PzP(!;|Y#)
zVPJi7OJg~`&=wO|_7t=6@v+`=W2RPFzCWV*2<@{8?X9A)TS)D5`_8BAPgWY<;pNbO
z;u5osOamW1Hc#6M)wbdbH^O2a$$K_PDuX}Cf~4?6!19a0;)fY>F-ER3r=ROdJ9LT4
z-kI9;`jh$Bh4cqkug!&J{{!j)2AlaV#@hVWDgPT&4^g)YL^BS=TpVapPAs8C#u(=B
zG9pnAr~OwA=BIWI?^uj>{dZ)oM|8oXRp;rWdcH1yjAz8m^b`hUbt?+FKn;v42I{)O
zA-TM0J@gjdw4#3UOnXUx(GhXXFQTLy0#HSe=uPusxuSH}R*{&UO-0$V7rKtBbWb*d
z(xZhy`6{|n-O`4geyC?sU|&eb#jE<XOU_3SYG(jkN&}v+KkAEoh=sJco|!R<eeOda
ziEQ`E#(nDC&hONh?D$<D;OOSiQy+&0Cypz>$4G2j4#vG}0+QWI8N6Ldh5J-w*RC<p
zzx)RFujz+(+K^{jI8JR{YuESrDUZIXJOr=f*c&dKf023(ZO1)^b<|R?4M=f3Y73tu
zy7HAzYzE&h=iHxK;y;n4uqI6fsppC(E<gE4BM6<zg|NGhKj>}zdHbtN2+l~cF>r%K
z)5@StOCCoI-~ET~!c!&c$VX$pegE~{TD%6uZH@o{z`*>!6%6ygmTUfR`t-l4Z~y)M
zssZ7xeB8oEHgOa>1q1(54+?HfVt^pbN)jNB|Azz~lixTIdZM2mi3|x4*=@_dBHy)v
z7G<+jB?=0O9>shmGP2dyr|zZ4GVl4BX65C%r>w^1{MY++mfp#U4k`7^cbDz;^X7fl
z?dR92_js%O;kYR*!W^@&MhOqMWa*h(@^FE(<guLh41;Jzz2d<cckGZQ7eDZw)8tXu
zBeU2cfEltxfmm+D=S-A#p(B}0N-snCSWzayvQl}eIFojTLvbeFvQ+t0l0~ZW(V6me
z9dmBtpo8=Ho;&8$%DZ<I&Aglc5Hf~}MYsH(Fs6#RSN?DWQ_ZSdm|DGZ3Ai?)QdOB|
zi9#>m<k8WiQ~ZPWuSMZ{n_%umv0ce%mGeItcXevX^86=LR>V{<h`T1Cw}n$Cu~(vW
zoHK@Q!J>hpMK_l=<pzcmMO-gn{pmv4Jd$}*Dd~yjQrY_D{m7!|spU-B@TFxkN$-Qz
zqFbX(5mvlKWD9xvAbz}A!X-oY64-$vOr%q%Y@#$YoSCb*LnW7!Wv6t>^#0!UeVZUh
zZ`Hj_)<(PN{+Ki59WtlxsEQc4_U}7puj;9ech2alm3NTr=DTRdH>pRH)@jyAtX8p?
zt{&x~mdTe5r(cjA+Kd<TPSp}^#tm%D?eZHfsUc%!kbqu0<DQM(YYS7!dyq8Ra_}I{
z@eZq<QdM2|$|g2sOG?{E+s9;<;DGqruKEtbJ=i4C=BD>d%qCIq%R|b8IzimZKx^zc
zb}4kYt#Rg2zQj(lTH!;Rb8ruWA36C$!l>mmceQDfTU8tP9!zU;oi)s7_Y6w`sT}}d
zhSRrzpE0kJr-1^`5tYykZ|M)qPrjYTSgNR=dr2FEe{O&!e-GPLTI)&%g2gk?*1^(P
z3+Gi_crAKwz;$mp42zP<eg-1h{Gyike9Py*ZO_wu9)pOQhMc)`A2x~RNwm|%JB1JS
zEG>j%hLW2(QVtD)Sn3cW8>%mDjfi5Do@p_9t{%O=?t);qbfRgO1xXpxH&INt@%P9o
z30~A0!!Tkzv%sUaNTer&oT?N&nSUYzr9hPe_+$B9D{x0#Onv?-IbAC;F!H@nh*ZH=
zU-JOj3O)XaL`?1#eh_wXZ6bM>OGB>6f`4NJGA4J_b^k=<v%fn1SsF_lO|pw_>naNs
z%U2PbEXj<L?-)U8JLi_Ov~Ca?62x0~_?eL{FCCDg#}LVBClUt)N+)MtNEs=qDrl{?
zQhR!2O1YFYa{N#59mX!!gf|;k(ui{YYD?%~ePaV!L!q!7iFzU!fB+Y~G{>{Uk|z?=
zi-0A;G8G!NtAl_Kowt16^m?$agp9M3zYOgZ_gE0k4eR8lGArqX{|SFtRRODlnvjhn
zrSgT0)b`8p$4<JYozhh!&c=v#8(&vGV>2cEgXk8FN}Uo$CN;1kTc_QQtA?7@bzrA@
z7yWWUt>H|MdSK~QMYX^tIDiFcd_@_9y{Db3L7c)#!u2Mo-?Za>ShtH)kmuMiI`2a<
zV$n8x+fG%#Iy|gbTNhZUXAUKOVu*ewUI}Km=QhwVFDcU=RI9M2flAp*L=mr@P2^OB
zs2{h4_4=ti8T^g$4H>*{DPR_Ppx3b}-Wpcc1Lyhr1&lL(;ZkdRp87<lj$E~IgSCsH
zLEdY|xN19oR8;Zcs!}|~h)V}!bIA-=>^8V(UF#Z5VN!i3!`Oy6m(Ga<2;G94%%5TP
z*rA*Zp95m8-$waj%4G!WSiM6MDf1`ET6WT0mvI)EkQ_;#3qpF&DC!o{l7TJ}k{mK*
zp?*B;1fBw>Sm+8!<&zg_<AaS*(z2TNFRhBpr=|97D{cWF{|@cL985ncAKEw8zmSMi
zk}0WNdiU5!PBlEN<>M1PutvmeRXjmgEEF`;d6iEkHd%o%*H`DkZYQiD)pU@aI^{%L
zM^z6{yG7cdDG^xE8$w<Uv0g(#0&R;npxe@I=ua@6I)}ID&TP9eF&9_)^k<vJ%D!kQ
zK4}&|89sFI)~`|^C2@;Hpu0n_gcB@pyrQp2&i(o1IE-f+p0oKzPn805OH!ac0(MwW
zu-_Z`Zs%>FJz{w1PhKBu`Fsmxpgyvn!^5`>G3*GVRGfhbzo>fN3kF1I;I^=#oIxIj
zxYm1M_3BDwfznBG6cdh5+H=F97TS|mHCa9qY%FXA#ZZV!>lND{330gxARt<Cmn8!;
zqs2j}W3H7>P;N;{wo<mmw3pz(hK%4$J7hP0^Lw|CaKsnB(dCDDl8*!teY8P%Ct}6#
z<-&$-Sq2cfjA!(K=tQ_fNH_v(4{^os?Z9_Xh?8_(>#1i_L9vfWjIgLUWvj$@yTW^q
zzUN+wsr_~Jz;6^5yfjh!Y@mEYV8!qL)KRY)sM+^}Xtsr*ZVN`Bct=8O<10ACayTL$
zH9Ife;^cJ;X{Ypwc_^rz19a3M5!DNDFut;>FF@)=-sy4UiQi1Mx;<RI8IYbH0r1CS
zFdxZ3k=xJ4VU$bsX+VnhzsiSs5G8Y1Ux0n(_j5455pfn@jRSReRp7zOW?W18=^q!z
z_qqx`TYIa)i>vPdu~JtmIR@^$bf6=JZ6Ljnm|sv=iC<D1Rrgq+eG_&Qq^}^1n@ujP
z3^42g#vX^HdPslIf|bc)AI8`JW%1VHloe1;I6h(`A9BA4@GV<tat_}!@h$BY*JY4;
zV*jn0eMmc9Ydd~_cggt?SE4%5lnwP((&(+A)=NItntTw^_<=BaC*5mq8INd|Z(qkl
zeq4fzs+4=^AaJM~x)Or)(#2Z&Wrd9TtBT)8uB}zj>(n;&3&h-9S*6ldUoIHDJXO6W
zCbZHg`mvhMY1rtpSl)ywRopOasVq@6X-S<tDriye%yZIf95#I08U*E~-C$zQYMcJt
zNK`><?B6@Z+3iRq7iig`QjL?}+n$dCNjI8AXKmt4MLd%&pH8W!?4*gIZ|&Xrp*CGo
z;TXXr%ko8GDn6b+>t4H0cEHDA1OGN#=4O3GDNJ)lXRpf#8+%R05y)O=*p;y%?F^|H
z_Ep(e1LbgJQ+H`@&4~35`{yu!WdBZ~%00x+2tXn5dqZ%U?d-A|wpFZqfE2x>x$H&v
zgi^L^=57!(Oh&392c>9ix>P!HIk4j46L9A2g&9QfI#G;$S1NVvO3kpriEiIdJ7v*s
zM(nU0PDf6fO`fwvf>Iz@?+E2>g`}$A4#G1&h5rJ!nG%2E4M%=U#A0MSd1qb(18n@U
z7}JC7!Z#j2QwW1O?PH0%6N+XCTWGh~j)KD&HMRHMykwuPsr?ARBzL~BY6ogu2r4<W
zpftHeAa_{Pcp@=Nz0;CLheIv_wWPXexvUh}^n&EmX0#^w;~(k7E8r3fqDK~UGeBH=
z;YG8ubVmi>7zwAi8)Te4pI-5ig{W^Tu;fGh`?-S@reTs%I)vLQz+z(wV&FU7K`}te
z1FP1y7diV$lM_WRSlqZDdKH>Td$B6=eGLt2piC}c;;L=J>H}vQDXBew2TR$ZIPzy<
zXk<>_G#PX`xmy@_=NI`nS=d5<y@>Hf6hvkkRb5z0fmTo>q!IL3G1(NkFEghc#{_Hg
zn0z(=iZXUF|GOMiibc4xKoKU%vR%1DF(%%!Sh=VelV&CTyZaYEW_wr00|sc!W#Nrz
zKCO11wn(6S2)_>}oT%QGs%EHLcS8Wng3P9rsywE)5O%FT$aNtl{8Ojj5$RgAc^B}h
zjeycFZ0a>Ig$oJYYt4`v%9}@g7wiQuKSc_Nx6Bvw`|egbE8%YJ{0UyyjNh7Ta_A{s
z!bT7{V|jz<j7tK+ST>{lT)`7senjF_Q?~PzSa!q0X%SHbVF5Dvnfy7m2~&Z!(7G`j
zMRc7K7E2iE1+LILIf6V%9#S4sAyPh4K~i2)fwU-jfFY=SYn$N}aLI??jgQ(BAl4(E
z)qm6!-x@^dkG@sW*nTVCkcr&jN7;TQki9&bwlnz6d8k)zkX0-(`$=xQ(T%f%wCA8+
zqLFX$xF7c1H~<ZpPJ~+{{8(1AYz0WocbA2rkOi7djYs%LYI12BZtNm+OHTS;sG%RB
z=O5B;aC^S8{qdKFGZYi!+zOiBozW#<{vlL3qB9BrX{D2wg627kcF{R=i5*|SGnuB+
zy|fk33587CxXe|gLeeDy%lHLNPs|z|a<uvnS0&z;Yo~f1%4()UW~M(Y{Lo~cNY7ft
z5vTMYDuCA=4#FM$&hv#hWq}4+@9{{1PS@Z&G64V*?M#6Y{Pa|zd)?$SHvxNX<-0)_
zyMG4yItbq)uma5x9ALpu))2nDAc4aWT4KRZ00+9!1K$-m`@jN!1;2OYzY*ZQuLAeT
zzn5VK3_dcD0->(Je-Z%)9>E>NfJON0@k8?oB~y6dy#qg#%73uny-x!7M5?jTfd5@~
zvf!v22;Lch0}P()82=I@)RXsuD7gS-iPo24Ugqg%ou3D&Y{6p}s${`wn+f!JKmre+
zG_eId2?Iy?<#B}Si6y1H@Xx@2zXR~Wz(G6l-E(~RL-*Lf7a;^<a0cd)!FT#o55n^S
zz%7yBsV@Wcf#IWp0~h$|M^Of}Y{7T(+j8Kk+lbKf00+3%z;TMo;lXJm3EJ5usXX!1
zfq_Bz<$-}mXu*FXe%4_IxSp)YfL&aHO(@{IrTQFR1Q_6}qY2beN<MJGOaBZw<EQ@t
zX3~NGg8Ht)3Uq70;lfw95u@h<34A^CF~ds+7+5s~$1H3v1q*!Vx95eY1_*>i1MVA=
z*bm&}`RIoj(BulNL<aYj!_)NS9efs`g})5IFUyj|0$u2&?F3GXL1>?>5u=h!+mTtn
z&}<Q?RUzfP?N*pmSZ{pFzz|{-y)H^f3Nv<I>I8i>F+yWmxt`n$>;0%Vc1+GhGAA$U
zt+#_PSC<6p2px5#LAiI#oJ3oY?oFRPMR8{EReO}w9IHZ(b<*GWU<NJQ!dnjQ<(2*G
z`2qJKx0fNA<RJh3C*4$$AhwFnkd>gG&jcF#gXGyPhWB93yg4Q~E@%8{eZpE?;3V&C
zE;Sd8!G_e=EMuP@W52#(w7A$r_v32;-xAs}{O>wO<xBFB>5*sQkzNlKj<!!c{}S=m
zWH{X)nM(v3EOUA<3u3lKcH2Tg+foYixuclL#`kn5?ED`%*ac|H{_)?DB(Q{(GV&y@
zy=r!ornBF(UxR1;$u@GN!VksHdyr>?c;>&LH%MfOtm_gR$$`ak=XI&MMyL`_(uu4=
z#r`qH4uW$E8-|Az<_}N4aJhne6)@Z(14>-(k@CE5@(gw*=jM51n+M#-RJlK(O_50M
znee9;ve{)Ed*@6eN&!rAhc<6NH^WNfvrDtc+&;2;>JfL>^x{KV+uEBlHh(8G7e&%&
zE=%o)j%Y3GXn4dE)-m=RAa^_+nF^N!gvlkf%)A_V7G2^w;}kJao%XHe*lg^^pPd(C
ziPYEh;ud8S`m|}TuVH}d&I+mGl9O=@s}{?dCDoj4CamF}^pY;u`M8vLjBx`*-jxHY
z{jjXJ1k{E)tIjo_vkK@{6=13RA{XB<C-<!^fNu-Adtq<d15EE^ln2`833p`B9%j^!
z+ZI84V_5&3n{<b9y3@Hl;4}x_Dq_=seqbcWc&(_}FNPEod6;k?%VU-8b0){$y*Pr>
ziZqg4#d6uQ)r$N5NwWAy17(9~eW*uM*|Z<KCsUQlkE;0}EVS2+hK{adzAo8I4CMdu
z3G)l77mw>poW;({d*OKh)a>fJ<?ZP6z=tDz)RN@LSnqJm)|v5vu;fm~JQ?5)#_l!n
zu-4Oe$4h(!)9E#sNMpt2Zw5;vQ(E<i<Yp+2dYdp7BEMgI9$s;GM~55xq@IoOE6vI`
z7XSN3TYZ0-ODdD>5ZEZmZWb)<pS5D^B06Z%SO{iJaG%;&?e{-LRyiFi&%OMM`X%|-
zIaTWa#4Pup0M7sD#;H;LbVCwB{wbBna_%Zg7E}<(7bKBM;8#E%P!$9(4n>usS&VF#
zXgyDjugQ`r$oJ3BC-233^Zf=oAUYoB!1a5HdvG^%ZHm~`VFz<`bTwuCoO#{$GP}w7
zKHcT}1*G@SBWcdZH^70cVdfmkDg?%HBKT*k$3P@bejA4_nEsu9<STnWiw*|KJ}z?V
z_K0DtdpzN`qrE#JwA<O6A%RG{C&lX_*^gxDoXlzyau*xaL>6pE9qfYs(__teC37jt
zTRKg#j={oObmZ(q%UU#3)ljX=3`2uNm8D5wN;_TAF&$WuYIx|4emb<!$f3sS#>AYJ
zp)lLR$x}5=)exvSJvM3vtuj&aT&;*NJcV0Ro)JWhd7h;YezW96`L8bzRVC4}62=O;
z8Y4)i1_7!xdFbfxW|DxDC=~;CyPMS5a<%OdwpW*A<+*DR0Yb;Ey!|eqN6L6Di+{Dj
z4=9siRh(G$_rjC9AVW&%RP|F~nbJOuRDmUrjd>~N>NB8_n{ngK6E?xIp{~DrXs$$L
zRmIW!9}Pgm_?NtiN~h9NG%14av3b+`)Uq;_AcL}1RrJB9^pN%I8tn1pG7Qg~&JBlP
z!$F*gPj<vQ&D8m+1AR9!MBaI99tcP2{v;3`1`f{iAW|xgL;~vhio*U<?|Mw0ipmj-
zp7hf_13da`D6Cpdn;!i&Y>mhl4LOZXRYsJdX4%P{*TKOz1q`<>2rnBITfv!3&YJLz
z(nzB2Rr=(RW>v;-;u8#2j{}P4z?XQ&G5Cs8Y&PW-({FS+g_?VQrMDHsT1+I~TFfm$
zqXIxs4i)Ok(cVBs$_nK1Cm-eH58>@*owdKtrdUFi2Fb{UJnMlvM<qm_>*JLYF?bIw
zJ65d^_2X;7x)@mVM2860(EFXyV|TnmKcQ<RMR;25hhW|7ll0DcEE$_z{aA+E0dPWZ
z;Yfbehe+O(M&Nl9;fGEb65a$x=xoN(dr}xiZ<3>{XE6I%7`q2t7`um};C$n62}Ztx
zblbKt8xWz#Elup186`m@^ImDg6}kaBD}p(gE|QcaA*VI6&%BFiu2u^I>>FoRrLJ)z
zr0)wlerbWz0X|CimhZ^ePYH+_kx_PW;oBtDv=%NDRaht6lyQ(_r(D&A5+wWhA^}h?
z9?4LJ6c%PG_ccN(txD95h~1slP7*Tu>ZJC})q$3l5;|-uKs-+Z?Xi$Up3DnziJ`nS
zB)M$MZcEU|h&?{ZKiB!Wmf)F1QRbZzL8R8Gd!&|L2*mQiqKzPec!hvM>xM|skP!JK
z8^2(HwxPV5NGQ<TsFS99M7F3a%(-@~s{~n9k7~eF=08|8=e~_p%fC@V|8|gN9>Irz
zvpnufITsF}P<TSPNY6|uZv8j3m>q#PR1M20Dnnka6O**`<^a?s>DWxM%|9Id$Z@6d
zoF3%TV-dTmNy2d}#oCSI)PJTUPt{)8>xs44=sU_>EBGpQ4;aIxIks`Tz#7feaT3Qp
zd&gn&wET3mS=3KVUk2pbwo~H4R&w5^-XB(nDz{&D-;XMHSSff%<Pd*`TV~Gos)0n*
z8X`mLNv*y*5Vmb{%6uq-6J=c?!QvJ)B=V|&G1$xpTjM5mJf28HPweG`6LE}JjF7*n
zb@&95a~pAqIN>!xeEw#I81Oeq{J=jA4U0;BKQ)~sB8e9fEaBnVP+i^&q;iTzff!<2
zj8KN+l#C%zp^m?%PQOHJ5I-U@ZrDH)vX4JzKRgZ1xm>k_<V512Ic}I8;nU+DkR+aj
zLHZZ45HMaCWZO2RNttbSg(4Uygq~zC%GE@@(mvX8l8Z@I?ap88nA>C&ByT!>P^ve>
zA=`0M>v#YX*Re{QFlCCDlI$+BF7g1ahQ&673c0506}~1KsfNJwO;WX}Rr0=tQ}bk=
zk&Jsfi1jjfEbXNe&V~h7$~-N|^0`^3%1vSJ`{eZq@gleLW9v$uP3b<p;MKr<)}>29
z-@kQJ-OCyQ?l*K#ra*^iQ!rkGz#g$OoNY*xs`2-K>K}#Ara$5IkMF$t$9Mk!aJ~GG
zElO&Xwf<X+#!|G@S}Ku#&x%O=!9`Ti`%HaiLHR6YiQqk;Uh6g?F5~m%YihLLlnBFv
zp!;<^6Yi#JZD|<ZxBAm;j_=v`ULL1bK40H&P<`Au>$UW8g0M|gQk!B1lV~d=OcJBX
zQDy3B_4Jm0j77>t=#Y)nwnK-CH_2kI1l^T_yH+&x58SENWjXLu**YfURAnLhBdbOl
zU9n9Ugj$bhe?}peW3PcuG)X7fM2zi3olrCu6xNQ_`>)hJnz{nzu(qSvy<W>p`M(~;
zCjHO8V$2txn$GW;5NI-QqHmPe-7qDn1QiW2=;t60KrVh4FR0r=G2yK5h#2e9f^t$T
zy5YN{M;N5PHGyYouigs`VYML~p;xsCqx3f~XZ(%PwoDP})Hszop$km$#Fe-}WS&5W
z71{h|-~28TfjyOl^JsG=8D0&q0A@4o*wFa5{R1A0KCB@@T<LE1Xs^^;Zt}xGU~n=)
z3`ZELzf&WO(4%M=?$@Mx6?PDLtc_o3e{X?kUxMf4*wBA^AXZX}xD3ziAaR-&l()}g
zL_)DmEDN`nYut;fpow~1!v`GMP^(m&?AB-SFIvxhoSrNgvWv(}M#XluLJc!X*9ABo
z9-_TlpFpmNKq$2FH_^z(=!O4s0PCX@9bwQG@{=olC3<h<OC8UN<EU*O1yEEv9Ky1}
zDM)}JVNg)V((W@B)DaKXE-DST6#sHfW=smcb1l|M=EYZH<0p|O@$m05?D9ubBI$)J
z&t&gWV>_VQ<FJ)`rBaHXuP$XbqeGlF8RL5hxdB|xKmSUwf<>iT`~URBiY3d9%d+Ce
zUD|#F{@2eigcx9P<9`Tx%fI#ge-y>?pGDLELoF*v{Re~ovuLYl<+IXuimM416IWDd
zt^i-jqYjDN)K!qxXgg`!y4Saul)ndeCl-;^lK)g0Fd<E<*ZWTTc>n$Zy9eJzE2&<m
zKP2+!AAzn?KO@hp(7;#*J)-J0zUS<MkCEy{ncB7<M3xkAM>xssxq-GIPdu4xCNUOF
zX#ybh%^7R57$cl1IqB@hiQ&M2;7jg74p>9Rt*kWRgOerf!OcW@aN<A+bh|K~3}@|b
zFyw1Wnqh&9jZ~3RYP~{UWO-TRYDZlfC`O2T-^-TTOOWhU)-;@^Hc7dWPIj?|wmffg
zCB~_8e<qqK59X4rV8H%aBrVl6WdCz&EAcL86-1vvIL?>7U-=B@w~lLw2RpM{3y9$&
zr&OglJh23>u;KN`Yqrjcqo0(^-lEPAsNSFCL5AAM<mC-~<g4~2bdbkO;NBv3Zx2nt
zMpS;-GS*T*raS3UgV+h;_z1G<fASKLC_5^#LID8$VgLY${GZgo|E2m>gK+<kP)NIf
zUUXC3c;H6(e&S#gasTk}cKLYfqJ{VcY5sr>*TnAl0RvK5Qt(#VmD<)8Tb<^O5!!{?
zNOp8<gsv`?DwnElmCiNQYtze(-z|?v(+TPGGSb~Gx1EmH>|g8Gt<M_|o8LbGVihd5
z$%cJcSPgyI%Q%eQn=!Y0{sCMjH~9fwTjWD-$H!ivy|<xYeDP^`m~RPKOZP?|9h?0;
zw#o;(9vyT2-?r0(a!1F1fZ#e6qVguEgg|j!i*Wgp(!`*-Dd)zZxoPLfpue<p<uKnu
zxcw~l)J)!5u%Pa-;C!QYC+?D=<)-ZQIC2LFFnsB%e({&_Ox}{Fd8wBFgS+kK#b4G7
z@f0sV4fYf+KMnB|DDUgIcl#QO0sT^X`j#q3<9TEIAsUhz^wRj-L%3Xp%e*gApo_F~
ziH!`jRBe(%SQ5!yiB5+(c{~8BLMT*G&1g)XRInv)%Lce+M;<5W;LKf1UuEWHZ&pu(
zY;y@c8uWAjcIN&C9Fq9vuUBf7w6w2ptfloUJ`D-`EDRQ8P)n|Z6d8PARWa6OoD^6^
z#A!@1eFh;?y>pKPq*~3#4o_Mp31tmAD)?Z_q;sDzpFa4JYi_NjWzWpcZrBn799Fon
z(3&tWTV+F!88IdZWOH~g-%iu(wTy|mhSN5>@eyK;w%Fjjk&YslB^%ZA&W(r^)uxfn
zNA&xpr5|C%FV=jtrA{u3SJi{j@Caw!25Q4p2((E;twee*g^!EP?ImQt44c$twVKa@
z2`55i6v~J)*?iHv*CBjwYlCJ(vVk2^SaS^tnz6k-NE-tS7Z=f0nMG$n)kwoD=%d-o
zL!m$!<#GY)axs}B(b%oofw@<=oe?QI#MmMtXCslZgcAX!R;5_5e2{ByoK-KkU_Du(
zr1;ir&s;9z?P;6Nhi9cR=OQ|m1afe_oFw$+QVI-2w2+Ppea7*T4@fZp&_o=rp&-ZT
zvRqZ#s8pDR{$(y*FpIiUp^Fy<;`SOmn#e%~Nn_s{nR^qf)qr1OOmx+JT=f`<Wtk~x
z0y;mjC9FCffwW3KJss>SuVcWYk-#-(b247qT*A?7&P<za?5Zy9%&Wx1q3IPdGPyHj
zruJarDn_VUO@xh&mGn?Pi>i^YhS2FAW}cl;6yHL2AxN0PfmT1fm3f8RUt@&!R(Mrn
z^<Y?Q0$oNWE%%<}*dfT&NhtKQ&uc}Gi6uGG?8ZGE7!)}7Y1;N2h%)hY%c)p!NFzVm
zT#W&}nX`lw9y<m~qbDw?kkjSaF+4boYR%FBVV2I0lm){wG@M^4ul`DE!+-$n<~F3v
z!hkc?`f|T-Kt*bvLOP0I0Jz8{j+{{`Im6rlRh=^#P2mbbh0d@-tzvV)!_H@wIy1N-
z+EOXHlGzTAX7cZ}{D;*3Gop;N2d5<GF3JokKU>B|!0iQ1`+Cx@GfT6m%`iDo@Fuck
zY+e0xjH_&!dAOEG$n7OjFlEb_9(a1E;&4>q%ltEGE0)Vy-D1TJ3vp4G$x$)GfD$E`
zcPpqF8Y#EEG>o3v7h!wEBqB_ltAuh64$~7dW)lsGgj&+86j?Gl(2f!xamldc)m3VQ
zg0Sx@LxpG(4;f-~Gg2c&OxY(?XeuWaXr5PST2`7M?p_A>Xty+gWn74#2-V1=0FIL5
zkvQtbA9Mx;X;O5?qVCKPHcR3Zv29OtV=XFIRfynF4LK0Hc0-eulj5G)d4?Gu(|XXC
zR_{e@U(ba*uPnP}r_}pr`>lA~(pjJEki5(4r@2L77atq$l<K>0ExuwoP6rt4lw~*q
zgc9B}2a(x$q@!e?j3=;Bhh&*fNU#-OvdmmczbCqoDX71L7Ebu?^mg<`ogP%OE!GkP
z6Vk!;^@aIziSgx>wQTo%XJ6jG<_Z(*#rg5*NpXE+ZF-R_A+oRUl120T>9tj-^Q{1X
zxPz)l{_ctcx$9C-co9wcMR$z~o@yYvlkKxq2%qFmjf&e>uWpc_g@*ek+g@Cpy}m}g
zZ~tj<9Qx>0>Uz^Hl@rvVP0`p$EK{SnjVaT-ax@`U)Yz>5VK!~7YFoO!yu7%3{a35)
z#PINRm!v%$f|4KbZL(6dIotA{CJx!oI-Os8YfX^BQ6x8C2`$iQ?os!&haVcY?~QPY
zXH1B=lkJqO)>~~t{<Q?pZ!UReMLxAOFPT=XDIo2b{EMHQKB75-A8M_GWv!FqwI`W>
zgfHo(X_2<z*;M4p8Oc(Pbx99IW?EkB{Kc$~ApEs)kbd5z(ls)Nt(3&cb#9F|b8m9y
z%yrUf<CM}&GpkV6ne5njA%Ceu-UXUMQv;@u%I^Vq1Hmm_I%evLNRO1x8USQ0+m^9}
zl%Bvp0&Gjzv}meIDxcCn4WTV_LY>7y6>tG;Nor4!(Zbrq8qf)BDyPqt@yy!98^8fE
zozpMMc&Zb0KA4@>kAQTBUp8>UOgzsXfO5pVo?QxjB9|YKZJM=|-5&@c-30{2rbe?7
zHsU7P9qG~yDnEA5Jo@geToi|b!VmS)5Qx$i>+ry!_+8yvQ{4vdL@#*(D0=ms+#9*j
zldAB|c=+Lxy1;A6)sNVoC31)V_(ELt30<<t!G3hK2QSKARIhV7U7cI<OZl!A+ce?i
zKs&t)E1ZZS7?$HAH>Hc*COfDVyakDK$dvo%)M1Z1e9<|j=mT5p+4Mo%TXw1PbT(@k
zaTWO(&FN6PFr|$cjcc-OeKI6+A)HWVL;}U^&%;;CpSDK|mkn}%C$a@aU>F~L#2VI&
zL;aM|FLs-bY>&>s-DfpREzBfOY%NS2EzINjGUry13Uk+uXWQ21n}Nk1_0;|c?YKIl
zwc0uGvh)0OkHHD8tQxPRoC;3|W4Mj-Z3l<gQpeXz;Md=n^q4Hh`3sz8&+IU_)BJnX
zbiC_v>}C|D1zVC0-NJ2E`0GGy<{iq%BAk?<znm!7vV}Or8#*BKJ5$sqMfZgl`6yO6
ztQ(Wk&7l(5I@}=F|Hg<le?sN=#Bs4<#i;Fm_0~)ZTjdTjDEW9ro5vwtM*y1R4Td?*
zyQj_GZeB<?k0i0XR@%uSTw4_KC;?m-y{W@)m~1h4_;AsZ*^S9PUvm`l$Tg!u=IhMr
zq{ocj{=ur*6|Isg%!!Zq>&f-^$goGcT1`Gmjj8?uyPDHbCnsnG4jN19fTG7hLCOly
zPD-B!?~;`|XE*|KH9vCLGh_$V_U<KOHBZLPcUvJjfKesXqY@h`W$fJc(S2FUkJc2g
zYN+ha!e)VRtWadxyJowGhujln3EKOFgfrgaz$JuLX3T&srg9WoC1|~}mH+Ngi4~@=
z#=4p6%HLfJ@yr<Sn5I#my?r<hRsv=_)G|<NW(<2J+O!2eYK4%D+9|u<opJEM<?7+U
z>CM|zo!et@5RQ0LzQb0ctX4eqxL8@;@uOaIR=?Rqdz;d_U%G}3dC}96pr)$gd2Pu2
zxUHh*nSd7q`%M#F!8euI9x_5Hso=Pz*DV8Qe`Lxz7|w9$zBYBeR^n;$z*Xjhh2#FC
zsLedO$(d5hLJDlF5-Ft;N-71eYDGd?v#s^o+~>S{ALme1a@imQw(ANb56Km8E~ajA
zL$Z2s)g6gqkj=U-G21RNsoM;>Cz2CrcTqzl^Z0xwB=Wg=2+hhr;mWMR_Ny=IIaP3-
zvq!#lmU#4J$k<AbpvEVp8MMM>&(ER;;MUvg+UwQ9JJRmKr(m&Cvc=pf^pPa?k%ZeA
zDVy%_nEp9qv3k`$Y%8Gu#OJIJg;t@I@&97&9it>+m#oq5vTfV8*=5_dZQE5{=(25_
zUAFD6F5A}4_u#v8daYS=)|~m3nSb*M>>Uw1B8m1eVV(6S$tBoaMdXv<Y;;Xh@~tnG
zpUI!gE+bl<74`s|+|t1Tx9I+N!2IvCg;`0Xir8u>p`W5kQkDat$kB}15Dp}aEQzVY
z2txjbd9DjWvA?BhXG6(^@`Jzqc=QPA6f;*dR8y4jh*wzx{1Qr*m}+xw%k6`rom&1(
zd&%~)JB{=6{eb-mlWy>&5dHHTWpES11Ce;{!~v!lpHh4@0yPbv8sn6AwPE0MNTkF;
z3YGJL8dY(sUPY>@Z-HT`MOmtes>%eT6_w(I59+P7)*#SrNTkR?KqTowUZhDea$kJl
zPtl?zR@B0q$)y0>EcGT~E&j%bj7jA88P0}#&4yW9PL-~C*Avgz{fI6-pVel$dnDd!
ziYq7Heuf(;?>3=`Z(9`BN~cww@no1AEDvlqNhmB*gE(S=`_{$AD4fh@ilKJw?@2e4
zxCJBoAzeK8iaDrGMU6DkrqCHfs*D+okr~Qe8FYN3jxJSgcGU8YP(lU!&c1QrX@0)_
z5`ku_T}C-n>e1x3YyCO#)F-3dupKj;)oWD=DpiT(BpKNBD0{!3iBBiyPE~gW*vm9_
z^+_H2m+$v8SSxg!HQ{;&y?*o<;I8AmZd;iTsg&$AE~9L8o1(U9=S%OO3x}~0T62NJ
zR%}{047Wy?Ry4J%T|_k)y(-381FHF+1&mAPI*1Lc7K>-JVc-xAUI|)OPvJe;tgzep
z)%p)$=d8x|l?LFju+-}NC7GclLoCq^Y5L5gqGrs4rs<O~85(R(){52#t2DJ5#tj3e
zm_|^BYV3g*j#XRMGc@nl14S}S<zlW?GPlKac7DaSS=rUvZq(JTMePBnv9t=hImMTp
zf}cD-r*7_S{u~z$eY=HXo4ZNF7IDmYX1r4#(jJ+Qt-l;*cN2z9$UrWc48r4L8}SXi
zqg^P(6R`+y=zj}6L+aS=yapr%!qy97Vi&%_uywtmv=_2_LVxUGgKYIpgX*Y3)f+cO
zu~-{u4IL+qI!56>nIk5}<6!&7=-+I`L1guuA{I~}l;}d1%Z}*+qwL1!j08MIh~4Z&
zRE`M`(m|m^D|d_eE1?{sKT-RXPd{SqsD+E8-oF;j6`sI>6h=xcBEv|M7M{R{6rLoW
zB$rDr(l2@uQP4Sp7k1MM{U-8~Kj9WaMSm7n+(tD(O!ibi@ggE3Qru2?2?&oYU4Sq0
zC1J^v;3c|$&DWX`=cT#_Ct8YfI(DbQ#gEL^eWS*8AoL|onJ$8F{q2^Q_QO9aRRgje
z!IX3_f4Z`i?qxEPVEIdccMymp9AY0`D5Nv)1#&(e9uf;Waj-3VL2zo+a4;Ua!!2sk
zt=pqXI2>$i)Fngo_VpLP--sW56d%RZv-`No0VN4JAAg)7!?Y-CwE`4p<^jc-{{vz9
zA0-;af368wufDI)<gdZgMBE)#xYk#otVC->;W2d+X8)^3qk5%xEFpgd{!}n9swqEU
z1UxEjs;j$YeDk*&%@=IC;#zINc@aoz)JoObAqJmHBokSTv8o6E-s2;ArU@^qc(vy)
z`uXlFqevDW+;DdSMU%(fTqB|%jQ)9BGDvBzhVdo|L^^UMf9(kfCyD!3dLJ%!B6uS}
zPNu>)9TVB+fipVe$HKJhbDFLBs0qde>(z$2Z))`!URot!avAH??$oyUa4`_{t4=($
z^Kg6gC&F0QtOwSJlZlJXhg^%*w1R_~1j4B_X}^>O5$+OW3{?N>HxjLSm(VE4#p#(h
z%4Yl`#b6O)jLxNjtybQjcsMV(u1V%|Uh;Akdk*yr`qDN%{=DeNnqO;saa$XgdcLOi
z;;hn67#Vm!A?y>Wc?lmUcjdm_h53IHmBv7hsnf{A(!{$!@iwsN2yuq=ouL0~$qi4Y
zn-U4|qOpK-vgZFTA^d&&@!z|Zf8YJ^|L;VWur)Su|9i%0N}RNv7eE;PwBmL+<tRj<
z$PZ97hqi!g-4BgKu&jWP`bvT&61cvm%Sv~_<(OISgGeG^?+=VlC!Q;b<NgH?im`(&
zsm6%I-PP3ffF1wF_x<4++CRgbv(|o95HxDOP^YfW*l2YKN?0ZG+gTh&s@%}I`p`2j
zSjB$L!|{qFaqrC0kRlJpB{<RPH38wYH9M@w+@VJ0*__3GP^v)8)~x)u46!zhXC|aD
zm}>f>k!B=5EJ&y$w|VR3U12=9VD?`M>2O_w-UOWtKRnRDw>aXE>D7`16grWA5hE6B
z?2Yem=zSDSfUzPJCA=_bRmfqSaC5@vY#sLFZw@j|+?i|-v4Sx;Yke4NYHZ7|Gr=t<
zshhR3Q_=)AB*r$#I~%h0zwz(do?Qc97-GdDInh~5ueMIjyyAPArYhWb29N7p8q|{a
zWg6^UN~Vdro5-3mK^v@OgFsl_K~dd-D<3m2bRVUu7tX?sDiK!*CkvL$^M%gZ`UXA+
z&;ZkoDltg#l|-W_T_d^FA4HnBbHkd;C>(1TpqWV?6gLNWz7kKU)mFLHPCV{0ur`;|
z>se)Lfh4y!+^Og<T$H~#DOdagoUle!wc;nsMSWi?EWq{@K3vxeZwll9wtN$cSS=Q3
zm2cYQ0plh_dFPp5jS?A{zAjH)-leRHi1O4(aXttPoSy-!I;6rvanm)E_W2{eAS%vL
zTLR+C6u=t)jT5>5y%OTTTH-(VRw#~30yCoUT8LQ<zm(f8(`gFs8kMzPg&3;LDB7o`
zL1rZ}QZi%a`4v$M_QGEk3}n>hhL3|KcsX5;j<&TuKY9Cr*G9488>yG-&x%1xq0*?;
z_A&V8Hr9v1?29(^|H91>XPRhCN?v{zCYW!AK#E+@B14f7D;zry6PzBRGWsHXE@YL4
zM@}?OA~K~#cJ4TfZb0KfLuB;AqZ*eGCc3T+%F{x%<2sX~z);JyYjcRl`KTI77DMOE
z-*U?RI&gMJ>E@C!{{uS4w1ONDR4(}9CL@<zSc>9vYR`gVIw`T~fqp%iTeK+agn=<_
zZj;)=2zikK(4ZWp0O77y32m4~nXY|Ovf%z}7YP}IHk;^u_at2N&g`&#!M8-bpfyfG
zJaVb7!IdHC6{q-Wv@hrz{&H65*!~jIi8{kDreR@^@^p@i>`51^&|`9b5Nv*k%3&ZT
z6R4$J2cw=L*_7#gd7VN#1Ajyp;D=Uz6~HV6IzZv_Z(Ioax8C$0+LmN1rv+n_;m=jw
zW>`vV=*A{h`QG|XD(aN-701&UI+~;aMNKMmSN1co=Fx=7WL5b#s+T@Yvo{RgpU@`8
zL)T6CW7nDFt1QTK^=nzOa<U1xZ&Rd)oYNog)A)WsD{*2>>5SAy;HFv<8Ac8gLp*$u
zznOYEh=hlRD}9hc#@I2tN9&V~@<bed)OcWO$?MCQIm8UQpoySC(?nB9cPAS=?}AyW
zqL^TIrN;Ed<`Wr=-#JQ*$Iz^6a+>Tll0r(3F|b%QmL`vDY6yK}C&7`*P~|I2Ua%IF
zLR-kRNKOFtQIIl~>2JQ1t~Wd#ESxNbDM;|OEHi|cFmGh0ByuULDT}hKFU5M^bMBp6
z&|aG;7hy8TTY}x>Bg!M(;un4>FqJKMnrqL?Dx)!GvC^DGIhF`hXDHC*+U|hIJ`a@b
zm3{l<n6}WEA5$qfr?7G*0EaBNj!6zcwi>Y-@N9HY?Q*mkxgyVw<JJ>pPc$m8T3Q`m
zLTC-1lFv~tK(mcToTrgo!FzO>@Nl%;3wxngSZGc^G<Dx80`c&stoUKQ$ug!uGuM@?
zQmwm40~ut{`JIIp(Qi(JO&K<VNsG7;ad|pb0}&U~l7%eqM+{kHv1N?`Ga=y^8>JC#
zR1&t#8q1AUYNm66!v*PGh$*gOhm46<ll9?lRd05g=A^mCdSL?In;j{tQz$hRLCGQ#
zstB{AN=~KP=wu-@usBCk0Pe2U(B|tmu=pZ3Qt72A@lhDFsEfCR=?qlr3Ynt~Fy1_=
z2gfdyJA=4D_ByFv+fLL$8BmEC5WEU8WvH!QT`;<aT;n)v>n;06a1Dezr42=fa4jU)
zZsT2w)|hGc#X~5#U1(4-2eDp<4Hbq+FQ_30y<SM3?+iLAa)=z0C_KjWK2M6hYOP}6
z-+cI5gI6qv<xHv-IwV)ROliHXEKpR%bS=j+lp5!S;zS_Ve}CAVwC1~Z$RAqhWR1}7
zpL#~%M;SdOa90OI;?`e(cwjlnTt6c^d>`~UipN2A6dFnTP97BCbxS(LW9B87FoevB
zE4S!m)=~$Kg$0LaHxpNa%fjbMB;?n)dSsS5@fuT|br<|V&=oVk`L%tnP*BMbG@iWh
z+YEg)F^OAVJQ}IE5)LuUFB=2l^$AcML^}V2fuEP=5J(R*>8%9LT&i>dtZ-45lh}(U
zS*{xAJ-OGE=^sK)p8mfh;dmZuGiC)G!>{`9q^@e=H!t=v2l*U-=RS5Ez1wEfMwpp<
zgf6KVHn{uFrI<p;!}$-}<)j=0W2aBJKoy=0IEO~EittUp7$$T%o3QJc;yI`7HU?I$
zAx#rb!&d6gLnY~hmQa>L^BHG^StA=TIU-!na*N8&;c5*c_2lhPuF-KVPx6D5HYDw3
z#$nx-GIqwnhdYMfiMIQSe+nyGhw+P))%&RZ_{7mVz~OBUd>H=9o-#U#nv+2{Eu$8m
z(Vw5TBvC+|DjNE<nm*4FX@tj*G_?KV<ex+Cn`ipr5M7D?*_;FPMs$sCc@Sr4?ep=V
z?_-OwU7p5I*Y2J5;{1W}iyrCIbn*>3utWAur}v{x<ZYexR;e)x_roD&TXP@WZ4nn}
z`#wO6*WL=^31yD%sj|bQL+!O^{rpp{5cLiO*<HXF@8!+e%eP(lEwg%Xy28M>n~Z;Z
z;p{M4tyHi#PoYJVzzom#G)MwZ8Y)1h3+F*+vbKRf%tsHK1}weta6`gt<Q$xDU-!$!
z-Pa5redMY6vm$(O7N3f+eL4eRYtPuNHq_1sYOEZH?TT8Ev<V1s^kWn-F7eZ1x0(X*
zE<=a$KctB(T`obkfP}mWNXT;k2{intH1SW!|6g<R5_+rCY&!#Aa%AqkVBJzk(QhVZ
zWUhDzMN=lBPf4}FK>S}~5$r~Sh#RcmH!{5BWMwZVhYwG={dp@pQhzE!vZ901q|_?*
zQT6ZTMgHJ4q*%h{1uH279%WcotvfS@&z5fQi=t4pvC{j-8&A=J4~mOip7a&whbqo^
zA!Y>250p0R)57UNfv3E^JcvfTD|ueYi_;a7vE4KUME3pwn9jU-%1h11f%cXNrsQ`n
z=Aa4d3p|9jAEjsK3ZgA^i`eI0*A(s~zxjF|E{g0R_P35`uJ_yO$JjoIH=4<h1I+61
zkCg}CsV#(Xl^G(C`Ug{uwc1%jmsZMu%D;LzvP&02+=MbXpBPoT6olb&+{v9$%oLW(
zyOoUW8ka4mu6P34v-A5n`a#3?7FFKB3^5w@8L9nmM^}829@TH5eYk-3>@j@rF@4}o
zcy4$Ij>;gB2Kh3x@H}Z!B6a4=AC30#&PKeyxZDchY5xxw`u{()^<T-HZ<H8(75LxC
zTwSR$uR7IV4R>sf#QF@>H#JWrlsCY3Fo$|HlwPgBlQ~_}zmYk8_)|WzTJ_$9IHY81
z^)me*1~>8+CUF>Jld3trR)4`Q*B{9#&prh6*#Nl3(hh)IdH}d3_7AuPxNe@-jQkgw
zJJd;pYsH9UNajI{XB5V(03Q`Duqg-1*7{-p%8{gSWo|O$HO1?CAyZyy#6RD^VGDTI
z7NydSeF@nVIyS9M?rQ>fqC0;*P&cN=;F}tm?9lpx%-R^gr{+yT&#eeJ<U~r|*6(a=
z3b=`-49-KuYjstOnXwfhFjD-N=YwS^3bQbwQ3p>a8z)BKcs_{ik}PE;3oBF}nKUk#
z$XfLJ(<1GvU$kpsbCzD%zbK^=c0_1Q#K2t^Xzy9tQEb+uC*^z~IQt?lhyvgidMOy%
z2mo$b{0(jy+(Z8>nUiX+pH@To@}-gD|714kUoL`c{CyWzj&XwB%|3M86%71W$nPLw
zBT|S6`;_q*ECk`<3~A2^FtVn7JpM<`mQj&4n%m*`tCC5UB1vB(Ro1JP+Sl8c_>QbP
z&X<}hHeFk6JF4$zKc5e9$;fhHdjLDIUO65!ojwmYocML0uTH1GrV_r&IPa7O<fK#d
z%`$zvW(MYGR|wRjxR#>*P?0b@Q`3CQK<i<<!bV1YawGM%`UQ&|bcHY3Y4OXAEc6%u
z5BKqy7MVA^&;22M>F9oZnK$+iF|iNjShuU5{U_#+{J~F5_vL}lU0!6r!PoX3a4<d=
zMg{h6ApA5SMBzwj>AY4&>wHzKl!!9Tl+-J89m{-OtNsD!s!cLeOC3|g=7PY~(4>si
zpaNxe8cAAIlRaL4qH%$9?7~6)3z{D&!l9|!EXpnT#p7Wq;}Do-z$&59OeT%ddAdwX
z-xFal%?v?f+f8_^(taf;Hf5Xt)-Gc+PPkyzh@#Y7FjuG$$wE_V;t+w3v*n0cn#DaC
zuM?DY8F$_^4jto|r)_A=J_p?4<J+WlYUJyub!y5!On%nPIZg)E%-K(FXwn^-PY@{|
zHe7UzAe55PAUnjU-}t6IiTDFd<w9Fn6=TkP!~+PTGE{yfmIltVK!KZz;sIPJuzYB8
zt7_{-pw%~m{lEsoWyq+)5OIB-QSMh{6N~-W&5(Uz)z<p)ebf5=xvn3_hQ>DDiskZa
zU!FS2j3Ry<XyAQawe0%#9BM<seMM87c!m{mKxzxAIk)%dtogMl(H=cV6??H(AeFQd
zM`Jnlo>H3XkdbyL#nzgo*hoIfQXigRKY0q(Xc|ux@s3TPfrJvrDViwJR`rm8ve>vP
z=$g=$I*(z+3h2QEJpq)9xE-_I*R&)j=pVVG>w;`eA*&o&f_9}<*=$h3YYEgoJNli(
zg;5@sL5`}$dleV6i_rMQSl3n75b?w5A5=Rl%$8zr57967<EHEA(82jxm3{FqKj1dM
z7bj{x71Zsak;xfU;6|(>BAG<Z2kYN%8(hM?n!6Mqk1(?&bKWlQRNjLhjM!(lBqGy~
zT^_4s$Mr9D_L<@<R(+sFfcE7bB?u^g-6>htrJ=~ehX6H%hUiIOW4&u0H0UALCNO3n
z<kmbJvCN_=wT5}^>}A~&7dTcfp<cqvwV38~pf_*!*$MAieR#p`VNyx;XUAyG+)!IK
z?h~>Vr>Mnzw5Tds)B?7-4p(c+Ofee2CEk%;+>D)4LLHBQN6jT%z(Sii-(65S`C^tJ
zl^axV(==~&$oWOeD@5}MV;(vZXz`msECXzCh+1B`QC@V<<4k<9N)|}sYYA-}{Sp!i
z41FbA$hc(s$t99<Wu%8s3Zmtd!_5#&VT2^@=0yWHZYf56LhNpVIAe!K!F+J~8ebpG
zJH6p>(n6}dy{+!kZlK_n{`X&ztH*p`Vd=;~e%pO*hEp1oe1s3$c=atxb&1dsDIE$n
zz;eb~!AisTlnLC)5s>@Qu^Q%3uopMCH-t4)4DZrdFq}E;{Fga4+S8!OR#2M)9tovN
z;6hpJC`3A0+9%MV1=^O4t92yyhYKtyOxitXy%|Yg_-)=Js^Qo2EWyuwe_yg<8FkaA
zLWv)g+w=TxNC8DbD1Z)&wt#Yj?r^BlUy9h+N8<~dX68d{g&$F+s|6f;ZLkkQYcY4I
zKx^6I>J8U*MAKQe-f3u6tUa5T&`jE=f>ztH1Y4J*+?CI+pgsG^3Ij5FgpgHydUTJj
zSueVTWM8807+=Y}XDGEaPKsvRHyLVOyQjE1qHZusmUrJnNbCGNyLTEPht#(Y)}pCU
zW{nnd#I#*@o3J-woJ71(YmPAbdv+ZBOxfb<ih_=Nm_3KNm+-2^F0`x_#9+Nr@qOEE
z`GY-)S#NM44|acE^I2vem^vDamlMX~W7}<leI|1QbvabM&ZNDG0r~}E^aS2apEF%8
z3lgLl4Wv{Fsa9|mMRpTOs26R7)JQ^A{ba)VoXO}ui35gS%O2D-IJDbuI$=JeQL}wX
z+G?cC=ZFMy;p*l+ryCC`R|;`a6BC=*Fv9>iZYT1hcRFy3sNVs(dh_jdSTv}8yZu$W
zX(zFdO-{+@_}f^JZqY4!SZ&F5rrtCY4=&|HwyUEm_}BupZMwF#d!lUsY2!Y0@&=J!
z<DP2UWWxGBoYX~92ThxntHu+ZE~55T+2Qx(4Pv#E0#ZnQDOc#YLn%l6*O4-6gh&m?
z9-f~onCHJ_PG;c8grgs2Zw;QA8ak5&GK76Z@ODsV%FP>$<D8vFE(t~x#P%5#6^09w
z&wkWsK4lZ_H~>0{r|uH9cS;hsFwg0va!__ny{n}i8`w5xM20)B?u!XNH_E!-B+?}q
znjR7C>{@*8+t$4k#M8!0Jh>W5jBZ=*vho;>OyIc8vVRtXkk;tyE)YYTb#k$E9;~p}
zGcyqR5!`1k#OH-L3X)-VJ~eRu+DkW@Us@ANvzBONE|u6!5`tbr*JLamE*k5S>Qd;^
z1WmMbOd*+e(><@79{c*t3*tfG^rhd*5}JHTw1a(ZkHpe@;VOo?vG`$SGPUWB`Bu)%
zc)atZpm9}xIFL6RIIuXd_zh$_s%aQqnroh>e!_DgB^nIbiFITT^f?jq@i9H*k=92q
z@{#VX4{eA5YAwb4<>>n5IT}}0>5C-K1L$H!GpHe740SrPGI>auER<4JpK?iOy(Xe^
z*8SK?p&DA18I~Ob5{S@NE-={~DI&M!FpJfN%3yalp2nSJk8QlSb<P%d3E$k|O_7Y0
zDxH;XySvFP(o4oIg*w(vjkOFIL-(fpRE%pW?NYY%Qwm27q?*k6i^#(8Y=5anPib!i
zUv=s2nQKU9i+K0Eab`QwB{pbUBW2=fNNn!+Y1sJ|(SVOF!~5&43>G@jvFC+bPd$u_
z)5JM)ZQoAcnVcHQ=z@pjR`w&1g(+F?fy9S9>ev!R(!)heS=alkoIAzey;Tg0uky8J
z^wbX{eXq(Lhu-gOWgVo6IONF><S_uew(XRDtI~IB<~q%;L0{>l__?)mAe-kzDRl@+
zV167HaDxZ^UMK$%xDaq;B~;sqPU^Ug)nyUSPF@sx`+<<T4!z!aC}HP)OMd;eCOYvF
zC7=f%Y|L-|Zdiy{wNScot?l*jE<Y+3u<4drw6>5Bhq;&xw<?boC3YpM>t&}fx8th4
zF7qlw(B8%mQCf~#iq1vbX2_uf3o{ii2nEdyj~{Ju9TzP&O!_1)%y>W+$kDOqOBy0;
zNDJkAe}-?9OLEaH%PGiA5+Vm=Tp}MDUZ!^YB6hS@-$cg8w;ns#38KMgdU~=dUv8}M
zm5&K*Z`7PhhxeS?DR-pC;f>h$c1^~w3eJM|oHQuM%jEv|Dy``*MZWuvN4Tv+c{PJB
znH}@T9ocuVoOdyAo52~(cjz3xQd3Xp65DG*-Lzrdz{S1<BQtLBSY<f`D|ANNwj1L&
zQiZm`QM9>v7X*bD5Rpr`->$#5uFz<xc_S2o>+dyg%Dxhe!Tp?I8*h*~r8Mh!RnG<q
z+Q+fACj<L!9X%u@-HSBwQ}KbO{~f0(A!GL|55#-UXxJ5lni50WPZ!$<mw30q_A+qB
zUthrLdChe;nK9fEg5Yx#_7F&Qz!~qpgnCegkuw-pS5%Lwa54py3uCx^Dd6OLVHkoq
zNLK!VGo+q}N@jIR(hH5j-qKiphcGnF%2Crf#t{q&nc3O=T7X|cG#E?d(nxAg7@{Lg
zx|GlzE?HS1iQXS|Z$3;KzN=_GEGkxJ3bhpr<&CMA2ib`cDOL+!r2ZAY{hC5jxhiRF
z%@f88m)Uuz!aeiE>J-vDrepdir+b2x&dTuCSBjS})G$0I`k!7LpmGgza+jDNa0Kxz
zIB0dMcn65mduht-vYRQKNb{H$dER_tb33DHlo}g_aK91SOgPbx!Anq7KN5a-I`Naq
z$mriRcEti6Img0HIb@ySaGGQ5bNU_Lx*_^yJX!GvQYMv}ZX+(#R~zyExTmb-tt_dl
zHuIr6J+3sX%^vkcJ6)}mTW`THxfFdyoFH@=x~->o5}KEvs*m9YZ@o3yXba*FYr93e
zz}dGLpxkH6mPhZph{V}+Go3OZkNu@_Qa2q?#5pYfu2Or6?M`R0X^1ZuS4)Sb=v@p%
zAJAr{p+?3~GkS}ggu;8V8u#qQ5R}!ABtEHO&tG2L?H?g=Qu$li4#=!TGcL}cUtE7j
zctz!qGhD<;C$_qO{vF<UaSE>W7TnqLhtzC^Q8oygl^1`qJDib4e2|N$zhPGrvUJFM
z@Pf4z_Yg_#8E9sK?{`bQKF4894BQdMRZhCmRs(iZ+}^g+Sbb1-yGkDVfZ|MvLMpX-
zaami*UvELJKjRrV^E6O1_eKMcMsZ*kZ{5)RsGrwI^iug_3R|uo{WUwr?tzL)Y9kYL
zvWl&c)nz=T(#ZtqNzP^FH>_Zfq34`^fUG3wrRV37Tf^_}Sa_KOTnfnxk5?hJr9JKW
zz)pvh8J{Y*6-YMmJ5vIT0%NO*td7_*?^qY$&Ft|;5%^;X-I#=5%U~|X0Xs<hAGkJT
zQ?~@|w<J#JfX;10BXsth;PYCrYrdbZgPh=B<)Z8oriJj<BW%{=#*MPBP;+0mZbXn9
zqh}|H0qzwH`f~c0>?6R9M2U}#bCB@&8E{4t-+*QR*xuoMX3kD=yPnKf^zJAHqDO=6
zlS=#56v4zN_p_Cv*V;SrYiY2fbbkxRjHCi3K3BD(G^Xw#KiBOMf`ypj>H693c$YbY
zF3B>{aM6kUGoQ2vJ}OxEF5{M1RzQkaci6q(;M`+f>cGzhJrUifXSyc2EeJ0St0QkQ
zVlLqqQn+GNmnCy>Wo%i_XsM>eqZ?&gY5jTLPU53*N^kcLhZUvOgl##T=l1LGC3893
za-A!ity@Z~>DzMJO^Bx1V-Ih>zbR*8T(ZX|pWg=-O@h-(m{Papyj`3NmXt8ZF3G3*
zhJVs{p|jqyeU)JXEJL>R&JplJy}hRIRMhLYeEWiZ#TEEOxVssbYW9{&@Cq5@mc%|I
zXEBv?jo~--qvzKV{k7H`lwV&>mQl_R78^3G?p>TLjw-)=iKy0HZ0)u%M`g2E9e7uW
zF}|PPhpL_)-)?G%E-hd15@Yrjo#`k|IOS`uQc?wLC3Bf8Hoq+8YJ)vu`LjDI1_w}%
zCNS!O?VZUNZ&xU(E%;DIOLGR(#wg`*H@Nq%Fv1%xEYvX{w3lm76EK|!CaJbPoKqxU
z#BEKq35IfyZ{@Su8Nv9<vHL(LUJXavgv!H#c13-X&1}^;2nSJ4!Q=R+ILNSTQN5$Q
z%jP5`y?BQ+0rR6^KG3zbT0A-*++&-u**_wDLk>R=-8l)Zc(Ta!1#!rJSt_Xu?p=3*
zSf|p0GxB0&Nlu|?V-!xNT1_W}C0%DO)Eb?{bo|cDom!N2oJXTm-N-LtcH6qQ&jlXj
zQ5AcDxx^UH2^>>6WyG`)tO|Bg)oEKZ2pX+38T*@*qMRdx&>upXjlb6{7t;>$a*Kjt
zBnFlVt0g7*9rj=A7m!BfyWN12lpCOBB=A319BoXT4V;|q98KuW?d+_a=#@>JoK+O1
z#Q{J5F{Grd^OvBBR~ULdS)^7R6cw>lT|m7@-g>sQ39POWl9q%oe)YU@bMAtzLyPDW
zYR3F`Pz@PiS$$SfGe{&M-`PY~8uwwg*MYI$`^N|HPhy@pg6Lxgp;SmCGzZQNSOdC1
zFY#c88hydu{J>Y3pk^ag>8RzFEG}Udt=(7c3a5&ERG#5RD$39R)_JC`6qta4H#lJi
zn-q|K`=!Qj)r&JBN_(C<Bje4zr`#Ki`|Vwrcjd+7waHCckwY09rBGF4Qq7wS<p<p}
zFzfkT?Mn!FZoKPxO6yxrP5Jg+=T&E1wBdc`BJS9v+%{Ut3_SDA-p!-m;FW=Btdi8(
zsA<E-Hx|5Wssowg4K5rJV1z;Au)Q)mS8Zt$o{htcd7M#c7zb$9na9EH+>c?#>js{0
z6o&<*vO|}mJ+-#Pdu=&G-OVJQ;+24C#$TyI?L`I&ltI%a<KEN<kxOCZ0T<mUvl%Ow
zCEMd^AjSB_izPR9x!goz*$1HGxJmL3SkpDDMD|^|Juj2*bh28{i(tg;;}o;d7Ue!Z
zy%3*m`#riuH5m%&esV`OT(kK}_VoA{==_X5gHpvzrYqK1TB1yt#z+BvPVmM!2|e+m
z+ATcYZ~>(?%1HZtD#Y0HL*me>O}brtkl&b($^5g{HV(@e)g*@-t!xURC%SC}>W&Sj
zf;W1upQ~$PqhQJ-GPM|A2*a4=MQBdqt^q<(dMMu^RTXP;kK&Hr8<;yk7oL~}?kOX&
zRkFY{!Jk6Z!n8G%aV#MP6bR*OOYH0;vQgA9gts8W+9xV-&MdkLoW<w&DPTU+BuQZA
zU|uN*hiIdz;kYdQ#%mLlewT9Qi*4iGRP%;z!EG5XK7;PxiGv<X<0bMM;6D`D%jx^i
zWp5LG`R<3pFXD3+Lehx*De+(6@aAY(GfA(&0ao~jtb7DiD-Q=yvIPeChv5J08!pBM
zrt~K61~&HACiE)b1Vu&u{$0kxR`&1CQKY~t*Uy09bD?Gqk(6vpLxAD`Rf+-aySxQy
zVI%a+dpee3(onAi)dw0f5-`6%;09BnG(qv&+-6sY8~%fjZ<pT}4v}RMR4uqQI4-zm
z(6z^9wf^|n;UG{EPLm-hcXo$LYyNx|l`?@{?P0DClu|rA84yb!W+fn6y<y8t!2Ol2
zSr$XNVfL>g$o1_%`KL6|EUO)MuwN-C^9Yog3ey-PVD5!Re-5NADy2;y36`U$lgB3v
zN(U9`L@y!R+sW7hvo45CUer8|BGZlp1+fNfLHU3@e2)6Ya)^PN1pYJ+!2|dE!1gM=
zQj8>ix}jEQ!_4}DQHtLq*jcX-Fg|AjdzKrjQ`xXjD^bZVIb}+zWuBzH^`(F-{g0Cx
z6^dS?xc>)#y@C7x`|D-o`UDVs#?^{c5%&&63c&Awk-2}9fe|uPY%Sdgt6yU+ijrEt
zYT-$9|7+GMdj7Y>Zp5as>DEL>+$P{A%IB9E!F3T#Zn$(fO*mK3ba?jVAVONP_vrj2
zBqPwybq>{L^jX6^G@vGE+|xbDTyJkCw9>l?Wk?op+zJJ*3<@u~#Bg5PZ4i{0(G#3c
z4r+<gwxJ@sv&2G9(7XgfnV7V3c{52cGTF-P5ogIbMOq8CxTFcQlbSQ{#%g~rGezvu
zfGMkHzrpEjN@938Hjs9!J4B8*@>kN|IH)L)K>e(qIPcGPIJpB-qjA5+YxFhDoOShc
zJzv1C$3!9Nq*7RG!=N7OMw45J>b9sj)<djwRqZ{1qsvSEfd4fmcnve>$^-uUeSq03
z{by!R@|PR@W&a|s2G%YHfW+~SQ36F?IUqp<AFs?zf=B{imvs43Mq9%uM#10#kXm_)
zvt*<FC7hN$J_!D?p|OEJ{9kcQQ@CmfX9Kib6H{kiudela+xR(O=p0hTz(}FF(Ofa>
z)y9-|ScMPv8mINy?&O(J*2fYz&3V3m6kxO}Q?@kGZggL%bjn4U9+mp%Ep$}1aE+a^
zmGs4YEA<KO%Q}?k*@-0h?!4Ds^^S;+Y`~qsZCH9A%xy{4R@F7528;(g$ehB=nBAH%
z>^bRyH?Hb;ynGZG4|5to+m&}C8u3mZ`W6)Ht<|i2R$WixbWK#8{{`6!?E#bP^JJv&
zYkv+trD>fyy|S%~L1<e@XLF!dXvSj94hEP{raPwoH40=y<1Py3U{ixzZA+mIqG{r9
z#3e}RTi_7|uzXuXYSNq%<9=@)Lsj%t(J>ZzX?tvJl<_3YV2=}1`QGfb*FW^iv+}YT
zlYqx71LXAo<N*!bNn{M%m7E<dY|Z|Bz<=tNH(ifGOLGZ`et|Y^5>bbz_@|qL3IiDe
zB^<3gW3CT&Eu1I+)Vsn@eVT%XPlq5$e>w{C6OmaMu*$!dc2l|r5;aQCWM=l7a_hd$
zO5^){d7|_;$dn-pDUcv>pb;damXQrf{s!Y=0anOK2Mnf?DbHAFm?~IT^2{Xsm~{HX
zS$9WN<H)4@S3Bya^F3$XSsdfQxkJbA8iv}8Mou%XV!r^w^vOwNfnFU1m@1`W;_9Wd
z{Hp`^+5E(2%H>~6Jde_C@bIWT)~d(Ys9E>C^OQ044i!T_%SPgIT(MVBXf>*!gA;F|
zimTd4?STC$nhRFYZ^?AYZ8~+qzVA1K5!d=MEAKtOm9Qg8rgGD}uUh4-JsY?ivkG+6
z#%Fj=opv8sYJzEvDc0F=fm(-nxA)M5<=9$)5nuxdO`&VPtwxBl2|F%Td4#4ikXpaS
z;lj|}kY*3jH5S#$rJFVu&PKurFE<w%zm2kp+Ec~MJ1)k&Q%x~Qf2vVR0FTPHv?|S+
zMMm`zsDL+9It&x(8-SE<Bf2JMU#RripF=$k;KT^5lRY6p9#^591UPaSwMn!ms|Y%a
zes&q*1ba(46C&o}L(0L^5H*qGmpEZTt-7wo#GCCQuaTuT2aNQ49@*B@h9~%Z1?+MR
zR$CG7a7KuRbB;H2tR<ZB7)#hz#L^N|R$9t?gSk)7zQi_saD`Sb|0<FQmv^B!3h^BO
z#tdT-m%AZc@&HnY{VX#;>YnsdZB{TR{~5OQvCO>IT5*YjUbAKu`o|e}pQoG(UoJBK
zY?$24RoaJj_X4Lgh>H-M<D#&(MROw=oKJll9}a|v4rJ39NTzZ#lnR>fOetj(sFBvo
zc<>66f(VI<yafkL7}jL$lGwL0tZTVSRubr3LoM?mY?fU@nb3Ye8M<q7N5s2j37lTt
z8wn_BA@X~q?LV?UOdrJezr1G&;5~}}QTQVPL_Q^RJ4fd~ohRvU7&&y?qD7}&XDlT!
za67+Q>&WoyY#>?W_k}O%%8Z`uHp8wC{3otbcVqZFKYIMJ=~xgM(%p|k_{ZE$*HIP_
zfEhC;COEwgI4?PwI{@1MXE1^|P*@4=`CP)RTtZ@S;h@OMP)O<W5sCh!uS!%mRK+S2
z3=~u4`%JY+!W$ZnSiRUx*@<=fZL0QRgy~b2N-Ozls4za%$<#}lvKmiu9;;;2zarIk
z9=IN9E!;9C<y+EWfP?84dHL1X_FSXDv!`>>b=01s*zDLqLn^D>eJA%|b<3a|rYf?R
z8BUjq8K|?dpHx?^Ri2@hIuB&~(GC({-BME4tc2n|Lvl(M6xo6-LHiEBTBOP1C+AbX
zm5*d3n}sHobfbzt-(phbc)%!$ZH~-nLAmemQ`tFV&B{^XjFS?^wc@L9n2wY%i|x;?
zG7jMEEJLIn5n(naEZ<NYt|qjF`d-S~2H82{0%I&NQibUjI^X5#4b#|`rp`Mcw_rGm
zArnV{;u>)j1VVk(42!)d<jZ7qu-+PY2rD0vki(5$jgeKCHs-wso6CUg6cs^dt(8>W
z<jUHLZQZN4t52XDmlsOiSfa=30nrKhx;R%L=7n)d=NNutOFQ6rE_-Exi)WQ%F5omc
z3&89M_l6Ocn@JT)8e%~)o&g3XADvt8#Tc?kPvhx@J7POexJ(dr23HVX4Y(n<L}J_%
zPxJWl<1GK^h^w>(1vmbB6)+|)C%<JhGIBvg*85GZd4$YQ`Gt@8Blc~K%_cl_RJ82d
z^PvKR%oC>26p}Zb21?&Rzl}S2op#nBr=(Bz5)_e<Ph+<Lt#y<JaXAXt99n)oZDF^N
z583Ty-k~tN4G&C*Q*Lv-EEJ?Fq6DA3LxisO1)3&ATv+qi3DWweYm>qwTxpEk;t;nb
z=1^RP!?PY1aVwH#Tci?;6-wz2R|uK!AE_s9CTc(oAa==z`0_>lfA2hku4aFO4*x*V
zfS-T5km|VNUo;aJ$f)lM0fK-aQd9<r>8nC~Nh%0QaF{>JNuOmX84L+#CP~uvueBfi
zzlK6oDGM9NilF(&ZXS$-cXmXT$~HE9H=L&1d^)HBm(kwg^bzFV%)j*nmg3{xz0UZ5
z<xdmj>?EH2j_>9m=lNKKnS2nyLOd5ew#W#&3q^(^&7^z?-ARD4P;y;%T=sq!$(%X4
zAB;MF<*mDiX{uEGW9bQRxh14|l3m70Zk_ERq&TB3(8St&P^P;mVwtBS1u;EXGRa2>
zrafK^({d;}Qz2Eq6e?sHqm$7o5s9rZI#Zp(ls&j<jGc*PK8eXCTVpqwaCoY)rPc-^
zr#X=l>6J%m4|Lv8{avs4Fc-<9W^W=F1Jqy@6e?Je+dIb!Gh52ap+Wpiw0{~il`<_y
z!?2Y2{pEsZ+bM|!NpWceaRsRb3T&ETBBT$sg4qQE*H%^CV(bT$3XGcN9j=0zIpLJq
z0+sv7FFnR606By$Yknp%?SWC4jbsB=f?-BAEXjxbin?j8bPd6X>Oc}66=^Y1>P};3
z8n4VqE(3K-1t2zc;iRGaV~?=7Hx3qP?5@GGv<r_ZJT*+gbaza5Z4mpvnWEpgmAFu=
zPl<BiMGJypZhxFcxpVl!@W~ro8FP%vcUYY270f}rUfYvK!@+bkz^gUyC)%Esc7)E3
zkvXQ=>xXA!uYwyoNb#{>J6&LU9v$0(^ozU*vGK2B-5eXn+mt9EsXS;(?IF<Iz-F6;
zmA(Mk?R{CyG2aT3axF|b!&r3?$`oKGn-yfX<V4GAmF|(=q5)^$`Nby2*RQ1~G}-R#
zO-gY(TzlUrVS7IdYTXwNUH=Sm169rSo$LHz#!l=KzEPra3s{??*Niyjm)@gq?jjZX
zj!8YWvcF2CUc;=`I0Rh#ou_@wRn2;QW!46PEeE6$3f(Bk7^J0--L{~?W!}j3aDg`K
zLvae-!&h*<e5xy!A(V~65G@GK5eqR%c8TN0qNLo9cp+aKNEfqx3}ZL(DZ2LWE~CeM
z3G*_JK*NshBe2B3NqlxjAhQ$ZHSsE@?ywtzU*=CD>0El1PLfgYkmC4W)$Nqn#k=ud
z%$=ie@WVyDeR}?IUc3Wc9ohc_;13{Oe+T}6FMs;2;<#*|01EFs8dW$^cuf`=%>o9w
z!{33weO|3Sdm}NflBqa;aGc}r*gyzi0u8gI2s%(IbFgjN>2PAw`_tR^YipkZA{iB%
ztIZkWUQGzfENp;>CmTtv=GZ>Se)acGd^u4bcSB-5vvQq%2EP%Y_yLW%NFGEaWiYo%
z*|^EqbIe;Kv@-u(XL?e2O+IMhYAw8WDhwqyli1<1VeO*g1!QJfT+#x8X!Emr^sz&0
zrh87AgC=xersfYbUtv9l0|jR1R_73$lA9dJ$T%uEoGrwN_O-FpaWyt0QuUEl4uYfi
z3L@+<5G$T~a$`RP$e!-T{=S55bz$1x^Ort*AYs4EJq&_TG}Z%`Z%w3Y2UGk4Tl4i*
zj5%a+b$NA-<^qCVGYhoVW_wZ5)~R;X6E8^fBdK6xlo$F+>*WtHF(Nvw3jN(GPoV$G
z+g!8@!(afwr5ONR{vEZ@e^ajbXY2)hDmfcCoBV^ZC9eM^B?<kU9F1UPX_TJ|P8NVe
z*C`Q#PzWwXio^nKg-|{bp=X;)Xq7x02`i-!K~O|QLg)DcAA-R>#4)27OeDqJv72W1
zfqU-z`ms;#FS7tOqR-3Vredt1YOg04Ldb5r*1gYI%O-5PaX;K>t?(9Mk+!82>fF>U
z=9liO$3){IE!BN9%zNC4*o7#P%-u{V>ea^><byKdE<A5Cv(?<})?5-)P0kZ(*^x||
zbs*kaUD(rsyN+qkh&UvFlXtZK5DY(AwijyS5xctUtJk@EBOXmoJPN#UyP6)pPvQ|J
zh#E?$(^7ew0YhH_P26J161fnMmpW9#76aO1vAh<+%z{VCJzLz+Dmy9)s{<H_)Uz&7
z%~#m{YSZ3)iQza5LpL(!eLR$x3I)qafptS+F7F^K_+w`<aE}2c2+f<M@~ht_cM{m<
z)9@$!Fcxni4R0li4+*z+yyJ~WtpJ}!jB6$jaZZ(==!;RRYqm8VjO{NV4Vz?tQX1ev
z9#wlvs;zwU^iIVZ9B-kM{)qhwlmZlOxUv34s$+eS_P`21e@PI-%TLV0;9RS^d^1Gx
zic5TA6~?KAqCpngWEWg^qP2ei&%_}(Wry<S)o3CtM%y76+0}-x_@7-I*y!7R=g5(A
zoWqxcMee0e4FH|vlJTJ#VkS=rx4{P%U4j~ALN70h!qXp7gLwIaN(m4(ECHtX?+iBn
zH)aQ5ntv4vQ~*~ZN&c(dDagnH0gl=33xg;i3{EFTfM4q)*B~H)#H6agQ#=}!$=5qe
z9Amv{WJ~ir0lz5>88BY=$-w)Zd-*WWAAA3;WHu}aCy~UwGN~^o@LgaPdzN{0<XC{M
zvIt#HPDh1ceHK`lu3j>?enLqr{A*f(b%+AJ{36#dWPb&1t>V0BN4vmH;T)<sYd@bE
zmHS$0hxvxmLjcFO+`SM!P@Q_uDZhav?2|Hq_T3T2$(9&}X3QDqa`o<OA@1)ignp%a
z-(L(k;)UNU3-PhcT;xQ3JQFNMcO`Q7j4B*l+Bml7&Qvox#4dI@WG62tUHV2|P9&#K
z@#xrdCb=hx!<t|oK&wA1It8vD;V-?2_qhY2dd!9qk$B7)fG7)lcf&P89Z)qy@#Z@5
zhj^GVod|pY^&Tv(WlS~0YY^<{=a8?UNxxsKFk0$Hy<#tsRs<1O?fx}t&@;^j{-yu<
zPwCD-iUt28-O=PjL3u-K+7v>E4o4$V2rdMhDU6)$u;6Zv>ey`GMC!Z|bKj@`6$&i_
z5>2M-xf92fy~+m4Iv_AX#_Tke?J?ywHE|J_qu2d)#6VXVhn>Y-e9&Bcymuy-sK5XR
znW}z{!Q^LQWM(8K>O4$j!&nQ3dOYC?4JYg;EGGR(YRgG-&rbx|qGd|!*(!+8y+{>w
zE7}s0PV!FUL^J(CS|_h8ujEE<*})nOnNVLLbgSw^dK<@%K|n=d2Kv7J3e9HA8VX87
z%b}}$6uVO!*)&E?yTxFsQpO;8v;9(ov_WNuUZ!p^E0lJk1pb1SzG*oa?-iO~tgOfy
zZ1E;!*k_LE2N#uyk~?ZVGwm`A#Wc@&%FW@NGM^JBSzJeaUdz$#O@+$V5leoYGKXqb
zFwa4@okIp;iX?uDl-V$XL$DgDab=Y8Or-5Nb(D$J0oGd=_bP1nsLP#^z;HN@XZ%e3
zx13na4MoO^UA7Ab<2Z6rq+2dg2U$?mH@&#H7gCW7wi>lbu_9Q7;zS&NqJFE4*8Gi7
zJcK@#9q^Q7I_Ka1&13IaZMYuG#AQN+I=K}IRAUuc{2q|4Fwcv#Rifc|Ck#H3gsaBs
z1$!kI^Bm$!IFr!_sdezCXM}f*5WFHrKn7S!^4~?_VKT|2?7)p6%q&cAK)7=|8l}3U
zuy=HVVNKA-a!Mh;eJ3?JAXM&*$}c$0-BNTGA2RFiXW$d`h!Z4k2nz>6)>$RXcr!SD
zjXvVOq2_KR#38}XL)y;~>qXfqf#ejv(lGuGj|AVelQrPzf`?9VFQQ2e5kqF-6QQ|+
z)*z}(>D)^D&17BNJLE<CU|VcD%y!Kjq1nE)dFO9@Cz4;;Aw*aE94#F__AkC;$o)6I
z178;7x+wY=-;pi<i|+`xqM5hFGXIV5kcABW$#>9xu1`Dxg5Lr_%PH``=R1Jl_m2*N
z^13ZDBLc7NoOL2tdh^Cul@=C(;7=m=AgD4)tg`P9d3hc&nKA1wO;+@}c1hd6AtS)~
z{NZ<)5gH^ykm!UgT}kn$pC&TvSRY#O^}2!FBSbMhTH%^;dF*TZYkfzCb*u`x*72_y
z_EoV%#{v}lScJ1lh#cfI<}|r*gMXt>_6mox-n0l$o=3kT3jf%^<xQOLH40U^!7J6U
z^hwXdHa9Z6i|SC6cugkMv%AB9lHo-`*@~CMDVh~Xh0mnQE9Y7}G>F4aTScD=gf;G~
zU7tOe*kybK-U0P4bvXldB;nDcUc3A$7527(Y=k$McO|Rob^d;wdE(J8e{r(#luz4`
zgnc_cJ^dWcY>ihkHYR>&OeEnUM`_dcLo@zm-mlCLMndUHSBj)N-a(YSDeyPqaiw<B
zeB4v20VOxWP4`iYTx|M#JH~MF4xdw8h)j&6Ha0{`IFlr?n~=+WSw=jSdOuhH={WwE
z{wcqc=_i=5l$6r<s%x}~N3Ad>ni=3Gey)t2cf;cYTaf;a!TQOq!VotuKj69;oG#?m
z;iD4Ue21V)HDqjc&l}7$WEsa+QKK{j6y=|8+9et$l6|lnBon(!!EF)i!77>^!b_VV
zkM{h%XpxC2W#Y4(k(9lJ+GI3{$rjp#l?b|ooszAR>LSPF{~D9gszZJg0<4M~5SPXO
z_f{q5Xy;-tY-ej^;QSBkQjn4A2Q)#}i(_IIE$<HmzagR?2jZa$z7h&u<v;LAs5K;*
z?vhb{(8>jZ`}y-rW)|a7<Fx}b--fc=nWoiT#>sslU%8VXOpkIww?W4ST8Hl%7ySG&
zWKmgNMLAb3AvE@i90@%ZgKEA-5>daIS}$z^bnldP^d^++Y0rUT-)>q6qS-CGV8aNP
z)Kx-oAi2&?I4E528R-`#i)XQP^gP5TAxRmX6m4!SO-0PBz{Db9>p~EP$ScXZE;+S*
zCH=F5IRti5K(46eV4iE5C67#-22z`rPeuO`K0YN?OGW(w5+o{a?e&mpMKC0e(<d;-
zZ=}2TsBu8x{0R4VOd=G|1D=>ZK3r_wgocyo?^Z=eRjF;EuDzQojK!Hhk&S8VQ#Cu_
z3GxBX{qIOA{|#jGPq~JYj1n@S|B+ZjYM}HjIQKU=m1MU*(6YR;0f7o7G7kcWdxrIp
zaoMD-WkcR?{@OQM?N0OK-kY<y`m<G%@Y>8qOS6u{d*>c@US>e);uTIGd>;i#Jx2_M
z9;Q>8Fcub=6W9Y7G<2Z)eC-2Im?eeZBwR&BjERQDS`9De#pbx_+d83fT^`L&Ce-xO
zEhWaS*TeU(Fs7R>MQSbs_7sq1tVzrJh6I(_d%Uv>mOs57+V+wieWq=16~sDs9cf;G
z#cmRV-q&^p>>?fJUE^fE*EP1Q88DJh;Dg1qd6qel4+FXT&TlOJEfIg%v*?+Hm)e9{
zyK9TNIp5Xjbm339qS-W=%iOG7wD;j#d>F?Pyg2X2RA_EBP8aU0*%AeWI%qszh)Roz
zZywmVh%Wf2>FRlOeCrgGC^KTcq0wk)8d|*SwmAlJ4i2l{FU>o5&6v9^iw($+Kw^`b
zAha-q&NQH`WGB1OkKg&_mq{Re@B(I5x$HV}b+QeKIj3{1M?2<ZXV*rM=d)zS*X2Mv
zSc~I0^~^PP7mDP34C|?<fVCi>UIY*V9rF_CJ^?*MWI`Mq=Ro{~XE?T#MI(rTxf{<j
zou|s$Aj3|gPVOc`>J^dykoy=W@UxNR99`bhp#i<-q)9{<On#$Tm9XAtY8G1Q5}1FV
z;JfBajr5g4b(wU|djg(CikS}A369M9UP3=QuRhpMkUd0vLliDUiMK1EzkHQs4Ksa(
z1HD&d?&3}K_Q(D8x4?@EYk+AQLH=ij>i+_~|77c`>$cb`C_2M<UWR6K8|j5@UyF)3
zT*KQ~C}rkYEa(<`W#<kQk)R9>;ipj6cVmb1s@y~91p-xn0t-PQDFac;P*V|RyZ=f=
zAXvhbvq@~1`!<@n+Az&$RpWy_jsN*}L+!tOt&5SBs?3NQ3ZF>m{L3(PyfHG3Ay9F*
zpR{d45-`p?Ic_32sAbUj%Mk<p`}cGa%C0J+GNK|w2?BXX=iy&IO^(Y*A%O<d2X~62
z6p%DzwA|RsQJ$W|;g?64<5WX#6FS7%%}}v6w!`C?@BOU~Qq$;8fZ~U(sX;eU$(<Hu
zLv7<lYTG>BU9f=t98Ky}l0QFFBVq1@CZ?ygOggAhS>`s|8LJPsn2gw*<Y^o1Cfj~P
ziE<`xVzAI{hi7#qcQS3QGs<4MJ4vay?}Xrzy{V){!FNVzjg<3be@m{=^WswOgwDLn
zlj}^~k)gv%rehhB!ic^~&V&!8bs!l`Y6S{RwntEzkim+d)V<&D(reHTsNNMM>Dv_=
zyn$@)GT5P=yYkm#%ML*j;`0hBpw}b!r?Fj=>xziPz-g3_$`jC!2(8$eP1Xj?thIWw
zSft<}CuZyyC*~@x=NXn(>~D@cWf>cGR4S}XVnCH#CH#y;iEj!+b??g5(OSKg87+_c
zAYsX8$!Avm>qAksXA$~qL(v^1`&B9}0jYA_lLG@5yR~=+*V=NWuRYYwA*+aQ4DB;+
zdH)<+H2s|84F;awX@GSObq1@i=u<A-rUu%X)f!Az;s0aqouUNYwrt%DJHxhZ+qP}n
zRt7R`+qP}n8MZQP^Tt|x*FAf$s@6`OQ|sZp{;#u*e~kXkKKdN(Fl<pAUb%8aGRF{y
zCWd7!Hg_DFTn8OhmYJ}3YhJN=I4A8#iCk#4=UxDq)OgOb85d7qj<YSuvEHxL7GO@M
zTBBzu9h1xO1r<4Wr<`qROpN!!brslVqs_KL>6%s^I}@sU_tFT=MzG_0fg^Z9>zWL{
z;{AcZ&9`lO;*GYw3h?ukY>B>M7mI#t0FPPYfGVpWssm~z6vw8o|2*_U&^-cA{}-59
zoNOTMubk*vxRl*@f;M_>ur)<LWu%pMnm6+fB|p^5c!`*{*M0P?xDp_t$oHt~!j9-7
zASPHZDCL#>Zu&yW*5HI@MMgOWuo)l0aW}mynim5wwnfbCn*-Hfj-E%QW!+Cj-!EvU
zJlJY`Ma?Q5U$9>=I4s(W-o*NRSdZ+3pbFr65PWfyXQ%kN11K{V%GG)(%IiQgfz*dc
zF6^*)_n3U5PM-j9TOiWnx~oENC&mZZ0%YNPV>I($b*KxF4yx>ky8u!yzI6>#HryHp
zuKEOi^pVq3e+;F_!DpjLtTj9#zTqC!s$Qq5jv-wl6t$zwaF-asmMOg{!B8A&oG&QX
ztPwKqHf2_Os~ju^_F%G|^7~53@R;|E4H9Ws;o_srA!ZlIp{QSQ32=Ir&_pM8Q8{#u
zb~T_E?zLT3<)}E)_i1J1Ov67GpcS!=sXVk2sdPMfzz(B6AO#D)70?W93gX|S|HcXy
zba`~w*{fpe)k!k)erxA*xIsJt%u*Hj{L8jvxMivr`4`6${z|m{A51|10u}y=v6Rnk
z6_wFGGrKk(s;mqv^!!eK9JhTh1xl@(+(E;Oq%6V2yT7mMY(Q;WJA6o@>we<T^O=Pm
zq7^~VkJU3U=s_Jyk<Yu~M|_8RQw+Ivy|8Zo&M6&O$T{tpb=&clmF#$7>$~v+&|@<}
zAyx|Krqf#hSzTa`d}^;WNZ&CNgnAvPRfHr}E@!Vrqp0AJ2Sa*Yx1OaiOMK1%sUv??
z#&<NsUa9u$ERGmXCbh^$8mUo2xSGsOx`)@;vWcdKrg{yraHY<qHEq<*<R-#JMiPaY
zF}osj0_i^1M^&pa(!gO*PpF4ger)z@^Fsebw7j0NjhPw-xQ#?oz0ysf9=f#}!drb6
z(n@PwQ>@&`nb9=G9UvBeO3T^BrApt9mcqS;Voa+Q$6Rt8E`NTW(2(gij=@s}-;<>>
z5gQYv8nGuKmX*2PVs`N9=<dpHyr(L?)-9;P+4#h-k1?oJ-I2ULPqT@SV6%YRT<I)!
zG1T})wnj~u0^DiSvafDr6G6X5s|>A4o<Y4bfo~sYk|SjHQ>vg49~Degv(RY8gQil`
zeQ3Q|1d{Sqs;re{FR5_ymz`$=f9n~EmVipVbo_7ngV<3*S#<|v2S0^hG^*#nLYZIJ
z)+H|dRj)v+!})eT4>lEAL=5g3z(io?Ylhjh?Tfu#dP~5uGN{y_3se^CK)BUs910cK
z)#;{=N3BL1s0~TfC(VoFU)-&mv=-urVWV()_HWG3y;kX>aZIT=REs5)Da&zLGIne#
zn<V`W2VP;9$5HSiAh5jXE|w=QRJg(S@P&{&(1U`|o2VkF9Q2RasB3bCONk4-&_K{S
zD)lpZ)y?IR*z7*>+~-yBaMIcHP1)UHbKE<nX|H3!wyPeTT$?>yfLB#vX)a~zb!tZ{
zo^+HjXy+5E)K~m_M!GwtY?wtfx>w)5+&eUAhKJd*UW#KKktv2W+Z<_2PQ!}}CdOr9
zX9rJR%T`4A?6*KM?gy6u>nOs!J~Yf8OzB2w%~AKWVfoo@uk;vQZwqwhjt*sKx`6F1
z0J83s8>o9F7n3*LC+S!E8pDy?qEBCsdV-e&qiYl6fQj`vLU;5z@+rCeaY&Vr)@m}d
z@1zsji#f^8bF0d?H0QU1L@_lK9?%N$-8e1;<Y$7}nsM$BT>*iZ*OSqd`i{k+t&p2{
zx<h-U$qZ}5!H2ALZK&We)r4lkPhp-gcU;d_?Ndu97}tLTdlbg{12-W?bM|=Fj04cH
zr$z*;Xp8Ma^lk1u-7MUJVZJbcKyZDtM@<)kxwpyN3?S8B*p(fUC+}7q<~oz4A|RsM
z$F(!>;Vf_q_Lr2C9YmR7!3qnq!NW=)7KJC??S^JXhyXsc=f60tRlVR5q&krlN)WEN
z)x$$@IwDy8N+-QA3n4{C+5xxC`LaD;A^@t^Ze2HeiN82xEJU78X_1cGiVAYI!fDw%
zI6qr4H87xq&5Yh-D88l{zXX-~jLY~8*HP6pgrf~pWA!3Hdqp>25pMQccZ6qO0r(85
zKK)#;mcf*1AMD-Kz`W24$*^N<hKchWGY^pPm31ugk{R$$ct(GegpP-zdqG9RFP=Gp
z6d4`ec}AMni->Q`pLIaDtbw<TsnzQ4<Z`}$RMc&9#!N5{Qv#X3Aq56g1Q#D-|L&b>
z?Tu-H&D-Pa!*C=qb&R8s`h&Oj$JA+*e?`j4{tajT+l~>AOQIckFm~qejC*i`g0<2{
zzoq6i)ZOjM-`hc0)^w!RlAoaeE=Uhz8TuK&1gY(p8u}k#j;||){}7_%GJs#<KtKIr
z?UAsRc=%9H`b0L!>G8pb0$bpWb%VwhpcT)!3?g_B{&@>@`GZESrkAAi_xhS6?(zG}
z#}lxfe_H?n7bY{NCQB3g$z$@Q%&U3?X|9z?z6(j8645GM%aS~iG$oQfl_4f4(r6>m
z2{Az|^jGwil0&cDI`0vq8Z~!;Sz|eYdwdkw+<GX#7<_fPH~hm-`PNU&hB&@K=*TJ-
zLK>2<_)FLUxl;&OC6d<a{DBex+2CEj@f9yGpG8N|;$p?U@u)G$#l!4q9AOeIrER(l
z>aCypuKYKOV&rq#C+IfGwzbW6V5*-~FU?d~`F!Eiu!JJM0xc4o<oJYC*j)jg@Z>$j
zr*MF@HCJKG1{YF+&Pv5;k)<0#sWXHov()nH>2@HAxc`128#CMZzrGG*{eNijGPn7|
zq#|VNY+z;l*Wo|@&!;JdKj&@Qo!XIrK|n6^nxN_&u0hQV$cQi?NytRDiq4m5E(&Qh
z^Jslwc>!p-%n0<DuEoU%1|+4tu6n_EL$0No$xHA-4bmo39S`qZKYmY~4NWIk13(){
z2_h6Tn~9N{jgAb?<m9Oe;!c$uB}b7XPL-i4{b};DcSsuVse`sR?JEV29%+@FN;%Nz
zMGi9KU=%gq(~+lZkW#Nosp4?o+1htcC$Lu3@mq6TZ!XY<rQl!D$YB&P?WFigg3_ve
z;P;#HXh}=;X^%x$4K9I^@!**_Uezs^YJ!2*LuZgmGrW(=&1PHEy<TJgQR+H|6InB*
zxlWaow2!;i^W!jD!Ja&WzY%cf_QwKradUYs-di3T6ZJGSeX}=Z40eQx8Q_E=+VV9o
zdYUPaPD}P);tQRIc(7qB(E;cqQ1}|O;PFIMuah4%z33CM0SSf_sP$s`3kbdWsKI%+
zY>96ctpm2_Oi{ih&7|0|8Gm_{BU|;Ze^hwOfZl~%q)^n+(GWlKg2P4kCVr&Um+#{u
zX(IKlgKWn6!CG<GPg+kNyfi(~OdPneS@M#jQGEP5M@nCm=UI&-J#koqgHrct<ifLu
zek3}2lZD_eAAKGbJ=Tx}vzTo%d(q`aWR#%}!VqRcO?Y3O+L^YJT*%xT2*h?a(D{r|
z7B+tND-}7lqo3o5!FasBAXbBaK~5#-Ez~0zjYVwbmAb?TNZ3_Xl)I<{e5GsC*(l<-
zTvpD;k+!gue?G8|Ksx_*Nkk`KM&B62fv9C;TZ!N{(CCNQovhHC)EO<^RijEXq0!Rm
z*akjU(U_=NHQroi-n)F7UWsv<fr6=NiJ-BhSf{^-Xp*kr{JhAD8}<2G(jvzxF_D@u
zk-PSwJqO$Dkz;`i=bh?`mO(3hCEUGt_^O+rbw^cFu4yxd@0+9{FsY@6j6Sb=Bmy(E
zoW<^6va(c2RS;yq6!i3$TgB{e;cWlvTKO0J_LtIXRFYQ2Qbyvo?Mh2!f)1R3L!iSo
zFhUbc7wWw#<ipDW3Ki%f^5Hh&5R5PKJUh;fu>AblZnfSbR+gYvL~$_oleh0D=civH
zMdWX1S&{(gwi>Mo&+5ze$92cc)?1&C+hEdfG`sOgZFGhD96!N};*JJ1Pwtw6Qjri+
z6vitw4ilckG!7NjdX$|tl*Z$NT$zxVs4UgsLlO&UK^T5ow^Bx?*{%#l{2uiLDL;_~
zCsM7fBYaOYh1}P8mI@Nl<S;h;^+;9BML*dxP;j93(ZJ_K*E~h#XcG#4?0=9#*_bIk
zuNj}tLM;kqBfok&SHIx8fp*(mQPxw6scM39$eYZGyw-8j+BlYRid4wkO?jSnr!C8;
z>wq<%UT@rJ!H}b5*I9>8S=VBWp;@=zhHN;#${d+@8nap?PdUmNgie4_ISckdOl}!o
zItR=&&dPlpZ|idP&`Yy#GdtKYi9LU<oC$^nPQ-1?m(woHL~A|2Q>cazqQ)gumJZkn
z#${@mq|Ja#y#v`${Zi-Vs;=TM1Im!SX9n&RcN!&3Jb5~IOKlEy2i=5}Aoje-8Z3sr
zMc>KLZs%1)oN*%P(mnP_F`bBGgFep?+b7Oa2FLc0vH#HoA&J*`ve=Q=s#l6cstNV=
zZ#g;YwTGPX4~VkCnzxtUc*L4>!d|-0ih8tHAEsDzTJLEKg2Be!E{f7@soWFv8aVvq
z*6AMxO#l3>xsdd>c3h)!KnjK(eNCOTGNG5c2gKKDkRIj6r$nA7hlqe~LVNH{8E4so
zk@kW%3jLO9@*_q$$Hu>zk%hFr*`4B0Ypp2wfWuF*=oifAYi-$SZOg^sjNQk=tzu&r
zT|jflXOY>;9dc;lIzI+r{iil^#!me@$j9+>KpNo%dKSURjWLSFQOm;|L%7-YJo_EE
z!4W+HD=z*EwgD{AC88GR5r7t7M6@qPvR$062U-A6#x9Lrn;+|#_C_O&cla{SF0I|d
zJginw&^1X!6X+0nMy27KfNbQe(g$e-uZVv_!@=9wNc^mt6H`{Gslj6`_UHg@6lstr
zqNmx~69Y|`&qKNddq?m-jsj+_EjSzuLO0S6>Y5+rQf(tlk|Xx4S2QFwSTX7a)9N2O
zL*FnL_!+9VvG$_#aH|bsYS9f2+`<t1d*E*P*%h5%PM9XZzmrbChwvEk3G}<e1O+X%
zK&rfQdSvf{XVTOp$L6TyZ&tSueCEtn2<d5AS}1j7k$(j@T37Eya*SYKgh{8t%p=SB
zz>4;v45Tr=CFmlYlY!Q(&=tt@rGb>Nqgxj!C0TV6$%uP$KcY$7DvZcgRJ#@S3@dG3
z<}Te+2o=Bd!JBnQZ+37zoagR|765&{5^}0qC&gnn;^Z$3w1%`p!??EJ*#E}Dc>wAq
zJ6|2%7oi9|=v6VrHEByMI-)4L59?HWuuD!zfTSj$wGUSUvF|7}`x^W9mqeO-d<m<+
znOZaaQ#}8d)SB);s5RTaQ|s)SH~jw%wa$q+|1YU^)EBj;`3tp<`kScrd*^EJKd3dV
z>=(7BRYilAya($eq{*_(2zBhm-f?kf=4%X*U`wlJ6k4GClUk2+yew61!kTb~H)%AG
zxLdhu??f~CGKj)?x7`dWQ;VK6nm;F|i{<EdRC>A+77-TM+%|C#TJ%U&S9f3&Y!((Q
z)MB_NRjg?2S-5RAIRS9=0WCQ!%(%2mA3rJ$^UVrDq>~)RFx7%g)gUcmBD&U$)O_;F
z#Na!62QV&LavHh4Uj{-SSKU>k9JDq!YsJj)oHAx_wj>#-eP{peg}wHFMXiy$|NGP$
z=TB-a_;*li-wH+UKkU_6Uz4l+|9#K>np*#uT%~OP^J+r;xGbVP(opxB45t&vA9Jhj
zNWTl8$Tf_a033p4_}AR(+I+D3eJcO^eAqQSF1`Iu-%rd>Ko+V$9M&yPJ72G>HCsa>
zzyzj<U+hh!ZarptUc9~C-*f(ekFX5cYXOBBMS~${N5fEpwm|EHK<;-i91D!65K+od
zu@B<6ult%>=b+_H$H+mV%^Z_i_L3|?lF|D{7qAMYJ#PJTi49u~iq7@}edB*yg*6y=
za5evKu^sj1E&&#{jLl7)L7%3UQf|>c{9|fW{c>2xv*>CeCB#m;`Sk7+XVF5|kNoDY
zHbSWkqr3gXVLe4Ped}4IwktD=a#%@;IIq~q9Q)cdH!P->W+GR~_r!%q3N+eHrj|v{
z+i|YxcgLBO=aD)*(;lD|bhz^4_ob<Osg4z;IeU_%1M8pO>Lb}AmmhgabV67=s38cJ
za%c_e)n~s48;GyZeO0(SdMLGqaT<<~;shBHI5cL;_Nb;x*{b&dX&COT%4zZQIoj;y
zKoXtLd0qU!3$HR+U!Y8!wt~bK7eT)ZZu?2A#a7d^g75f)mo-P*m>2{k9q`BYMRhd9
z8gWqTQjRus<mb0nf8&nnhTV`2Jj!J(^raw*PCO(VJnk^JD&9c*(`)eEqyqo0G_5Up
z2)K_)C1)EyttQaBVcv4|s0Sb|^-LYxDYWjOy#-!Vcv(&**GiaM1Xy-tWnQ|(4M;e%
zJU>fbC*&}<7gr2DJJ*Y^oxUX~1UML?BZ4JhS0DY=pX_HM@))e@*SspQ&A#Hpf0r!e
zC49C-cU4rjCU3apd~gEyJ^lv%%WdFHYUZ6FQ*&q>WH4uHIvG5c6#E2vAC;ghIzJp_
z)sXh!qG+D!4Dscz4l7p1%17;t<Pa_2$dxs=ul>3+{UvwF(i+LKE^=yv{^D|a{qBMz
zcvS6y-s6+TEU{o0reOG2gJ$V%$FGsE+9~|1o&U*d!auw^B38Eg|7kTLe(jI--Jx#t
zWSWg62SvH7Fz|E#ie2oP0X%7@AAf2-V{6prjW#QYt|#B891;tE3<PFKG;y!HUf!`M
z={2Md5P=C2N0X_n=giBA)aQ@Aan*1AXpC@z-2gzNWWfm-Qa~J_(9pYqQTUBDN<HG(
z{Two<Y=k+;Mf-6-#Gr&A5q~ONg@k)*MX(E#wot*z6q%s2TuiP2PR7g3X3?NBH2F%J
zXnCxXbXGW-7G9P(PnLqXoAT!*9yOef>KVwl8Kb*(@x5u(n!WB-X{ptqGjJb2vr8r!
zvDj+5v;TG{ro5zDRavA2?TW7*bG+#?rX3X+ZLC~Vtu7!fbIkMn?1^S*jHAxs^gAE_
zq45&$E575VV*DF>3nF2E7D*B%i|<m#A0*8LFx8umShCSyGo^AHDfO!84oI{O(0g=2
zM(r$|Pq6pFaX{!44Zv`S@>QMloXqF7IuUkqTo_%Ha&`AeK@btao++O=s-CTQ&kGCH
zPCm!f6CjKML6r6Tk)Hl%BQ_T*@q&Xfz3Ilt#G1N`$rUa-PTT(K3WZZV+|&yH0m6+x
z35&TR!^ph0oNF_Fs!aKo218}l$;gFdk9>*hfMysRZx%(iiWoMPb)Zic+ptTOh&hP$
zf>toy65NX%#6~~8AKh^A%oTK`TU4jkV0rIRkB>%fn+lNjF3mly$&d;>OQW(`cfL*z
z0Bz~nFBGMUD1%drlHrK&v){ja368x1pF^xp`Y|(+*`;h7H>BPLCtflVI9XCgz^Xbp
z*9z#R*6GH71PwGBRrd_EOKe6_v?r-fekEuy;vQ-VmT>ky!)NI!k294@n3^gF(<H?%
z|L{>Q)D_)(^;i`nKHE#0<FF)0QWGU}(H4dE#VTK7S2+IOT(+A>c|Tv4x7b_~x$sNG
z!s2<rwcOs2GZG)G+U&TWM<c)wRED1E1@Yf$2{v3FLf=>2B!AV7-2b@eh}-<<s`+22
z<W7KJPVUMde3fQ|ViVwNzf_@b(vCidJ~gRP`|M)TSn_W0a))&LCz$k?zm4Z*JDB04
z_^aDUBO0)zGCDp?T)s_bq-J+_`GC=dg|Z>Epon4}qXkD`M6fwQ57I}Ra)>PSDFLuL
zMVz!0?lt{5oLH&)F#;)+NnVWAv1gDO4zbo4APTtSWRj>KzZpb4!(!RK=ruU1{!urP
zcyyd{e^b{$;jLMDDnm`Ssu@ATaSE!qR;Lu}_Zy0+zRL1Bp18Hqm|o9i@fNCxYTRp<
zA>5(=cFbgUvuUE^H)vNTBPHfdqIqYcOsi2Ur7VuD?5{&$Sge3(#>zOVoHf6L@gE$Z
z^S<JGo+OXlx95DJ*wZy?I}mrQu*hcGkVYT(8B|M9)}|!GE~!@XFMiaLfI!DmjtK4e
zaQ0N)d}0*<@FBGB$h@}FYhjzurh|m(4w@wPkl5mPCAr9@M5{Ad1K8`o>}f+^-SB3X
zs5%WIh-brZIn)*=J}m)dg+iGm=crSlTBRwceV{)BK8u9P607lHA$I(cm8!9;jZR`%
zNCOBqqS`8Bomk3!ERQSNG7L#+za{5x!q}1syt(ChN#Q$}G|0cWuVQhwViY4c(C>Zg
zTNXW1WWHk>M(V>t&C!Rab_-+FBDU;3>SNc)<xm0R-v6o?{lG^!mIiLKVfpb8y`N($
z(K!e(NSLakou*D9E^bF)3`3@7xHIO^=Y)AZ1(uvgbVsn1Tc|?Ei!SK6Rfi|~KYK8Y
ztK&a122hQ<0|C@0*OU#HZp&6cw2TfVGc3XHi`XvrAQl@Y;>(;|mYj|Zz)EdrmXuzA
z8oaCuB31e;2n(H;6v=9$<P1bad409a7uZ$K|EL#(_Q?6N+%rOpe3-`BHVvN9cnhaA
zb7=jVvIB~#xgm74Vd<bDpTBGX$AEzn{vCbhA2Zs2-Dmz;G4)>yq%P#FARyHtGyklZ
zIUyi3e}g;gwutjxos&A^m)*xJe(42ZW+DqD`nazJQrtgR;Qpo0e5Nwqz7Ho?e-ojr
zA`Ae)=5R2_xN)F!Hv8HE3pB(CI_Qf77-o(zV=LW33OJGG<N>0jE9elfM&K?O#MX%5
z>_XHw65GFVL}G!TfC@??A4X+Vt|sfl>^O=m_NJ(5<s23itvZ!qrr->Rwhp9e0f;6^
zZqYoDhhsof_0kPQr1_fAkQf+|1Bt^`TeMMBQm4WA2Prru`|5yrAv5!oGk?2MU;pq#
zR#juA)uoVHJPf>e1>zO?!xK6gDl|ek#}M->2}xSBo7pTt8W%Twqr?D(_R@qnF~fyv
z|GLRky#QrXXqC06*muemfpx)f_xol|UtSk2E(=^TP@I(x`o&5P##-KvsC#c|EEx3(
zfpsRx1!%*;Ok|M{N&_Zi2pz^0CZpdyXtNSTY^=l&=WL$u=lhb&OsiOK()fFvb7c#j
z)yc#RFT=WPDCHT3Q$P$^C|Osa6MsEsgE>y(==o@S`UEoMO>-LMNI6H_ErK1|4e~7Y
zPXXLa3T|p)9I6$h=i>qMvz%XO>LM)GReq`2IUUi0uCpn^{zIg4*GSTCK^qMW7QIIU
zI(1SmmEYU=+V{I7Qq_g~lvHy+qCBFAbYkj~)8$e@!!*8tVxHO0vzcwy{mE!K-28s5
zx4;5mQt*xlm-sh`FFz<!>o4Rm*@4!!(yumS)J^bWNzjG=04dcGdF!&ceBa^FgbPM?
z%b!**=TXPRD@3X`&mgE&T3=oyXgEGnCD=|qU!8lbO=wN={k5jFp~?diVll_4%HpUH
zR}{lJYG@<3vaaP$Vz0rXq*2;J5vH-UO~YfC^H)n(q6mnnA@%#Rph__xHYpj?`(OIZ
zs(0VO;@?2!|1qEapHg{&f1~m)r!JCL|JrKELsQ&BLt%NIw!!GLj{me&OU1K0PGx*d
zeP*O?yglDA|KN`>5812uL*P)8w|)tnJBy=!cI5XtdgDHK<N%5wC1^%6nIKyF8YaPo
zRHv~ILWhf3Mw3LUPYgk})U@IT2v%A&jiSo9lu{1$osC`fFDyTA6Sa1`UjGM{_n9r5
zLC>AHNfd|=U$1i5x<|V^EsVP&vC=68A4|=2>GU5R=Tb#7iT$nJY=u%Otbe#3cd4#k
zpZ5Dvx^^J5j(TG4__}mdPxdUU`#E1p3({1MTGwrzl(~O2(?TgQ88x8jAPwL!%JjJ|
zU=~UGM=LZ?`v<AuZN{`CEJKGDt_bC^T?K-8>nQiuE=#{0L!%{~HlE-BypCT=EoeM{
zx?{PHjFftv+Uq6aiSBl|)-cZ>I!E%KItP%ZtfgR&p^7f0U5PN3@Jr{Yfiz}3SG$T4
z#@9)wCqo`ux$y5FalE>aUC)rzjDRdq3$)<9t*#ZiW-jL$x9Aqs5N3N3rOJ#O5~n87
zdKq1EZv*cSi`ru1zsyD)heMCj$44q;9Lid9y5}3AucXxeDRU~4+Oijc`x?6Z{F}}C
zK)=TngMG;y{5q*keQc-1x&!rfu?hk!vU0f(p-w^I^n$Z@7Nriqy!P^Z97UZFE8R;_
zg&{{W*%_O=Dtv-oeL&j$$s9MeAssvkedC`u{f!#h3;0)lM!rNZqy#=B52$EQ8s%&7
zHK)!87vNC}2Zjx*a3?Y|A8gWeGYo?CrS*(6xpl@wI{clK<JI}c=J{3)Ne|YVmYMem
z2o(fK-MPC&*cldg><aDJFXdJ=O>-$KKVG_|ts8P#G*riD>Qy6oKMLss=b5QZ?P4lK
zLNP$em>>Q^uzW}jKK>3w@Q*V3e-ROQh58ku=S?LjLgLIG(pV2sFG12U2F91NOQhc&
zf(S|uISfjzj6!^pOs^oBkGr|sOxth8y?V$&ge?*XQsy#dX{MCfwNC&|o6c0XRb7!<
zceRj_5vE_i`HxC+>3&tvw5QG}l?Jl@Avfz)Wrljz??<Vx3W}qi&{QQUDz>mBc(g8z
zj;&#u%+c_>a_5r)jq{bS;ZpQ*TWbb9cBK-&XNb;q1#5+#tOuChn!Zu!TvnNLpi17i
zvE!|2nViX%I{zk2tslbHM+-;1m_uh)ud^~8ZZ5gI_F3WK=BL^c#ceh}OB`%M<kXO(
zJglCfXs<K;HOm~WsOSoaI6Lj-KvJBp`knz}hc%e5u2ZKhIYJT0N~6C7_W-5W5a=4(
z!1aN^E7{_1Obmchj|UL^#CA6)9Cgy@Qi(Hn7Zr8Y1@OT0#cj(5pA|Hf_*4?Zp`4Tt
zn|GgImT9L?L!YEK#GX(S{F>UFX(J&5%`5nN1)20AUd@V^;=Zo5m>5^;IgSvF$6f6S
zo5E^xD+E@;JR-oeV(N?2<*q@)IKDbieh1v7U;u9n1CQYI4;Mo_7<h>9h;HcCpo9Gk
z4<Pc8l#~g$RTj?jKYJYO-u;hgL!ZKytM#{~m7DX2s!qp#;bV~;5H_g%Q9CbWnR=7s
z(1QijQ)#e?)P&bCC)niOQAN=(D`w2+=S6cIr+?OtY&n|<tp}!aluEl`_Smig<sXyV
zrnLj6bxY*p2E*O?;@;~ySI~^^J;RT8Cd-799i+nlG`b~`g4;g+4GRh8Hvhb-|Nj>f
zsKkM!HEj+SsbOO9^x!-Ae<~42Dv7<$%yfuM4V`10uZAyS-Z#4m?E1lO3qC;)SSjSf
zpyD&P$3>fZnM^HXyskyl>H>7|m4^{%foXof;9NFQ<BTb@jZ`Z3M8qjtx9FE!Q<X?@
z8HpDn`i{Y<AzGX-_m;=*)>9+#VN=pN*^0}sFuF0{n(-Znz9Pm|_n>rAJ;)~~a`HIY
zwIJ>RMzD4TgDzp!gNm$!G?7gzKbG#2U0Fc+zG1o_fR&+sI+GZ4Bv8M&c(QcZ@B*w4
z>PhW%8ERk1txKh0D>gamIh1tD4Jo*f^Wx0q(ZsRrH?v@4wc=iQ)r3OTG%fFI4{3(}
z9+|R=hyfm=_*lDy`FQ0**h2Q3<h!9*THhlF!F6N=BS&c>QnZ-qb4P|g5putR3*Npr
z&lwJGQ?Rr>N_x`k0yhE+Glji1o)*CvS>!tTuV17#B4zr>meAESKHs3%K1+G`7_qdp
z@~)eQH0ic%vBzroV3|IStb$y_3)9=-p>0vele)#BE}cF=4WK|gNb90!WjBQOIMiv2
zS!(W{7?(-Ykn9qNDDcTj0xw%c>Q^HIPh0ra1}4cJfttZj8(h*WI3X@}b^N6hifI*7
z^PrFv{RG`YYXsB^?Ne(++)}R~cOdn#6SMzaKRXP!B>f%w`5!;z|4KjquRx4d8W8@k
z2cd*!T=-S4F~6`A5wjOM544$7(HD36+7PpX*nHxB16jAr4gE1YAW8eR^5?o*gg48C
zZxa9Y8ss$BZ*E5upO2Rp>>e_bU9oKqEc*fjtT_w1y%~C>VY;eQG<hgVko6h*vc06J
zHNZ4Z%4QHD5xf(b)`M+qR7))>HeymYqOT1xRWz34$~8c1Mw!v9@)q?q-MAG3Zfr&!
z85GrmFY2^W3IbvpKPzM^(YugDrDC-G3@B2hRQEobx4zzhPRF?S>@JyPT&1T8(t6=;
zT)u6!qB2DZ(iPuGaeC8eY%AHU)lhjr8eLp$;ph(*)jOQ3B91bX+4Eq`haFJbO=QOv
z{+jhFlQR%itX^>i{E{9F*+2uz@Mxlvbo#T2G5$0m#ftyMk1!k%sMP5n0@|8a_8$V;
z1DV%Umh&G18q+}%gYD;cAZ`mY;W<9cRhac&ZR|(r^<f~tE|X(q!wEjxsZ%|9*-u?W
zHv)+Q<%Gfx=Io6c$Ft!p-Y47eg4I@xn;5vMHU9h6YY7rq^Lhrdne(zW3~^(Iljfwc
zl5uvI8$dhslZ`7Af`Mq#v0)^IU<u5^?BdDV_;Py<eoRJKyb13>9%VsUM=62{dmIdu
z8Glo@{zQ2w5!9(!$X~S_NBFL#z7UKZ!8N7l{G3r50>I-+O%A|`W;x3(94FlQf(`wG
zyfG}ig0DYh9L;dyK}fXM_=ClL)t<ZXB$}JXpX``bVJU%C$UP(PP>H&TUXi6nEH<k!
z+Ot$AX|@R%b$7pj6YHouu53+_ag<VFeS480DyNuFKv&ruZq9)rp>+d>2`+hMdzOHo
zb0C7O^WE)mqDZFUuM4|!3Yw-hwn~gK_B@tVIn2uPgLtk|k>=DPIM8y9s$DAU3_tsI
zBeS!=IP*PV^_l;U0H%NF=f5R@>1mhp4|`6G*=|1U2QsrfVs+>c<A(Gg6k4RY!_>qX
z`_}W6tI5mT>0a@-AbXZjq$rZufm-0y3MO`BjjwEyLiCu!7HBjjdx622Fn!R$EEepb
z3-Tq(<Bq<_x|wmWUT4a-;}b`N6<ouRwLO<XU41=PbH^zssZT&bru1~uAO7ulXd1;*
zqFR+J!aI}KdBUXDidDUp?iZ;JxHuF(D;0CB6db3XMT!`@yDE{dRl_k^PFOq0R63QA
zVX4nBg>|h&PW{egjU`JcPt@8}j_n2z?=J`8h<m+hHLt#d#VknT8T`!NJNDVjcP6f8
zoT8l!$+_-}zlYAvbs#iH<eIEGfbAkZyGE&lvaD>t@Gt-cCs1@k>kuQYg7zykZ=vb+
zC6*s>InlKCCAlKC%_I%8nI_G|^O4bms;nhPpW@9Uwv{l756UoKl{55HKB^VsK;trP
z?28Mgkx+l~mwmTV+>H|M=>wPRCOo6!-Y5^;Sw+4GWykPqlD;BFnA0R*06MT6a)@`L
zF84o*`0yNN1Ak6B5+D%ZMaaTY6S9!yk@?MpRCnHhNjN=1+8|A54HO%EKeJ(^2}k@2
z2&t0?QC$=4a*T(DeMB(5uP2yun?T%M$kY^7QdA-MjDEt*y~R3kc8*e_1QbSsEwE7-
z2Y*BOY>GCA!`~V#a}Hs^dXpX^c}3<|ofp6(e2Xb}sW_{*QcxkU)1Xt2O0nV^c%N0~
z#Yf7M2b-O<!}z}9RqA*IeiDeaUl!alXJsyp^<w1c%?5YVhG3Bh!B%R8Tt@YlBd=%y
zHPm#M3{gv*7baGgGiQhPojDD=GCD+oc{f|lQVfM}uxTcO)oe&G4LbNUO>0+npWvuE
zp3T2=KOQM1Q0C;zy8iFoM(+~&H_G>K-^OtMTV_K?V+SK+23ki(T17@FTT>z9uMg(>
zR^}eY{|uXJRD<wBT5|v7H7{Jjjs!tL$r;(MHNedDl%T8}(8s6IhoD0k!V_a?=Zr&9
zX;0r!hh~z_Tri&}kv1rn-ecmPD=0KyM-fyD6o-jQZ?m6cvfjPpnPcg^EJ{ufdfS?i
zh$s-&i@y=v^1R)A?0S9Oe0?~u<$dJ(Q6uJIFEfarp?YPG{EO|fpG4~_H!z&S#9rKo
zhqTuiBwIdrw<An<@Rp)#yX?)YH+*{RdXVD{gXi~Pb-5eFmzKJ9Q_p(w52M{>RrZ4D
zChkr4Ho9Izc#HC{CVMJFtuAT7)Qy+yrtG~Gzj{S@^8@xJ>%|qn8m1V156yTl1@<NB
zT_S#Rg!mBf?@rjpM0iW|?^c2D*%G^Irtr9Oh5XQQcmV_XBx81yk9#Er`8*r!e!8lD
zVx|4i9%Q}7h5S&onZ@}mGziD~)EvZ{^1SAan<{s>^(B5v0{$e0xz6*qEk_$_<N&}Y
zDp3$(Rt+4cwCKo70!0W5Gucu8DO-{u&d3tZp=`GMy}G10aR5@d;wzJKrCpH^cl59r
zP-IF>D&0@W#e^iC9C&V}v{cE2f-Wu-1I|opWO9G=iJnm(#uSqM`$|U-%Hym#Vd~@|
z;Cv~fNsiuOya`)OBW%IpcVHBFJY(C<3<lXlXvrM&{S^B?aHL8l4!vs72lOQ4Lq;LZ
zR3>o-;{wIvN&Hc1JP*RnP^+P_;$24uBm%!$@Ly>4H-mx1>Kf`(r5QcimDb<q8~C*|
zl?~j2>ckz?=;OvxJi{368F_66=C#bxMoA*>2tkQ=T-|34rHdf}F?I8~)~YOtVC38B
z6UEijj}sP~M1dF1NfAw^BxX_HVl=-~SZ`|h!MjU=27xOm4tC&2!?aYi(q>Plx>FjX
zfD)ab7_PMR05yvQ9jS0p*R(Nk`(aRWX5_xIG|(^?uC)o}DHfLH;ZdviUbsf6i|js?
z#8PTDO&vJ<5hfyagGAHPCLzWtP}G?@Sx<2CT@afEc|Fiua2l$G9!vCcP_SaXo59de
zU=)XNBG2~h62g@dc~M}prO2u)tc)=@vxTD*kQ7CF^)>(sVnqVa`_oHkqlk%DZ>pwF
zE-EIer~D?a*&MVcvqpjx6zO%CFH4n5Juc|sW`$2jmVz%=15m$F+df(*pVv?z1x+<=
z)+Fa*6rt#6v&t;lmJ?~5PI61vM~BKkZ3|FT3O{Qa%}9)$DxgGTWJfojO#Fq#Dmxah
ztd6tsiov{WD^qkJz_K^6qY#M|8anKyb{tk5tHkeckM>M{QBjstEj0|1c%)=gg>kMf
zWM(*QnXne?5{zW6$N2tAuOriu?3PHrOQXXRW@c10H*PqWFrgMM;Ig^UyHslov6&*4
zJg#l6pK&eKyF|MX|FdUvIg7<~BBCLfBZ>MD@h~bu$?-l8<0QElWHCbcdpe`Y5W{*p
zq9}8Lom9^2eF7YetzT})uQW3RTREY6k_(hmCM<wboFU*;nunQ11By(h+lCa9#c)rg
zT+3Y@BBoh_O|jGa(<Mp|1wkwb;q~Is*FhlYd72~`X)CzOFtd3fPDqT+Kvv2;LBvdl
zdVWM7_!%L&Kqhz*c|>?5bW&sqRBT5m9s69?XyGTUl)1<8xFS+Taqjp=r}o49dH+03
zMD934C4U`=7orgH8QR`&05=KhR!Hegwdz&YvGriYXbF1dz=95j=?U!zhai%OYlGjV
z&yd=g=&4+w8a5o%69jk%9K8GQ7O&7`AlEy!xQ*;iyb9=eA=|j}O>bWS0g&nQLgj0N
zW9g=be@uO@NWCQ390}FkdC}*Gbfztn;-DP)zE1-uJ!-brKUbAHsX|@v=soTm5g~+$
zyhIogARV8cii;p_?C8l>6jyVaV_dfYhZ@PsdNH7}PF|>yEnu>E7>elHav_DEl%|q@
znv-_Ec~P7o=IZIuCBp^=^s4#R6!Xq);6fi`^M7!bJ%YZ+)^yYENB_Jxi-w2(8DU2i
zn{md7$Q>J!8bTx(8mOOAYe$xQ86s>D(7BdZ+hx78nQJDr)$MJy-tG9>;(NKcu@*dZ
z-+}%+|8COWmxi&h*-j9)CAlf%OoEv8eh5K?Y7X-}Or&qN>O!ugnG)sV_?$sHKmANV
zpAl!#H8T1VD)6lrCDY(k+k<Q!kt%@f)5E|+QZ;<Djxdpxw4U@nr(4$cK(WTHu2PtG
z4s!?zYOW}0_Uu@R!y4G*jlc7LR?du%mo8_`&4XeU6?<k<YRRt_fZI2ZcR#x6m~(RA
zx}rhJl@8-%aeN<$Xp12KP9Mg6V#LlXIf(Q+q36i;zHfHGhkj_cGSog*6b$%sV)?WS
zgUMcwYmAdZQR>$I2%8~HabBgH25KE&8f%DNSjucSgsm);6#}LD3m$&cl|No2G{m-U
z1B#Yvcv)+1z-BLNS559qQt<_|?&g}~vfz}@<lhSq;p!i^WJjY_t8<~6Xh;l~e9=nO
zec4A;Yd#}12cRFO1iXi;Huno%^T0c&BjR%2$~)+Eq#iiIv^MKcg)T3uaHoR5I>YfY
zlG{Ey=Uz%m4sv~Nx&(DsvE~u13Vx3>H67iE+sE$s;YA7a)?McMYO76qU#Hz?>CGQ*
zL5rY2m`4ooQ}h9i5xPl(Fqx6jtKD^@fKc;FDOS4orW@$$K7(mbI4ZJIa%<@UF{9W)
zWD{$erZ3fyhsy4ETvSY5Hr<@C<E7D0B+5@<zKqmXqwp5;aZld~FU#2pqC?M;FlqH?
zuU?xPSjFU0ucnlEhwrLyFgY|jQdq2&XNKsbiq2gOO@3>XC5Ia5*(SwDcTRI8&jtFa
zDiSyqQ^gJr!eg2>L>sN{VHY=};cZ@io;SnNL(LkSG+&OIlKG@W$}14<1bzA1`iInK
zFP7{Y-_EPS1E@m1u}v-g5#%$HRa0YX!t)SC**5e5lM$j!ke}=Zht{o!m+Yiy=S$xQ
zEv(a*$F!>=Qc}q+SDQ#WHPV@|t1LB1o4%_y9jz{6VvK5A$<8t`7Dy@|VmhGAdlwkQ
zCYet_PjI`S(kOmpyh)haz<C=jh>e2Hn=c6;Ht51sS|el74IBO9swa{;Y?oNMY#U*J
zFP6vWx5<YoA89F9K%j}=pmc~TJ;b2&drh8xe5TN(+@QYgZa3g>J+2ORsXy8+XAHBM
zJ7uP=pX9Wx;^F|2jWr&=fJ`oRL-tV_=v)pI{KjP#=Z$5mTvxhxlt_)b&+Fn7ND+88
z2Oj@+vP;IM!WvTlfFED8sDsh-l~rzK2sw}S(~3KkINIi*Y+ZB32VVI+S>F4IfD2Xb
z_KYjj!{A#w`GQSF4+?5$qOr%Yb*%gAb+qgd!vVqcXuRKpatTlHsyZ3!HaFOoafy+_
zl8(PhA2#s<YXu&e{sHm2`p?0L$DQvH{n0=yh8y_mCRC>AM%Ma(BRiE`?%HVCXRl)@
z9wD4jX)rLv6))fx11k^e>&}ZT<IXnGHw*X+vkx}R)D%D&HTJ^T^2>Sb8e!G(>M+AE
zj%H_M`u8QKj;0Wn-<w>kJo7XXY90X7v~F25_$9l4GhFZ}H3{@{oNVqKZSA*)+^cFW
zKeimUS^894ZtkXa&oXyUGk5#?@PvcW=ml7oJMXBh^-Vwg%*KY|<<xd+({l8ERmYGS
z1Z~J~wc2D`T<twIeky}rM{dwzbY+=|J!f(u)#jLJ;o@V(aN38?bPc=kM2-9;n*5A+
z!_4Y^G(fQ(mVDyzf{EV4bOY-Ob)6I4qGd9WNOqpb%v2k$FFr<p0ogJnjmYSXbaL|!
z(U^1Rd@qKCCFhobD70M4Il&PAdRQ~q-<6^OB6mCV{uch%1TTxA@i49(7OZ51OUI9%
z`Q;gVWIt|gV&HNJfqL-htX|0BmAU#2)hF?Vtm2)*{`~y&Y!T_-R35NnqbuB1Id+92
z6A+O!=;$mG$pu2Or{vQfLU8@x&xEqMane~IO)$`&^v7Osf=)d{-+3;$M-eYgzkC1i
z;1O2zjw$9DK)Bi~xY9rCn=iORTzJfHB6sD-uaZ#w_~u+`D)NrEDgDwu)7OZ*d&Xui
zDL6h+d+0bvu=O~jqU@B*<eq4~`6}-jd1k)(Nbv-})^`~X>hoD%QeK9seC5Yh*_o46
zb4;}q$Jg5PYes2vkodw>yH%sUfatm8Aeuut=R~Heblv>`>je{3V%ZHn-RyFbFuC^U
zUO;{_{fIp!%!Xr${t-dby(b69@#HzSbQXfk-O?4Fck}$iGz~%wt}6Yy?wXQj--S;h
z$BFeF3wiQYOpcak$YueP2d#bqq$<5c^{temwc=#oFYfm)k}b)SWP@|!A#~5xeaEZp
zt1!l;EcO6T(H1sUmIrp@J^i5|!l4gSW2-$b38#i(mTCP8k~bn%-a&!~XvVALqnYrd
z9^$|&tW4ij_DA%y;ae&XRkN1zi3n7)`)LLuodUq@y->=6XJLhRF4?XK+C8WziY}vc
zu3>>^q}F$|fRW+w6FgaN85_!L`=EKI1NCt`k2H|BFfU#+)FTTAUsj$zXV~X#lOJ9o
zGHcUV?_`c0Ne);;vOC4-9;oB$R2wglKA^bu+UIPmy)SIFMrQZpGaP`~{<6!n_l}T0
zI=$#1-8tLwkUp_UT|dv?ypT_r9Sj$|{aJDQt+CC$!;<W2LZ=)Ol&M>VyLTaL%N#=7
z&C?AJ;zh@o#)FiX2L>_+m)+f`ndg=Fh_T#CIytoZ2*EJFPwmd$nsxXLGmj7qUM;4K
zTBgLF)>>!%!YF$je?rDwf<IEPZSs0drlVZsTE4^ncg?Ab<g1qRh0Xs+!s7VfJB0qC
zIll;vv*AB8Rx)EHWs&&dhRfYy+AFw_dr|XHxAjQ%I>b6sQev#OYl1wdrP$}k=eQS@
z46iiZ33j%@t_0!BOxH$MBtOf_POmRdcwL*j0k+{bAM{alOl#76F+n8V$!KEUVS@|w
zyxCY(MPwt&??Oh~PV)Ufm<CfNm1gqi=;|ULs8&cst~`5^4BvIvk_=u!M{Dk7da<;(
z8=WMU{3_U(9p~$IW+;~3=$;1XRm2!tgN3pnb%%PNnGM(S3_((-hb!OF^UqeKiX4&(
zs~PtLD2x3ft}qpfDI4+%Pr^{6D(m0G_b<=0!yBfxUlVhk-7shz3#v)7_11sVPVBG8
zMEXEZ*;@-=1m_k@qz@r6pR6WqvbHs;mOxz^S+^$H6XTpbh($*Qb;W51`R~z)^2qt=
zT3I!{Y+q=5eSXX8OY(pGoYz`?*W6L(8S>XhHQBEQSmaCS?@+sW8=$hMuT5T-)sny>
z;*rn_N{lKW)s_74*cI8`8z3dOWt1Oe9c!h|y72n(9Dpg;J&@vO>OkYvYwbq!TQBV!
z?Y6P%N7?vlE7-rEdyUkJ2pH_QZ^H!NzOnqb&t1XT)YwhY*g@z&!V%S>+>n->Kc;z@
z@_$`v#OV3;0D?*ihyh2`fPfkTMDc+~xS`Rp<Zr|>Jy^JMw_8RilG;e8x41KzW3}T;
zYqi6^)P>U{z+nicvB<2urZY*eHJEFC!G{6s^&)D3`Om3|DwT4O0ElcR$KRWtIFDQ(
zj#X|4mD8QTdZ}0u+uynr!r$G5xj)pnKZLkHl!mi~uYNy4-Crkjd`NVDY7DD7dr1#(
z9X)vfM?S5AclmWIO8Qps;Vw+D5<ZE(DFScx@s=OkF6}74g$FGWr0k47N2NWh!hEWP
zU+!)ZrbJ$vyJ`175{B$i!dLXb61V|<=JI?h4#WO>mB#)o-060H6Z3wL_IR@ecD(k*
zm=di4=B*k8(Nc=o3fqqR;UKg?n4_{aM^wTikQPX&c3g(ftnc<o&}L?l1l^7AA!I>}
zu;Y=rYIMZl*JqSR6Mte<CnY~Ff0Iy}CN*EElMJd|bjlzYuLi7XR76wnlv+97&Xk%@
za=579KERx+8l1_H4H^fMww^OCpp9}3>_8|inNK3kRnH`9^cAOBCj@u4=@E>#nQ0HY
zI#H2|*073@nwu-dv6M6h4ZVEq*K{$_L%~m^jdn6o04aE!!Pmy7UtIt5&Be0ELk#Y`
zYuanZN-}kQE8SI0B+jfT=66ygy6MP-XFXs)@tVouZBjhS09T*Drc5Jym}+{r&`V7`
zdrM**OM)e<83|*m)mqMb&ab5UjDp4r>Ur1f#F4dQt)%LRd?YfW8_kWCFOK{SGAb*l
zOJbk7mlE7|u9a2Z*!jS>I)REMIiqAfeR6H8CD;b>lPOyCSKJySSfVuxL{ULy@D1ni
zlxTfXeDmZ&(#<GnqrGM#NPlytbz$6FMq9O1iz@^1>Y7j(wh>{9@}Qhcku<f#;{xUc
z7Ijkd@%p{X)S0BZ_}Ba|SHCK=_RQQMXE;y8U&0t)08iq)W?o)Qw_wW7xnOc|Ak`;z
z&Va4y{TiS9mLm<5tUV<jDwQ^LLdh`>lcF=OZd>VKIHDrjj7yT=7^$ol)7D)X59^t;
zog(q5Y}~bSXjFG1q+*k(poO3$Di4mLjZKTgzLlJZTgG@0i)b_Pm?D_O>s!Lqx+m9U
z<*;hT;{;Q{81>8i(5<XZ9<*U@pMl+H{Z}NzUXaG6rDZZ?RPEs*=+-`~l3lBmkxSH?
z%K_JzqJQRvX^-8^px^kxl;yrcYGk7O(V>*>rqp`(;6xi30fqTonK)+d5;m=|k(C>n
zzVU`yVJBF{m=j8crB$1ZG}lmsFfCwwCOHZCZ9#46MAijSa}^qIyNCU~*V_5AyEDyW
z2@KL~KkywQdH=+Ck<^f6iM=9XE<uV)j#6@igD^Mnc)76v`(BZS8D2q#c2iAf%+#8o
z%4p0w!il8u3$Wu>P$!>bSPFsQFTT}&3R7gtOp;5J$X(cg<}+xSJvZ#?dEH-c+$!gG
z-F@S=2PH<hliErk#^=spQ^3i|$(CVzi<bK-&S&@pxU5fHL*iyM_0K4r$$KQ#F;$MB
zkb){4x))1$&*f>euDa^}3yEa`b9M^q{3|y1za<#8XcrH&#+0g2wrzFYwb7w{9w^R2
znAC@40!eNitUCwSrxm@%*A6d_Y;fdtGsT{jDdy72*&r<QD{JiL%=pcfS25jaea84u
zVA1Rr2#aytx%dKFz&B1r=w^pv4jkeEpr}yixilLd+DU9CTS4jRa=zYLsy$ELj@G!v
zQ44m{N^~;z(nJ(`SJ4=d_^kzoStqc{tZqIs=@9P6zdTWjBFGW=pl&@<Stk<T7_?4u
z3XRp*FT^WolPuF4F^i_7O0|xOMY@M3hG#zFv4UsTE+V#Ml#lN^;tAbkCzPF3e#J*V
z13XW8I&ludnb?3O$g*cH-3Ie@VHaeTXnmHuo9c8Tw=P9?qP5OwzTT`S*-;(y$ooL<
z&w0G_e$7;F#4yF6P97{14V%{5!Xc}5zGWjuE4<TL7Z9RdsHrH)$-H7bV$(Vv!(AsH
z-x+ik-wv}L{0dV*$0QPy><<d*78BySA~M#)U$bMq)mue+u`9!sTI+D_L?R1=+pt@8
zysO?#8!}A`74>`zy+Zp4M-S7g`W?*;bQ*(>Uo40tVHAZ%$AWZkU|Vb5slv^-C`QJm
zru~v}d3WYM=?;?tFTg)FdDyeJuVu?|4mR0vO~6q$l60bhgReVNW(X`p3V(cs4-EHM
z#`XY@KZ%t2)?{7gH{-|XYek^?i2C*OuQ^kn^@oYPnO+h-5^;`2uCv<R)DJhGnH#jG
zI`e0UyO*G40WPb1XIf=q)tKSj0*+!cNaL?t_=`S|DVV7^1)J^K7G%R@iOriR{n?mT
zV&f4w9>V1t#7gv_0%Aikl@O;$RZY&LSL{N^GMSA6E}VzM+R&3#>vdz9=cS2_Y7r+d
zuMlqAatTh+L6Uig<Hk*J=+RLC>6;*=K=GERu?O{~VKzoY9eI64FJ>6t*e;EH+7z~?
z8{}LYuW`bb`?fT&Bz%;<S<C*sq3cOc#m2Qeqf<Zs)Um`Z>XY)>i^@FD<F-i+mIUWh
z6HDaPsv^pbp_S%UrisX3ffwav_8?UA6U-)4jF){w6s)U7*UkCqRkdB`MAQcp1(D&3
zt&uV&V%|uS;`0a6JQvk~xfyGdHwF_p6k=6lC)&!dmk*Xo&iSRge)x2={o}hGIeQkg
z7HeTKa&5J<13DQ>LZY*M5#KY5uc|rXDQuXa;heSgkxlZLRABls7}Fc)&T5mjl(%1C
z03l4HnKch?;w)Bm^t-dBk>1srVHURN$zrQFE|E+m7c0ef;`R(Atd!|c?z;9cdyB^D
zl_R@S3yzm`Q2`SdqRXSWY>VQH;wv;nK~jgt5-Q%h7A2pMLCQ2&<={<|liM2|XPi9H
zS5!FkEB`;z&M8ROC|T2cmu=g&ZQHhOa~F2mwr$(CZQFKD%{ko>(|t32W+J|e|N4*2
zT<^+U`Memgii7WFp*+V&7dpr;=~YM~j#VGQvLLhkioZ=4jSTGmZrYe*ktNI#r_^gr
zT(mhjCG!<z@u(+DBsxQF8B?#R0b_0Q>PB2?O<zgr>3k2ie_*I8kDnr4&_qL~fHb$f
z=2j4_uFK0G)1+@h=yj09Nvk9_ba~C9jTf{+Sg%e*@w4=9=_;J`2NfjNV^gZP8JBXJ
z`ibbe9qy13D?r0uf{aTm)8ATuHAT6vL<9aJAt<5&y!uAD2|Dg_=XBl%{7QcoeV(L@
zct4^BxtL>z!0RWy?!4;Q?{@s+yzX{yx8Lp<_!Vzjf=SwKF&eOJ!h^f3O6!mHyY?tP
z=4eeHD2=cmtV~`kvX8c9Z)?~TaKYEs2|tj~=*}ExmoX)5dhauXnTx~?iFdMj$MgoR
zlnjq8Q3+nAv$~a7gAu?QjYG8Yifj&{azCe-*y3gpK1{|eROY}8D1NPfk~GH`+%av@
z)1iasgWPO49}E~aqwhdpid}-I?>WxQ2}n^^Uqe)y$_e5Td(EB+tM@`}(p-wv%JS;m
zd?B+pf}0ecbN!+k%Fv`nyLx%l_o%+cP@~faqaJ`Qcy;);3Vwp!KZ`M?{TOPmAbpw3
z|CLaU&L1-m^0j1Hw1&)sP_#A;K|f!^TES7+gb;6YFW!vZQ!P0^Ig2fT*wgEkbapKl
zblvjYk{h~c@L}JKGqUja1&`MF<aS{Z{U_2EcZBg|nA|1dB3)8)tnjM_4PnbD;agVS
z^2D^^Al;gHLtNQ5SA=E)s$dl=_Qa7RueSqi(XP&f0UM1T8IAEvrO}SH>CB}`&m;YY
z705a$x<xkL;yhF}B2=ZM9Z`g60nT-@yPJ~6i%!t+fr=Mj+bw@9WOVrG^DiDE7#^zV
z9YUKxt_d{vxDFV`73WmH)T<JeCW9bW1^T_=XpB!PADKZUQQRDu-xF~_IK@8vR59K!
z?ok426x=Ux?7{BkS&^`c3eZ*-Px4VrUDh7}q8vOTN%jgJ?L0bbkKu*d**jIF8%Rtv
zyD$r8F9kuQ%v^=L$uELd%QxiAEd$c~s)Ao-3qdpnr9p%rt$)Dp456|_UkLI(lI}ya
zd#t*O7~K;%fgE4o$E65am1l-%6GbQ<lCdcdwIexSZ|k@q-(4t7@6|m<4Cl}4!KDYs
zBxq!l?7?5PxcFX?d9r)!d8nN1m`h5IWCVjhwSbdNwP>I-6dAjw+1``^qT@a-Uz}x8
zUs0SZcQvui$Hr4~5GdL-4*T~U)mnuon<93W;44nm8T=ciT93F%;X+a%LS$OHdr5gF
z<y7>sv3jisiuToxkBuH&j?&gx{HIPlqBJ|aI3uBdkJRdUmhc=Hj~Y@)o(cYZxqyT#
zZ2p`sxPyA4YT}UO<`&s+(28pysv0YaC^)p#1qIHfO4|j0@CyFuIsUbi%^+hB7L*zf
zUL2rLJCX<4B)!GKP)&72J$vPXfbI?@f5A?BV4oPKXW9j(8n&+(3}2Bda0o$h&%l2?
z7XLPOa08d!rNXw4LelLq_ky?GG0qr9e>+BA=C(9@DOS_;3Ha!*_#NGDj-oI{R;Zrh
z;2W;N9P`$!P7ILnMe+EeeO5fDkMlr|YR}j2h>?{H2{z&b-R=6@lL?D68qCmB3aX1Q
zQM((|Aj42(Upk9Z;lZ={&U2jIO1|sr6l_p{295%jI}+Jg4Rf=MZb4IdY4Zr(rwC(v
zv=+9s_sPzC<^FnzINUJzttPsEU38{=?B_|k`NfX1WVrcpO}!w$kEb_V5hc+UGT-$s
zowkz#2U2~A2G`$BY$}#XZ019aOL9fS>SB?KBci9w!vkviq47<NoQ@AJ??;5IJN)Kg
zV}uuq&JSjf*Y$>{a-iMinTYEhn~o`tZ*P?YRxmN%0#o0C1NTfGHdz`}E#I!daISt$
zH)$Z=m_O&6RA*xI6EAH1yuJx33{r~lDRe&?BFxT(woN-aSiC((-cEy<V2i5oJpBdB
zd=NVm-};Y-H)6N!39LeUHRBwQz-RU_$d4Cma`bD@Ze`|ST3AeBiGHBfoZZylj$gah
z_ryZp5g-0Ryf;Z9QE<?@T1+H*UvRz$fHL8*8ox{bP$5JLsPruB3@1xqPk^`rr!XSF
zi`q@Yo8d2#bcBGo8uXHHyb~-%HqpbQHnp9EPF{icfl0MUgZr*--W`7b+toTsM(3{>
z@~>Zie)=h#|MAsY$=%M_NWk66*zy0kUpHz(dc`jye`i;Vce8xLks*;mAW_M^<p2<<
z!R69YC!vt}`?JjvSqGRa#3P*mZIU>id37H<L16AVIffwUwd`&jIi@Q<-_Wz_I@)xx
z6gwJnxlDcD6^f_L%rW>L_&jIbXF6VUA8&kfKes=w_Va$w|2bV_0<|ed$dtgIDIvmS
zRZNvp#I8w?bjXyk#|tkKH*Aw(#2TSzFFs?M_d`5kGM^aBTZQ4s{29c`piDaFWK7x3
zaB(Q6+!w0hHArNZVWCMA!;}?a;4dG|s5>>uIKxCRKGGnKv2!eP+A&a%6@geRnAgfk
zVx>^|OO}rEkDq9n(@f_f-*M4~Pc54pqg2K%0V`Cv=pks6WqR{-XcB5<!dQ?bJMAU)
z5o#4rv<ru+q-vzkOxe>dQ#Lj-^oQ4tie7TO3LEp0VRL9x1%R3C!$7dRMd)X86xL4p
z{cYId#|)f1<O76F1@0=DCx)S-$Ha7AJLqAkw=b1RX0Pe9@)9qm<*b>r@)9l{$5yju
zVL5+L23<;tq;z^q1Ei_9FO*R=o?V1#nE9*z;C#Em%%OeWBkYtZvpjwugw-$~$xxa;
zkjy&H*(^3oao@G*N6|(dY*Q@Eu^Aii*q~Evdyo@;m2m5;D0DS<dWB_`XFE!CHG8^+
z<rUgALezBjaEYZQvmGmLHGkTLrNzH)jPQK^u!*H53_C)EvvBIk-V)SB8}BAojC-)-
zv9_BNY+gKZ0H2A29B}0z7w8i1_;k{sT8xXpM|bw&BU>|e4E2`_UqHlrUp>(OjjyJ0
zV|CZ5(S1zY@9F6euhJD{LzBm6<>8~>5<5_EcYP<t&bifdJ<=llMQ_iWh|ki%coX}k
zAD^gux@%@J*8zAN_xw2Eq$PJ%%`)!Ujwqhm77~n0TVG_DhN*mO0MwG`NV)F0)-lyJ
zl&AM0q(8r@Y9}j_iH%Eq^Wrs>D+qTWdFfWFC4#pMPHm~z6ju^vZ~hI3_ZF=OMSEY?
zOjr)<h1FASTpv>Z6UZ3imVtZcDmv6%40v&2`tcwSycO6>dfP{ph3U%b3i{Ovl&VWp
zgPW1TlO3ZXnDZ%83&5A%%(|5f*~a3oJ?l8xSh6^?K3|aueeklbIkc5YHmJ&RG;T8W
zhSo7@2KS9Vd1nRva3MfJ?<!3$Gtrc^JS_@I>|W-O<F1*aO!!%Bl$xwEY;Hbi2GruY
zEqICFlD+ULmI@g>E|e#GLwP4YdQ&=}J2Adv)$%p;^gb9>T3yDZ0<xW>7gW{r8#i?3
z<z&_l1EYb%IUr53ZEO5~=(>C!^~TX5{nXiH$ZW0idetamMba-vv15}!gefYC++gK~
z1XcT8>q{sOV*tlJsya5blNfV%0Y{^UgFX6CAbo{xr`wMHNgf^Y(!{%t_D+mK*HC&7
zH=3s+sk-+`tN0Z))HoglsgY~p$(92&?+hr4Q9A-0+yx|;HX4l|GidLdq@sDACf7-j
z<YK|UzhPSz<0FIufrS-8R@zHjMg?%11vFuL^T_R06oq`8O=QnD;udz0`iUFGHnPx~
z;7xuk2HaxFxo>0h2giA3p`2eo+lErSfe6yP{Y?5=be!(PZGuo`yxa_r|KhUNX;cpn
zpI>At^`E1TVcqXtvtKV1ffeb{*O3AN@I=q!vTok#h`?`(?GeeGYUh<w!$A#)N{Ml&
zar{7xNQ-?+d3J0gJt-$mRL>#V_7kPgy<-;rWPDg<js_OWlHFNClrORO-~@zqH=Ec_
ztWCMJjzeFgbKE}x8YJXx1uss0N0qoT@ns2220v^nqNj{g=XjY0Au?y9A4}&AGX@>j
z#zrZMOEOX%IL5@uinB887$}A+H}&MF{h;Ce6!6vd!U}(3vh<vPPy7zh7WqN4H;pJH
zC&Ur2RA~+QSqL)p6f@{wo9Fu@8gE&$k1w8za(t%P2{3FlVEKj@{ej=IVN5eV3y+p}
z4MsjdX?SQj&HE}h+Bq$qMmdX#WqHu76evdOdJbEA^C>rGlFVHom3X%W;O|D#&;&90
z{Wpnxp-tG@uzN8rWKGfbn>p=OO*V2MI0gzC#N6?Vsm-t?L%=))w#{qkP>>qe`1RNL
zunJGb9eg4q9V2AflN||)i4oZ)bN!i5y0Yn{^J`vdYd-l5y3_4ll99ce0t&ZsQjs5+
z%YD)6d+AgU%Z4ahBkwRf;JQ8UZpFS++&=%j58(D7V8{5wK9cDUe@&)kw5$zrzWYYc
z^DSJq=VV1@<~w3QR!(*uV}3%Z{=J$=A;@QhPt|-p2)D2-=F4vvw3Sg{Zoz4Xru4m-
zs(D9{Pk$ft9r=qrr;Qxl(enJZT4z6Jv2t(_K(6lgTk3$UVqM26=LpPhe%cvHj?bZ-
z?YV2Vr#Cpa+J$5~hF3Sm)zimTZvHI?s5fq%ozKmj<WuATox)``PyU|ka|*jQA^sA)
zCKSj#5lU*aD=1OSa-kzwtgUj$53eEzRJ);!4b1DGZSW~Mzj5@Z1`vO3V1oNLe%}L}
zRY~8tJ?Ma~f@Ua>^sU*`X^>7yUwS*(E9M$tO=yqkFN>M&6ZL?tVpyn-C|-;Qd~c(f
z;Rn~uFZ~(P2ibAS(9mwO?}eP{8vuQ;vB!m()BC|B#LsXZ)|2fA+yIymR?}zJMaGlv
zyaPP!rBg7{N^kKYXlM-hu_$s}Rwn6jJSpy`gpC=iv`o1fO%)rn)U>QAhuhHBTx0Xq
zQ#?w1R#v9qr1^7%lh`cGv|MO{gW|~<=u04LI(;YC7wXH0;LgiiO8u4Aubg4zpRtp8
z6JLjFFFfa-mTyUYe!yhE5Tyhqo7DDl5L{VQKPkT;X4xTv%(ZKpnOT3sZctM2F|>nR
zRBmLCjUXoZ_@1B+{2G~uuyOS1%stw(fDCn%Eecu7GfcGdIBUb~T4<nSl%d4PSYy1<
zE85gnl&K6^*l^gMec<0=T1uow;Jp@B5WQd|_uLa~+Wt(sUz`PFP{OyPc`Y%FtNUq~
zVMn2P?sLDGiYMTqt?N_@EES<D)hOW&zeDNjb#YmsE0rkc8%u*!sxl3B(p$pi93E<E
z%wYZo$RwxIA(%XtZ}*vZIFe*Bfd|A#U4y?@ngjHq19IRv^vNN;2Q6hXl`-yx2t~c3
zOQQ8FdfO2n*ez{*zB(z$Pt#T^DtLP}26K4nWQ`ms0hB9<Ly3>gv|LM>jx?+e<SCo;
zW(@kyPCvMRRPM4o-geM<D`e7$b0#a>H|M*Lp^z(KLN!SCj|CPLH1kDU5@?Ef%a7`L
z2X^GOC1k%1^eE3#htJnso%Q+oyizwfO|l<Z6K9eLJna=aho2=aQdhG-1+2>tB@?UA
zB?n9?zjjRHt0yotF=S$6<8-IrSMUfMGO3#M#v^FQOwdT)W*(VcoBjn<Q93BZ2-VuQ
zi~dNV5S^}>zjguHtYuWqqpx6#&r!tPjd~p4>!}83$gpvpQ=+1*AX0)A^R85sh|2b7
zQ!JQh0Q1W`NX*>eS!%^;>^FNLUqOkMR#zU?pGtTmOIZoXimA}+G*dRtyfY25kfkX*
z-#_*uJh5*ZR6MfYs(7G$Hw!*sKaD;VMKla4H7~KL(AH2~Y-xBSV{Pbs;HKcAOKJoL
zJfAGbY7$Li<_)s94UwI6;GU8Kw@@GA<}sAyqb9e~xQ1?DLt*DUuv0dRDw)BO&y6UV
zaWd79<Aj{gk8cXo*M$Yz)ajlVwR>z2cXD@T&2OLIwFtl9$4T>d>7_UH*aq#R;kf=T
zOg9wM!s5ob*(I0+*1bDaN~8VezOGRM8bu(gIE8+XyRP?OG@pL2wmOy0L<O7lKB2i~
zsDS+>-p)Fm!_OvkBa|_0eF<-1QjzSiR78nq1iJ1wcOuznFO*MnVlAf1mw}7>^Kh6)
z6S;M;&J|3?A`1uPiffhS)VvIu|J(x2q(55kcg(;!skJbvmqspR3vF7gTi^LvD~&_g
z50FiJ#n_`D8@};?S}3NoNbUWco)@))^|C4dp{+c&X-?!?vBK@Lhwp%WT6)`#shQ-V
z1yye1VgfVS+bEnRH?m6M>#oqCzyi>YfG2hG2>UCkc$A4kSk+j=CB7zouI_wp1~qJ@
zufd?PuS$Bn3dN<dMs!%v+kixrOg5Dlhq@u`?FOCAgDiGE{Kx(ST4m_;)mLl2NUTY;
zg&W^?W<SasBKbXLDu%=0mW1c_#J3Dp@|}S=8{)^iS30%+`3}?<Ejrzv^B3T&gIDl}
zD{EI_*7J_e<A#4Fo>#ghMpyqK%SQ3Zx_!5Y4=QZQallKKhYv1n=`m~#A;~jcGUxtU
zG!!~<GN6_#9ku_a*46G3ln(;$<)D<iT4$STXMw#sEGEo;utm!m@kPI346o}4yw8Mf
z4Zx*&(V5hmMhEQLYkO=hBumv=Yg#KLhid&4tq;<hjnTD#Ksx&by^B7asw~`8&L&z1
zq@aXjut?0hTpDKtE^^ae-Nr8b!DVlVHk!IrI~~O~3jbFo+L%MW`e3Nk8caUhb`vP6
zy*KI5Mkv4%s{`#2LJNb0@)iFSPlTbL|K}k66-~r6+hmrhY2QobNka1dg(p3K|2`YL
ze_v)w3-s}2f5H8;KEpT)vM3!f0F9)BWa~)1s9X|fqGcWfi3ZJwqLf8mA6Stvoj0q=
z#kN)PrD#<G`+!J2X0gf&qs;ikIMXmA{&p3Tkp{$*-jPm5k^bP36pD}ZB;j@!XuXTR
zi@nmK+fp55@o?$r;qWOE|6lT_F1O{P4$J{@t+rS|SdkoH#MvOlox#+bT$D>QzF*}t
ze#|W??EDpiXvzA3!N#bdt0GbU(r)m{b4FGL3|iu*dlnp^x}>bU%X=Ui(V8a^8xpL$
zEN>*d%{ws2!l-!Z0e1)&$A6j?iNlnr<E#3}>PhU8BI!%Hqq4;|>@UGT680;TXnLk^
z9F21Pl6J+bR$FXQv-bF!_stn)@)Jx`?x+;QcGv=)IkNVB)e&I~=sr<*0c&vk?6A9Y
z08_kTyzPDw-bxzd_7Q!3LQJRMM&7LexFglO4xoq?5o{$I*bkJyGV6qtrD7->VuDbs
zk0-HFH<h3`E_RJHB;Q~vf*#(zvlQ7<xnr#nsB>V2J2JGSVD_S;-wzECSWp&)-T$Q%
z5-^WI77xSd?cio(^nayfF}2g<Do&arb7%Dj=9F)(^c=Ms&H+&eF>aUd)C~ldzpF_K
zC<xocb0jmVfwQV(<T#`!KxV*MyseL=H>0j@aNz>rHJtW{a!frg{Wubv7_wd6MV}ZQ
zy1wzO;s;K0;9k}lqTGnjEYvZ6m`|W|9JC~K?ijjCV{TIz#;x10$Ru{PFDhg(VU+ea
z$E{vT3#ucfZ?QtNAY?no#B$|y*}Fh!Bsnii<C927D&bde?y*YeZF=Z~SeMBlJW<5$
zB_QT6oPDl8{Faq-1V~%x1F$*Bc6)LE@*b#$&ciO0)t}*Esu9L)7NqsYf<}JY^BRg7
zM1Jz}Oy~?;m6U#pBWov*>Wo-DPP;Icf4XCPf-uQ8ty2|RJsz%AW$;WK)j>Dbg&WqH
zynvK`IwO0Bw7+1Ua7nvhmUnT)S|7A(9(563#TncRMuFsW)I>+g7h%e+vG(O!AHRrW
z%duL=6)dN8n+LjPbr}lLj{tJ-xH`Uvs6IsQMxUntcHbC<w5Fs@w#HKNWY6&F`~AS!
zF?4wD7SO+E-4Qx{ixRbCJ$FMB+2hHC;#+w8^2cP4kOL5R-pi}6<c{2M!t??7BVFJH
zR@)!Yi5zBjv!HxuGlG9Z73OO|<8mW@Ve4RRyzB{OnMbv;nSDy6WQ|iRz0|_@XpKSH
zGuaYf`5jVnVGg1rxLe|8mgi$~W1{tmhez(D85Y+8&zuUecld>qHj!*jnEYEWj4GNU
zf<1%_I~xgCC^5=+4v&VHx!^-8@mt~Wxa^~u;yZr~u<WDS{+FTQd1;t0s6AO2SMn`g
zeTrk|#Q}VUc5hWzNiG$D?q6yf!c5&Eny5?9OZXQ`hJA2Pcs!+(!PnH+wm+d(`S&jQ
zf|sGZ24RH$;jj9P?cgRU({ez&muLn6@<5q&kz<;i3m6v$4JBbg62LM$voQQ9(1UnG
z<N8ck`(&{5tLV&32;;5Zzcay5Bl%I0Z=RS@(#^1{_V{X8N=_s6f>(D^OM2z7c4?fp
zh%AwHR#!iH2&U#4*pM{Y5?t)2(^t?@g0xZlYtG368e_^<#b!1f(&LCYCBxQbf-dPq
zP7=&*M43hDV9+^EIgC#epkX?A(d_ApM+?hOQP#Dk_BSLoM?+NFP|t~K!wu3@P(aQ1
zTPoFgT{hpy3N8mh;MI+KcLeQgNMcyxeJelSLB6y0xA9Gq@25w*({ZPoxudEeJk<eO
z62V)H>jEx`J#U308+8eMCZ{9x^5a&F#MRtXN!(1uwI~phi6y*7<7)6~r0{0qUh2fW
zh~@VjB>Myg8Tf@I4eG_bnC15%B>Moy7=BAg?pYGp8|MuuCk6h^H1^L-8dxN_eH03{
zqymSO83&S~4z(2CYZD4R2Nq6s4G0RGuu+j!i{zN|pNgB~ZmI24N%}Ko43T+%3DCz%
zZrt(4LULZ$M+WnE()vqW%CK5ifE@YOA<dsIF(<z04uRr0!WQGI=(Hu;M2|BwDA)`o
zD`)<eOL@mQEkM-_XKSb9LQ#WHC=shyIYF>7LDF=Vh$G!Z4tuijuM-G$o`pruy&1i`
z8ThL?Iq->=hUoaIMbVbb@qA}PT+@t2TQ*I>uo*PBpo~Xs+9`38TcXAZwngEO8yinv
zctbDJl89WfiqsE`WT8-bMx=3KdmqS%P+}y9zoWo*Cuzr>gs^%>MsE!<?1^N4OK(jt
z2#J$NxuKa&chY!Gj^@%TIu_!Ru{6ybfYpUVc6LjDjicz0{$b_RLin9={>pSs3}dii
z80k{Gs55NWD%q>36A(1=NE<)i`hz1=jI_5Xk`KQh`M6DkK8UPJd#MAMNsH480D3u!
zEGIVY0{y6~2F6eVc8@Bv_6m=*lS!uOFCbbW25=pxTI6k=xLPE5ovK=7yzzJq#@{+e
zwaCyqKRd&R-$-D;r$==h{^T;+Q0mjlM&YnY=7-jZh?~=C<0OMv&aN8UH>is)VhsKs
zg6W@x-*6yxT6bAt!0AO3Cff?o#O<}_o~{vm<+`tsm}%s{59Q3tYok4i#6p`CX$zP>
zK#LpnbLH%0<IQH#<Hk<Q{^tGHw@-N7NSJ9qUfq>H^keS-*9I-oKOC(ZRW=<Fm65(T
znMO(I6aNUC3k$1TQI4%yghLp_Gnproti~1W{yb=JZ=I*2YEPBcsOaFB8fj;|17#b@
zXJh!Q?`A0C<tp-Nh&Cl|r)YVuHjsL8y}#9dyy1DZx%qMTvIY2SUrj;}jnMvL*eqj#
z(<++6EqT%tY8b5nXlocP8Y4RX?a!sW3NC{bofHF>=}XA2VUR4E3>jPL4&!ghO0C9q
zm3%W6<!N2{$s%*+f;GDHtYsz4r!*zax$1HIMa?dv%BIVvG@-2OsrM)3c4@!}nc%PI
zsEXAj%B*f0IE@A1DK5vdN^}oDLsIHtm&+$|zpmKXG(NWT#loH}|I4wtvqa-l)k))E
zjVENTaVLu++j`$Mw76_YEGjLAWvNN}P2|c0WtB>itt1)l<M%9Ift>C+mdbIa(<$x-
zNPy6=<}*zOTgM2$GY`cj#&he{GBIBboP9H<fDB7^opBe{u)58gGl(K2qqr0inoxc$
zF@oq+A`|{Z^fh+?@r)yVM=0P4GBW#+S0bHVI8!;9ygoT&HN6u03}d!oyR2P+F@WLH
z6OHEo{b8VL*K;*ru8(UpzpN!4mKXxSiJetZepVPZY19$NoK3d0vjZSbBkA<!@%aR6
zW^dIUvYCC`VR<mLnQ)YIEw;wa4))!P^A+Rs=q`nOC|4f>nPls#4c?zOS#o3=N|6$1
z%}^>c3_fsC7b;NG8kIxPKxC|Ik8N@lcs84Bu6f^nc}r-xwwRpE1E79QV@V9rLA3r*
z8;z$_&9?~>)$=D77bTTx8s*b+hrD{l;v;1+AyWmaG@Ua&c|K*^JI?1j=hqa1Po%#Y
zym19e0Y4r92LdJ@+=l1^RYPGFp3&aKe*0SSakXT>z~`=ioVN$qt+a&RXbp>uig>8t
zV_vSJsTUfu!d(N2Zyaqyy2gAkg01@0#b$ZOS~ZwH$0&QS2(v8JdyW>dL~afH%-<xh
z*kbVZ@NC`k=HS{x$srRoeD%AfyQOrcBM!ui%|kNyBXqvfAkgOejSs+7?LqEYzCh?A
z>JGEMB)x!i^*;|Z-ypJtT<_ugkoW}Z-eTJ0>f~#Xz!7+bU_ExqoJpqrR{es;BEW)K
zFv@Z8(||}oEV1-!!zsb^YeOz+_wzt1*#>+h(N>4bRK7t8^ZmQVVvS8^4cIW!?^Tnt
z2WFoF0g;hs8RGF&FF9xyYi?8wayl&tKeVv(Z-^qGOMs1*9}Hd^^pCmD|2^rQjhswq
zjotLE?W~MxMU8EKRx1sag_J~u-R!Jvf7UMl0R~ptR76z$Nq(Xhd@2cec`Iwx@H8Mh
zWGhSHSTLbkI3=EnH4EBN;tHyEd)ENHZfnj@%(sPN8Qn(3#)Z;LN7J{B??T2;HrM(A
zGyzU1o{LYm=M2Z`*Uael?$6J8#b2E-3_+c2+aWMoaxt;@E&f(Wi1bW528cAf4umB1
z$b$|3=)kH5a`rr7t_NPsVV2TneUE5cgPsn~`QWU#K8S24hcV%?eO^ccy&3)h{Vel#
zouTwWyFd70C!K><_uT3G4r|m$cSG@$D!aQp)n-%M&<s;Gn^=<dSw@Aa;>gO`H*^EX
zCdXh-cMkm(cW%dUTKmld6@J=#gz=`|{R|!aL3Uj9{cz|__x}2#Lp4H84Q!Md<UE!L
zh0KP>E$sc+f>(eB1L>?<K?@>7M^@vG5$c@@&qq$R6R6=uGt&@~Y$7%uMiks_T1=;1
zI(7N@rfW2%5s@^<LbHrk<H~fhp5|G-()%>nKFHl&;#wB;;#LYibB+?N93vtvig?q(
zO%~A+vG@~SNwg4i&E;5mai%FAWA-m$5tQ`8-bJDG2IjszaFm0MdPHZVFiL|NUi1|L
z-*~6#rPJ-qx&?bj;?c(_L+<o+6672%GA(mgZkZqP<il90+Rh4$MotV3%IX|P;K9lA
zhAe}8D3F4Z$fCcNr!eOD9BOp^a{r9sovj+<iMevs;^2gh6lgm*5Lk<WJ}rnZ6Dow{
z?mLX$MF`Uywt%pw*HNwS`y4Si*#;h3$3yTRJizZBVfD*8rOKcTW?LclLMy>aZcxFW
z-c3F@xxAQUkT&2<GNA;b+&Ed*6U)|1>eL>zrnwF;MReeKlCd{rSSO)StA;CnT4vdi
zn*mp<IfN-0*a!?-&M0+6sDgE9Y$#v!mqU4f8-6{AaU3E^ggY4)>Y+b@HVq=C4WkB@
zuO_t$1kAYgP+jZD>6NJo$@f|#!+=%wH~Mc7z>i5r&TCLQ1mEPIC|0jJ=oUPb#D5?G
z3S6c@z!u)MsSYAsk*TTF`nOZ?^aLjeQt<Q*Cx}t-^cGX_^m8L;??a9~^Z*CVYhDCF
z4YP=JCuGk<8mC&Q2jydq!#QzHb5~^(3MjWU@#+#Q1)W<)dMJ2#$BwjdHx}CT&M3ic
z(LD?Gx~PynW>YWHhkszOw4RS;x$$(Z7HK;wAE#MRG(3dQ3ZciLr)L<jJ?}|D-g|X{
zTsG3UN9yC|5(h9@K&XqJ&q!G#6W8e(NkYtNMFB;M6wNW}9iC3-U%ij&4O^zWT>9L@
zKKg2jb&>z}fy#V@)A<e(ij)9u7)E^5#W&7sSFgQz^_5*V!hB|(?xsw9mID78Vgb8@
zdYa$f(RsUs+ReI{YG@;Er=Nt@t23W($#x+e=ryyE<^hqc3kD;`^bvW%CxG=j1!S@j
zO&RJB!SfR-xaGK9s6*ngk1+`nETBKIytsRQ;8v`eBz;f<uj9L+^M+p=Je9;#SLs!1
zqZ1&NS)LzZHS_TgDz99}5qvGA+-MIY>wh-;V96OK;uZ((&I_&Z1$aC&WQqoLY-)FY
zC%kBx+gu|k&N_evtK1_FdThVS!kzWlZa)6=-CZ@eBHDy23b{paA`_EH8p^}BPoB|>
zH)Mx_N*{<R)=L!jK&#%wbt7BVHY2^2N%V??g%3NuL*}IM*BoRmRDp+6h5fv+o@YBw
zZu|;Dgu!n3R^-?H^}sxzqXft9ejQ`r?Uix^y^*I%ZTPMwR-Q2DCN_SNSPLNo`Ef|b
z;U2t;@AEV+ECtOqv8f-VnMhCsE@^z7(<TtR+o<a$o3<OA=M70OJU?cuGl1=TYiwCd
zI)SMUv0!>=dv*@Qt4ta47<t~qD?9QVq%{Nxa>iFYUYSfF{+z<p6i%9PTcGCiM)hdX
zX_*xS=yEVP?<ljBIdm9CKiTCR&D36o<ZN9i2H_bIyAibWzk}o7QWq&Qe)M4Mj~*2K
ze?J-oZJlkL{&6rADqE`}iX!=tU}~+{914r;nw497ixboUbTvTdAVG-`?)+GU=yF`Z
z=yRoSI@RUQMEjiDo^+deQg}R^z@9{(on}sg7)W`*1jBN=np|~GduM+hZ!cf}k}{m^
z!y}<)YvKA6pTa<2c^V-+j*O^Eosx1f%q(3EyTj>ay9<ic&{q-4j(88{ZL<PK5Ttl$
z>J`k%14|)89wH=M0ua!MT&CM{r^1>YJ2KCL-Ue?YL_lg{j7KQIP}d3_^k8rT9@JY0
zH6W4S_5*L|I-Q>sV|Qxp?oC`cK48GM9_Q$fbSR;KskV33WLw{W>`$wwb`9JQ;x9NT
zn!m}T7QMK$m_(hb0}G71>MsNx6u$g$;t$>pb_Qs$xopcr8Ko;pZ{at+%sA-!NKENA
z&VFK0LLEb#mq8OH4)BN+D~ThMtaQY875aP>W8Nr1{2Mo<Y}_amyUkT4+9sm#vIS8M
z(T8cBR01!)tu7aKooKABTFQ$OCT+4sJD^jk9HGU!x!RPb$iz^BbOdgj-+pmxkV>1<
zGOD+vbQ0F)!n|I1xJQ%!0l%#Qi#diYT)O%5&>b~iV|mCtB2BI4mJr1nN<T%v1aV;q
zOlDQBkJ{p%CN@@>^`Y1bsn%+*DH7o-rNuHo!jna%W{l3Y0>vcdEcd%!woFr|uynR+
z)ULHzMp<ZMAKukwey~@LZsI`&zP-;GhXxr{sZbu7f1!x^qj#Urcx~55u)^k)A<;oQ
z8<x?Xr-ZhC6VlwS8uc0cXN~MpQF@d@niIUD(FL1Y<-lW;Fu9T2p&?+p#$^D9r@_6L
zlE7I1R87%-=WaDDWQh>REn%kW8bUC|8gtz3j&oYnx94j($4Au&&iaHWWcwSTt4Mbm
z?=wTetx7b~W%|(!UOM1ZyntD-*f>s4Nc5hz_il)?K&NM<c$?r8z}t7|SN4nt0!Zbt
z_$z^x(JguAMG*l{|FIDIkXd#Z|8;!BQRLybNH%|XZPxO-cZS)l;0Zhc@Q}W~|5H5B
zQA2Mu@r^D~maq7>DqB3-TV}};&o{`?e)36o@=W4wte>}9G|5+=Xj+tPhyN!iI7aO#
z)SFo*KW~+sL)F|SvwsIp`XuXb#Ci|cq&RVKlAo%p$-pLLpOv4<&lQ;{;f4z1h8nu;
zz$U31C6^A991>5!@ndOrvA_QyJDGrc)E@?X1ZKhGLN>3&W@m<$ToQHB84)qOhWPz&
z&<%V=$IyzO<tG==U%wdt@5yiN@J}@;tw8jTWN%ddFQ)5P5w-S8Rns5&Ta8AGcHpEX
zOjl;0M7bWM0D3O#=6ILUy7>#6A9#4S6p@?w$L}w){Z#E7qL-P$73t%N4EAHrWOmb|
z<?CzS-%7U*U;%Dq!Fgf$2}%+Cs&Xpye?p;1^vO~b<mBayM=nxjdiN)*2DyVAq}{0d
zFH%&}F|F&#g@(=6JvV9sG2LAHR~{ogh`5GhVAh&+Qw&m01TUFgTTL}BolDtl+g*Nq
z7PKP@(fixyPy{W+nb-xJOxnFV+t+utIO~6IPz5AZ^Q#QpnpM#$<mk|VXbs58R5a{G
z*N%H=_Fhy<<Xuw3h$zpW&Rw`<Zc-u6HsICj$0mwJt;m!YeddQg4mGpT9GiGj{A-*H
z?4Ca1fdm3VAf3MFP-k~tgPeZUTDrZ68n$6ornjn16HkMPhVh(7S>nYF7aCDF*3FGW
z0ZnEUZe_dZMa)_~!qDhu7<DPLbdY^IA&8R{-)O)M2zJv@K!t`cP_lWnWNHZ_*^1)B
z9F9R55q)-2&Bj(tE-k{NY2n$o&GMUKu=S>F^T*u$L9<z|q`?B4IDm9i#H$)m=Ytwx
zrjYwBB+<*)+?*%XBC^lsb!4K84Mqij$B;J3Fu!N;3okWjH>43<G#w#y=)5ShHeaeE
zSLQ1M{gU3MAp4KDGoaZwdt4&+>muE82Pn=V?jAy*W+O`oV#h~ufI<`m=LF+(d{Nlf
zu!~lqD&ynwlVN)vZUKoN1c`Wq{Rx?h#Npfm<S`oUvIdE@{5tHa_c+AGQ$P~nNUDAL
z^FEMEjUGWkc!cU1IE6euFcXkGS}aaRC~OR=2c|^dL{}VK09ge+-8TqIf#mUoGdPLd
zSqEB!1xDJVL0hC4Gj+!Jg}UkYbn<QgoUq9!p5!-#y_&tl{%a>hd-&jv?Z*e|<Ht#v
z=YJUd3dV-E4n_*b`bPic%@itYDq;yEeR<f{Q}q)Q;s0UHLrc<*S6{#jB0}azhy_N1
z`|E}o6qIR{x|*g<z}tgLO{+{kjw-F=wbQGHTrO3@p!`;({6!-9+2*>Tt%oacS7X{f
zeaN}Z?%j4=?d$Ui-4nhdw#!zO;HDUAPJAaZftDhT@7IddcAMnatZ3Hj1WA2A){&zU
zF7YgbAW7a4{tQKhsV1Hasha3B*7NgkB8dUb>BRf-$$q7I_cv*6V{B{aCi`inFDKFa
zrI0pL0*F?tXood5>r*QMEU)L#+iyK?j0po2Sb{0)n3oH{n%hP}Q^V`^>Ba+KyD!Jr
zYo?bos5PjDFE|}~57W@B`jtQBGN_QZ82@(E;qZa0+i@AYH=>ho7&jI_Z|%UfQgaz~
zftaIMvFy^VgSdh=XYUfVC|@$s(~ga>%S?Us6CrbmYRT%kEpk#f>5QV|OPp!d760=f
zHy<SgIBG+zY-sAdZy>x>8D_mlWfU38B4xu_r)2LP-PH6q>{2VcR<eyAgok0Kc-mUM
zhiv7L!h?ysp%06gyc%*KGfy{3Lwv3j1A{ZE4qG&79I#bOMvvz6<kM2e*@#nhnm%J3
z{YC8;z&_}jd&xw9+k5P#{_pb`KW7e8<xy0e;)6tvl@Y^$(s=coW&lwgm&KmuWvuBE
zl~gz2d=i}@bwt^ngi!bb=?4PpnRO$h@{+MfabgkRw)AZyMA8{U0!86-Mt#FXz?HsU
zhF!hL{B0u`;J352SL*6Lo3S)`;sd=Z$Os+_PPfLh7svAD0>#z*GMJQ}Q$cJWx9_{R
zJ#0x>=JXY}RwbuYD?0C)HLoF03~p3NzwgktlsCTm;GAHr>|>j*8+e{!F2v5g(3|ap
zFwjxYdnDKR5PR4LV;sENjLpBb;V08;pXOr;EIvay)3;Eno64pS`|NzxAU9mONC%~G
z4(x#!lNSWyD=5I}ZNW=W$Vc1_i@WV75MKPR#2mNC?mn9_Ma@Y?$>t!=7*l2`CTX09
zi?Of3*00~S^{$pXmmvyULRi}|$IZ|^*!XrCkI=Q`5NwKoU88J=&T&1J&$C(6U*Lil
zImGLW_PCj@_pmdC5*%-wfJg`6So&g}S=TWG&kI`y+|ABwK5Ow(>4T6Fr@PCyOj9Q(
z@%YG+s_37XnII1FAVJ@S0%#usiXB2c9p(7l_Fs_RMpPfLXtme_d9EcT4q|5eupHIu
z&Q>Gvq6Lp}Wv&G()yR8;$U2?g${Cp;8Ezqn^3|Y)5I8>%DHctz*LgX3`+8)d(MI9+
zmA?!Yuui4+u!dmdR)xKI6J?*}*sJx>ymHO$F7*9-b32hhx@BycX+ou3rh&~OuBts2
ziWvAa_^oAkm~kWnok&8QVq^}c1Q!`X$MrQa1B4w}JpIo<3yzU5zd^DV5z%)gZ+MO?
zU{RSZKmP6ctoY0cT*;3bZ~Zxth5m;%N!HoP&e`c_E7@4z`oDkw6XvP?)2Bi6kwK^%
zZ38Fbt5DOhs2qZ>`J-DX*MO(CCRCw@Rg9WBSqB(Q)6SV5xet4z`jCg(d5dDov{E*)
zlFCsTHLp9%JBz;Fbxb5pR&hy3;<&|q{OLAz?{mTPeLsx-3sFrzw!gtYT6usjpCB3%
zf~M@G&nb;6J7m)e!m;l%j%}jYPNmmB42lNKUOhw>*{f*B5!g7WMHLqPZ5;E6dVVL%
zo_WW{jc_LyTZ*2jxOKcAJ&16yX;^{#;e6KtZRO}_p%gi2AfwrW(J8srQn%Dr<#MM@
z-B~Q2)-&m}BIP&gaS{W3)jh_lOFz>y)LP#8T5pEok_F4F7L{jjn*Li+qYdWyQQ7h9
z5J$H$X3}xc?`n*6Kl{}pFdOA6>}^;vaHKqQ#s?P!3u8)!)75gLlRB(wk&b=g-JGfI
zCM({9EKNl@eL1RWd#msk)G%G<0PJ~f#uJ>YD)9*;Jx!e&2~`m`xUXeQ99MLb{2$~}
zJ{?+d<48=^Y8QG+E^5f0DBF;Q^4THJxK$=+Im7ydgPF#*l9aAXIEiR4G6>zoRr#`C
z8Y7J)d^<PEkJGn(k6GDAohmGqr(Zb!Sd(?We_{|o5E@9H$EFb_7YyVr&m+FU67xf@
zZa=-gsW~)<1V)lw;_i)QSm7?D)_hIGIYgIHihI3$;q3WFz0zh3q4J0Cgm0cF=j<i8
zF&evEta+NZJbW<0dw4av$^5f*a*#XG*e)*!C9;P!vY;-#Lx?0OUO4?*&!IV$r}?4q
z^~rrofh~b^DwFN3ISx1nAo?e~0juPCh;6_op)i!FSlOH#E(VU?AD6#Hj8S{od{=Kb
zH2mNrxhO`3z^U`iv3M&qztg>b6#}LUXAq^{!g0UoqdHYVXA+(~gT;l~6Q=*+A)V%v
z=!PxyThPgi{$WrE&8PWLH|#ec6ukaf0?)9@rG|rqPee8=^m@Zz0D-w2^pX(&Y6#bF
znxqRSkbyS<)kd}zDu;p=kezW<@co%AbO|$5TXPE>9kJ9UzD(qF-E&95_yj-LmHJ&_
z_P^}}x^0ElxBGtbZ17E^01p$#B8tU-q#iIzT%b}+RJ&nLm+sMyJR#&e@dRuWiD%MK
z!}aP%j1b65#KpABqX|_-heYXT#HHd{v$oCEs)|vvO}2gW6^S0WS0(lnJ0ReRcRp0t
zijAGwj6w#d;arvx+3P$>U+@%28a{lBiOA^3WVx>$)`)>a!I{6lWB#i;d8S*6+52G=
zr2bR`-v6i){Ig-J`lC$#>pHGrY~*bCk7e~th5v%Ba?jIfw^jLjExqwClmRb5zZqG0
znGeJ$(J$zQ$j)CDVApZhr)Cnr)F1F8{I^WcOp+raBDyd!eaiB@Zhg*5<@NRb1i68r
z1pq2}Z^6t7^gw>L+op!~ssJj<x!Mus!Fe~MQ`(XszT^yYaCA=YwLI_AOF0!I=0P@>
zO`+{bhGSTsrI&SSjh*5F=H^Ub7c9@8#M?uU4kp03>snH0N&qDYYpfG_%3BcHiz!wo
z$g5s^SZ|Ya5q$_(YX~7aRG_C~?sk59{T+=^1tOn(w~)hZ5t{xRIkkIW60U<cLfE8G
zr8Sz=$kkcjpq5<Q6B0G7(+LRQ2G2;XyX_EG0Gx0?nv*9h$SueuM7d(NWjU*<w)a<%
zog};?=dL@=9o)Iw)XmXa0aQJrev{`rK~upJ5QpMPchA-HMp#1bL4z)qB&BmhVO?Kb
zn}2bEjXLU6$1`Io&#lUY`Df=*;j8J6i~e-3DlA9V_{^yxpmN*975S_rsp#D%(K+*F
z0ddh%RIg9OMzfi-DyEuWaBxa)w;fiX`o_-`l6^ukkZ^@);njka>=HZQ3+Nh5k+dXc
z_j}+$cnTz`G!@(w8d9ymJCr9n#yG4j(&0+ZI$0PX5+nQT-wu+r5oPYppYVzO37`K=
z-2O9s#Epz?oXnlvrS<LX{wZ!V6|7~E=;3@kXsIg7@km3m3<eON`K)JXMIv=WQY2yU
zvs(2&7S=7VBwW@fZIXMY<6z^&_F!*B(ThR-`eR~d3NlkoO|oQ6-;S@Vw}0vE{tbTT
zK|51)`xAVn_HZ*IitG$Z`+6h)d@~>zkQ)b9h<?<6eW_hb>PGeO$JMInVMn`O^2>QX
z8EW3T@bH-6bo8S9WH4}|Nt$S{Y9ygvwl{19gkA6wfdS-u--k%CJYtHQ%Q(fYx}gpi
zZEW|1gi|Tuy~(=nH)?4}P#moAc1i;5V%9<I?7+gl_|RdO*qqv(impZZMQ@RqVrnZT
zN6`pDYpxc3WB65a6;aS)`6;hBH6yaM54atuBTf<^mmpeBD010#+adE3TFEEBB;vaV
z)t%PSZ<X7O-AQM5bdxE}&c*e_*h1HAK~wB*xE1!1N+~o6n>$C+e1)K9Ck?f<BqZ?0
zIx7gytIZ<vD0jqIhKs;tl59WCzu1D+J_n~*>Dm?gVLqjr#B(-yc<%8P&~JkaB2-ub
z-kbubaEF3QQS!uwd!!w>`4O5<Z$y5=8^VC|1;C{4DZSQl^$nDP3uIv?QfY?1(sRPf
zs!?>Wyo(IPfrd)T6Ns(h>S&qIH*QGdBl}AS|27|{y=Lj#`)LVagZ%o%|Nm2W{y8Hq
zR9aU+<cH&qx2l6fX(g8L*nuYAZSqG81$PV0DTR<TrlNWn95s<bN!na6N%oh$h4gw7
z&yg56?e3qhS;zv;4OBvZ)Hn6?D4J&9oM+VSy7mR2i!^|uk8sw<LUc$`${n_YRHP^}
z4B2x*HL(*F78%q=u7{fKFi+p&5@bu%F_wdnQKoxHkkX`jPe<P6IYtiW`XnCSS)y>2
zX6JRH(Al(N_`^@R)?^2XV#@;Tek2j4Iqx2T>72XlzG#o(vAKcJw!4yLsq!GLin{Jq
zuMhtDcV7iE%GPsQXpwc`Mkz|E)o)TxCUNN%WtrM$`GMi@TH}V_@UxU8tw-Ns*8b7j
zj^=G=(Dk8F(k^r@p&|$HWKLTj9)E&h2QsX<BLG&=>=QdWLC{2uA2NDecd|qAZF$Bi
zO2n&bvGNq6p6lxV&=7(x*3b}26ZGtj)UZw22!m77WZES6c44E3r?8duqibShBK9`(
zKpr{spe)v9L16KbMh*11Il@QW&6$kyAU8Do0kheO3VpFY&NDMtk6@4=N}3YaoLrA#
z&{n)921-rIUT9G92;;$5KBTXpZfON%Kosq#X-|M#ash5IZYH$gQ#SFe^ZEe^KanDJ
z7(w{9@0(O<?s-QmRa56qP&W}AXUc%$>D=f?N#$FVx9~b5bE=qVK)bYj!ezSIthl*_
zxpTN?mzq1FE7E-Bm2%l83jtk!EmfT0v<fM=36%Cj-QSb^r<_&vI;Xe!_JENS(3<C7
z2iVBj9`=YWJ;$2g-XVfra^OVpF#gY@n7O_z8S+EVa0YJSy@5>qP}#X;^X(xWQ)JgZ
zUYQ|YLQd3NDmsb?V>AnQI2-~off9TvP=7?g*rWpuTY|(V;yB3lj{Y3T{b`um)54<&
z=g)v;7~2D&QE);-*Xo1I)!~Vwan;V%RvlG=e-k+i3gJ^=)GDSjUScJCFwu*|L#`;K
z5Bptbw95gw7xi!0^pGse@zI~k`@fs&eE*{a`q#?)&-}?$_-BJ{Q$vNn%uSSUCTvJP
zJA!y9P|u(I;dg(y?EI>@>&VrF$&dGo>8sI0FYZn>Lslva3keYm*G1O2*D>d^Zny6T
z&<(~W3r0(K2&h{44PBo{Aau}C6-%4Pu-r8dx4zlMst`TZW23z2zE{ZxDgDGq{amwx
z5ow>ft!!o_#Ue2v+8&@`!uq(7I>p_^QzfRoJXk74R@lIrfAO1?y}9(^L(FkhxnnG9
z>K<q)Lt<4IR#={#&AD>4#i(%aP1k#?FU1tlbRhbzVHjin7S=$3y>di(mTmEALIaKd
zi58#mGz%ZP%Tckh*>$$0veH@_Lm$cNu**fkTh6B#$9L0xpJ}1{6=v#cTpdmsMWcA2
zD1B26c77;X-#(gb&E(C~ouZ*y7Bd@K7KOOA^Y=7sXmOkAtna#^o8zkg35!E+sQzOf
z@!R5r{JLISrq~))e$WkqbDMq;FvkgoHxpN#MuB~tQ;6EiE65WwmoE6PYc0X_uUnu$
z!dd~mtHe}@W1zl7>ZGe4Ul*Jf%oBU-_>&-ot58_yh#Lw0mdmvWZki`MAi1`2SCU%9
z2bDdJn6CcUG*`P8J4Swg4ZeQ;tG;_79ingi36Ebtp+fT?g~|VUApec-hUlY%8|uE7
z&0pfbA$FqQIor!q9OSpNmoPM=v_>$X;7Jeov1AA(_s8aDyYd+x_I9m)-UrtUrSFf?
z7dlbtLs+U;bbx2=mCR-Ixv832rJ$$T?}JFC%nGlkPdXlywl{ZzE9h|W()HAQU;Ge5
zv%XSL*G2O-4gtiKx-3{jDI^%+)UR|19}1v*?|vyKm0Bx*Iu#a|O=-%{HWtEjngM$3
zQ#=8_0=2L}iu>CY(ED!+<=d%US?ec9u>P~b)4%kdC5_$xy8|cgWbB~tWcv^1bLBsK
z(VA){l++=6pJ8(-PLN@&HWdC2mAMi^*+G{D+GeijF5zg1``bOdJK?Y~C1}zQn?L5S
z_3eCux4C*?jHXi=_nFsPChzal&)L7;ZixKJY=qzheHq1B4;?7|^INEsIpH@3|1hB|
z(IfFQz7=|?4T=G!aS7SW4gw<QQ{XU=jqgDVu0J5??t1uXvUBfq6eVKn{jdS1Aw&n!
z0$50))<>t@F*sz@lI%nvJG4-YgWBht;^*B%G2L^I-ItmqJ+`#q+w?U9n@UizKV$J=
z=TlP8D}v(mL8Ql>g3;%0HBjC`WhNK{MK3G#LS)jDQYwF7<o5}*D1j5pEN=Q)oJCxq
zV>Ril<f%*Kw?m?7c7OY|)9<iL!$qNg8J8ZpyQ@^RDc!p50r&4lj7J`#>#RZ8xF?J5
zhbDiY2+!)kXR#75;m9hf7osKy?!uIynf~f;|KUlsad~#u1*<4JoF+=*2I!Ci2|5l<
zsBcuVCEV~?=XDkBr2%`F_XT(=S*OGhpttX#8Ehml+pI=8NMl&C`Ncb&&dmf7VyxL~
z^%HU}(fNrZD`FJU=kNZQ$YyMZvRj%`i1<|Ch4ncMt3b;;NUzV)g^LGOVPi~JI20<?
zVXGR=6N|@JludBjs|%;;pYY`m!mVrLVqC8Q&~I#AgVAYKmis9=+o0uXU0uk|;z13<
z-h}90{CSk9dEoDP@=~we<nXZCQ~rxOcE`K~qsv^FP_Up4ieGSYkD1T976vzoC5`Pp
zw|=lQyCuevCf3F+Tnx-X-XI#E4o2MG%_sik#qSoNgrD)W(bqsTuaQ&j1y5aL7sPbz
z&6+_Bo<-D8CmZJ{Kr}A~s-G3#*wLQw5pl!HRIy7W9_DMI^wuSy9f0u_QgatNfkf++
zf4t|m%e2i6<{J8i3D;&99h*kN6~b^V49pRyJNxiE*eAzbA2C+fCq?jleS{k4by==f
z!f9JY_@pF^FCo96!@ufRH*JsY+waIfu-|J{ak7uz$a3+pNbRy8dw$(8N1M!W-ARwy
zQ>_%oK|SaTrB^-d(q2ZC3r4R0tF^0u%4+G_bc1w<G?Eh1-QC^Y-JwXMGy>8{3kV2;
zl+vxzARr)(bVw`lAFg^4J{7P2XDxj3UCe%V&7L@W_ViGUw56>}Od7wSq4xAE&Z}B>
z5s@6ecVF=CjcZx)A0K>nktAfnhC7a$Y-fCsS^g-CXWC$z;bQdC!IFWQ0HRhr5WN_F
zD|&rdjDGE?adHBZ-~z~n5i7;_??W5~RF6chgs?#pBilDh$;Nb+a2YU{h4D02dTfI7
zN~Q=iLSO6SRl(BHQQ07RY6p^AMP;w6FW%v=Az}kFG-O>=5u!D3Sxly`7Oy9bKK9W+
zA-yHoaY3l7K&ggtJTGpafX&rP-78*L>ODl7*ivR}3}w^Qx=6BYp<556Xn?ng)nl@t
zxsju;Ez_;ft9$i~)I|QQ`xP>E&Fy#Q?Q6ZH67WxV=&pxdlUR8Dyic%-Tb@tH2`nRc
zLSB?z;9<o;xyk(=1+u|rjizHB+}mImHRT?+OPvqAW%vxFUNQcbroJXz$tZqJvr^O1
z!V^LFrqqL%@`-&oqW@e|f!LQmJ%{DSJ$cDeP-@}pKCxY<WKuB`RT`GZH%`E8*JPCD
zJ?7pM&DLUR_N=xc3(~7!_ee`kWM<w-+)pXnSU)-3*tiRV?N3J<Vn@auH!Y;ulo<x{
z(0IpCqlp;@RkcN5MVkq=E~d}-7VHWWXDKl;O%`6VVabrM)(B?vMytJ8lN0O<1fJK>
zu7?e}&*_REj`Iq+H{PterKL=Rp_?%cH!Mw4-!#NsNtTB&oc8gN&a7V--4JVMiqWF0
zO%JWoY+Q&1V*ufDkgXi!0W{PwHdvWy+CWXY?PiX@VuTjXEj2PJxp#wuGf9?9hT6<f
ztA4WU!2`L^ALp5<(AejsFUXV?qVL3O_enQb?4&3@i}jF&e*3&N4!xQ>-y|s9cc<=?
zqKdPU#?55}B^M*ojX@4>+JbqeN=3gwIW}xht)>_^CE=Uz-d%&to}0Sa(z6jGkmi-M
z^{gmaRK%~X#9gqv>lN%%7pcnD5wt9EX6>x}AaS!w_Y?lC7UQSv6*Nf2Ug(`Jf+&*i
zBb_DA7{nIb;(6(~9tuG()O@uWdQWn_TDFFCd8<t*7y=iJCGZ~*dw`~UeiAa3fzRd1
zNKrv+TGoE9vIZHK?4%dN8GTctg9}x-nMj<=TwJBjl29G@ZeeJi+mOIUh<Wf9u`P3p
z=3uIqX{ml>##0r<8Vgh!ID^}8fh?`?`3g|?x;(M>8Br)S2>0*Zq^Q_nmeyt=H9Un6
z%6~mCn2z2|)d<rp%n?T9z=Z%$_6!y{M~@^M8(L8MnD)b7wWtfz(!9ADArV5plrm5A
zZVdQP&d1nWN+oX|jkik<FTJR4oO%n}-1YVo#rog~zf~NXQ!;p3n(})x&y`n=c&)Yz
zg>4=+LP-MjBw_UT3N1<UkM~7I^YXZ(d5aAElMx7R>FCTn%@f|I_jQd(y;X}w=BL?L
zxQWIz8&6Rdq~O^aDtIKQFNnDG2_?KP{UHQgqdUgxEtlH|3B@yk3er%UH0%5@31u*k
z(jRGxG9BO)A%k!{gUSnUi=28*QLsrw97Rw-!hQsmVcO!r*>(q-aecjjIOP2YZ2OnO
zjN)aykK-GVkFaySeHPX^)8djo*m&N>z>an|T!c|%<<%C^!dUC#joEp}IWfhoxQ_#z
zl8fkpImCBS71KPx&k|4ZFB)GxL@7<(ZrGC==bH6+Nj`=YY*Pg}M%m&z3O;7~h<7X2
z@eY%0g=^y{nAA~-Cz&)j?1FBt_iCbzZ(_QKTGd}?!iR71tIueUY<9cb>St%K^Jbh~
z*o<jz$1r17Xk1qE@rQTg)fKY|&PPeFm;@dtU$;YxExoU~_aQG;3>U|eS>5R!$1|80
zlLkB@w|!MUId=9&xKOVlzN?f`=W1EyAYRjldQol^x@Uv-7O9ThD@Zf$#0@_zDrW4I
zH13HQwOXVy(zQix?~i2>=!A;KnE|&Xy!;PB7KqZ<h=j?dr4Bwp-LdK_@9CW<sOhCJ
zP;DZGbwVY40Z$dbt#8;j3Gp8Lh*Fjl%YNX{2Ogzb_hm5er^RKFuy?$claE*EEcsyW
z%dzhdPqSW5EhsOVan=R)5y*jkg#Vga@HM+UNBQf@r|3nAj(ICEksB<i1%cPCreY*E
zm)7hxyxI47$+aIz!<F@gR?{R?w7bV9^g&jbSw-N3MIkJ<(L`V}ci3%Gm-uD%gG@7f
z(_Ze{K3#&lz5(t$H+5c8USa|>V~;+8{nPRufBpc3p&B?Yq-#}U7_#w|ad0Wc+{h76
zj@V=A%=nyf@2}zdqqvzUYZ@mIHLg706rq|BRpR1}uL~Q-;eDv&5s{{C5>H&2@y;(p
zKE7^bd970TAgz+SUwSC+b*&DeWGx|;Qw=uM%@MoXa8=V=RH)ew{n_f8nIAt2<Y}Xb
zXwBSYokliPs-m`LgS1<ra8p##;auaEgs<>&=naj*#GbxBw>(OU8X}1-X4HP2bIWqO
z7ejzHjwtc^19h$fteTm)!HG0(*ou<&YwW50xf`C;54;}}&lFoGt3SKn7->CcFoqp$
zVgwdx!`-Na*0b}T358*(?<Ts-&GuRE_EFn4*mz4#smX-O<5`u#;U>ML`7+j|>86`6
zV%{?jDuOT>FHtbFyxN73qW+M;*1mErBTQ_zFaJp_-Adm~aq+YAD)IK}V<NKR+XdI8
zCWwvdg#vCwCZRUJ{>X>iuZ{SoKg+z|gh(h?eQ*cNoT9j|IWgldSWLOrb&J5a8cFa&
z?@-iT)?pHt)r|QlgG6u<nIKdh(nPtU;S-w37c1wOiXX%Ud5B%Vlfck1SVzpZPw?Rx
zc9XiqI2O7ak8jdi^0oAG7Jn=71K{ejw`>yK(F>Rw-4yJV4wQWpBCuEzNf`v-$975O
zYU0eIK)UYnrA!#{(a5KtfZ_QFn9vCPw_%#>%Th$%(ay-k$?4l#Bvr0avY8+Gj+T+y
zz{uOZg;9T!sws&@g#s8$a=-mGQ*lK@)>T`{`YwN>DY5%X0o3lL-6jiNZ@ic{j%Rlf
z8z9?Ip;5tob!Q8SH;BwJ$b!HmB-?6e$y1|^+_i@1imBIlJbaJlx@)wTM3eoLGnlRL
zRAQwR+uk<DIql92A~NQ%>@RJ}6?GCNl(DkdB&6`dw+zH(P{da3p$3ONvR*;ZiN^r0
z=%(z(^2{d}1&diD@;drxJNjWt&8$jQHLlp{Rx(2egzxK9Pb}=aPi(?qQ*4MwAR#!_
zgYQAOUdP#B3}N5cCR?cSCv-<4EhZrwX+@Dk+j+4?d2#-;M$i>j1Ss?!@bi1?-0$CA
zibU|hHSr?{PPQys8WO{#5wp^dXj+kH!;Glgyd~!=97SepEYolYxkhe_{-)vPHsba|
z*UGUw*oa~GO(Pf>Y22c?@&FZQ-iMiJ+M(35_iaL`$BN_v^zX(OR0QVRB4o_Al!q$^
z3(^Q;S60}r<*K2Q3>a$<3ND0vm@$;@+RjwcPI@&Vb@+U`{!WwOX2`(D&y9}Q$8?ks
ze!;vQlwng`gj~xjE#ywM!Asw*Z{tjxg!4pBAGrfUX9GV#tzi%u5oG~730YAFSvh41
zA(1<8O3zmCKGVl(Gom`Ebh5UJ$oqSc{d3kxlJsQpP?NeS+Nme=v%%0C0a;{pFB_;1
zRoufvnwM@ja-hC{w*5%@<KQ~nw2mg6tOjZWezhSzYLeRATqS0n;c*{7m(7D4=}HHu
zO+w0DYF1twZFdr*pS{Zh_4)43vNI<8;kq-~5kNq^0iOq|Hk~smEFv!=D=Z=_Bq5?S
zucf6l%Zcj!L2c3CNw@q9SKFIdwV=Kn^3zNKiK*}6l|qWhkSgYlZV<nDd$Mn%V=d+S
z0QM%@!m47RkrZ8Zz3P(}Ll3m*=4ub@o(>7RaY%TOWkQM&cbVh%v)rQ+vkq%vnW1_;
z8U_VM4cQe#l|zjCf!0@&I+@Z~Kq*!L^fe_}c%8uUVyXLe62p9kC2x>2uX-s8Id$rf
z=H-=@Zcx_xJ9rX(+_iF}Ld!3AD|rIAonu29>1v2M9&>yWS4u)_blfqe77o%X3J!*T
zlugoo$Nb$pv1P)Qm9V+0mHO>T0WlcFOrP8oLuc+)tZU{ne5J*Ek4f(DW8j%7;1c#w
zjwf%u9Ok7R3(vR>G0$tZ&7E#!E^{0Ur%$ebpU=P=Rf$iZco$x=@P+)U%Td~CqsfM?
zWS@d4*a6NnfN2go>Qmpk-V}dx`C+?ZcQrce1rIBGZ%5Bt+`B#_SW+3jw^L#u8yB+-
z5!wWId0$D;t3Zv2Vae;c7GX08kQEi)ie9U&GS7TOP*MP~@}z0Rci+to4s|e4I~syT
z6qNQ;M>>04t0jM~akq+|Nof#qt~GCB2=v2Ar~1O?q13lebd{d;;x)^(!a66E#CnlR
z_E+CCj7Sox(#Uj~O`UwNRC~mG#Ii^f&z^hK!;X&=K74KDXfbF*06Vqfb^20P#-Xa&
zHRo(XXvE!kNe6^+2WNDj9EEMP)cNa3Hz;-%W)`jMR~?{Qcpp91G;+l325(f!hoIVM
zqf3ByUg~kLh%|2Pr!=V_sTs28q<?@$PCxh<9=D7(Bj3k@DAT(oM<<cyJzRpF?5E>K
zUQK67otcJdGK!Q)Qr=ymnoW#)$8OVYltmRO)@Ve_gXbA>Zv;-o*gi})(}?@ZC?hhW
z$LA>z3q>n|nR;Y?ie=thnFmRH_?m35nxsKHDo5;DN4c%n^BcQc%VnjJj5P}UZ(6*a
zDf`G$Uoe|LXqD8{!O_#x)z!;5S&AQM-0hlQ8Y18XO01t8Zm#bg9c)i<DC3$i7;8(9
z$=-Q~nF-r;Qx!|A1l(Rcj?BotAXzagleTREZgK1c`n0)Gb$bKmHr#=5_UFM_D(uh=
z(hTD}u~Bvn+a+pS{E!>4kM?h8E_Z;dja!AQq>xTYT7@eT4S`_sh16HNq~2;<Z)NRH
zuc-ll%>I&#A38l3y13i`H5*MJhL%FmrJP}+V9yr&9Tld^i(H5y8t~Vg)*tVSJww`^
zI<DMS7KYVXlUW;n@+98Mu@#18vW)S@`Ub+j9LU1kq~aI52JTF4H(+sIzvQn$IcDYI
zt-;qCGSzhr^{^2=vP}{*QmB*46SIolU(<D=gWBLE+<nJ`ppdAzw<lZs;Way(+tcPx
zY;Ef|AR4Qg)lF^`uan(P_d3ZSEP|&NZke7a(7_>mBh5%}C%VlNW?@qISvcy62l4=i
z7}l|sNlTeK?|nsR0T|<aiYLZq7=|mlnjUtT&lK-Zt}6$zl2G3EWQuUq%uB4&k||Lb
z--~1j?g@?#&I*WUcf~7y1mzGbfbY~-Z-oRk6jzX}K{bqzDyjfSK7#8jiml?T0|n1%
zsL66vd5n8V^h}>Vi~fD4#ws1B4PUZp(!|qK%VOe|qqK%h*;{(v1APx%vmR(Eg~m^Y
zG_=k4+Yg(;`rgRVZE~b|e53iJ9`vlu=^Yec56H!j8Hcrc`h@p^Nw|9fecB`L66U5x
zghq6UXH*aOBumJyGkM#;!f*>d$(USPA6*NBFJdd>c`^YvBfjZj!mn;$7@7<kPv5>E
z7R(Y%^U&TZzMpBKzqTe5d#jfwrBE@61X@0R2M3(%$pDB+dx>Oe%gR9kZ<ko4Wp^aZ
zrel(<vHA@)$j6=cf`}0fpa>qNdlS-`*X6d$yBZcpI4Oy!>%~=c45>H?;^@}&^xk`X
z&p>otf?iw+vtH;}0>5*&7fQ2i#9AVRUtvW%P}5g6&|M^&l`K4QrmaE_!R(RW_*yUR
z^{^ftp6n@+P>}@3JYt&1N1oSvaQLOLpsgKm(s<n)g>ZJ5*&%_|M5gwe8sd37%v26y
zR?QW!htPmC?YB^}S2)L`D&mL;_e!2Nz~2jpLIo~2jR5|IUT6jjENeYA^(IVY`1Enf
z?09c$pW22iRas^zOm6v}nvi-o6fK71`n~qBI*7?ITBkZM`*7n1pO4gLxWf}i$&KyF
zjdjP}5udvmiL@9XFp6~vux|J>^$W(6y@oTW(-uJ;8l$1^9432ck&A%POBF)8b#GBo
zoY6<*Na1>X<LUhe=$H$Bt?kwp988#zBhQ}EwbSNMvcVW}(H&R$y}SkX>iWQvrcRiP
zf`QN&HKeB(Cp!;rS~1JVP)bdYR8*>T?PuEQMjxrzm=Cxyea2msV3ET*vP{?wKo9Q)
zu~g1RiXt=I-x+$XKiJ5YJdv#0)Z~G{oiACW(aNQa!m8GrSBaeDJXtKPqeiK}I+~bg
zgNgS3Q{fbvDu$-(qrjB=j>$7PGXsH4)pe?_1$%due27ACvyISI%a+8X&Xl4Dy+g=f
z!RzrE?Uu}6Do`{|)f8074eWzQ+d%A2ZTYM^{@VH#6hy8kU5KXh=VvWNDW{AW<_Ue*
zam7F_Y$;f7I!uT%)xi0y*kCpX>sC&9PcFH#wlDF4!@fMaPikCxEtSX{I)m)c;*%=&
z{z~-R<|F61B<I`^qQwqfZzP6^a)fD#aAe83Iu^N-Bev=U%k9scl)mrpb*a}G);25G
ze)Obp0BIH_auKa*$%m)$!Hgcg|1kMx4Z2obtLmVogl2ag>Krxk2|-f;ALm_XY29{M
zCR-?-onApow#PjyT~ItULdA&b<b9gG%(`pTCnEe#G^QgUS}{eC#ZOMDdkvyp!)({+
z-p=P)Y8k{@4JgD0siOkd6t=cOxgMvfO|rTQd?X6}m_MO%GpBaY;EARNRiqmQ&-?ha
zTZ@Vy5r|`z758XRly%d!^i-@{>btq?_@TyJG;o{bhK2YNYT7pB+A`j-r!<Z>2BN;r
zaU@uOug}e!pkhV8dv9n+a;t9pn$CUgv`HSiJBh+gGJ2|Eh{b$_+U}|JoI#~gFAlYy
zJyzsPUGFg}joK&>nJ*wq?)-dfAoSBn;5eFOd!KP}>QJuf-Op}N1F>@?xH0@G!8eV>
zJd^jjJ|CZ|YqU44qdcV_`sm@USvZf6=w2Y7<e?n%`5uMxLPv!}RZ!4dFkSR%QD2Vs
zZU3o))a*u9>HZPJ$hMIhoob$Kl85vxL^-B9eML;eS}PdXgw~T*5YP-GOCE`sLw;6t
z5QaG`RuIDT3)9p+GU&|dw#43!6ouG$9-mgqI2#sPUeM&;@d)X{y^m-%vC6)491#g0
zOJQSQ2a7^mZexWZHd4p-fmb)!0kdD9bmc(<<MYk12}))<U08m6B^Ra$yC8)J9Qv;m
zUlv$*5vLOfaMiyLpfbdax@WMW+HUuhnGoDCke+5FBICWGHm~7_X_drbFPr3p9!Hpz
zkE)%|Yz;{J$*WGRF=cm+>R)h#xIpkW$uMU(ZISKSK3`$rT~+@~7XfXY)^z6ty;$JB
zbC#Ov-RQCq8+UcLb(yyYE*fYJ8*d^gG-MCm!t!z*Anjo2zzg!d`OC|}g~~ziWiQ6T
zRt`7VOSCwn1wkyc8AYMDWv^rUeNq&YomnV;jb5-CF;@nyGz>}t@1S5Li|pdcgS6m8
zy8FCAbp3hNFi3CQ(~#+bl!Zu3$v2$=wrzs(Eb9Kks<qS+Qq(Y56{K3Q#5X$LPEoDs
zVZ4c2n0E8Amv{yhP~fElGC+0MUB2+Zm;gj$A0EW~^TZw|6}QB&&x{rNt!H`=L-tzg
z&2$ckcM<h#=^S<|{D(e6%G=DEuryoUmO&e3m~vyOAVyp`bd4BXz^6^IuDqovbZY67
z{MhWVFStBmK;TqD&c~?0$2~1Ii}-NbTpV6gZY!bw?;?&0HilUO4ZF9J@iby6X({Jk
zyu={fAy2uJ!3aMGTJ2XBc%$SnYnj`X<}o9=BM2NWaPvnEsJ(*!YrMk@S+n3L)Li*q
zhTZF<FG$}W^4<|7H?vk~e3aMz2@ASgfM3RMH&SfIu;(NB7+#M^b+(7Fp~Px#{u9TT
zT*6$k*M!M>Zqg)-sgZ++5EHqr7&(-cCkT!MiNy`7cU<PGYhR-T)t`R15q`GRC@-H0
zvH=!Cy0jo5$lotDq=D6ttdhutrk2xK8;1A((-Fs9B@3wBR2B=o5BE#wf{KJJgsW{S
zVuHy_u2({pbkJW9Io{=oul3C~cKf6%lY2{Ib+qim$JC9KR`{*zdJ%@#si+$+`J9h+
zxwq%|VqM-j&#WA-2p-9}W|c#40oOB5a#x$PAsi>GJ>C0!tliJ*WK{VS_x-C{Pha?r
zryJgyp{>i3FKab)=r-cno;-JVbE4!ed<yU2cE`)jX?L2=Tc=k^?e*^Q#K+)k2Z*o-
zmh{ESMO-{Sj<zRFR?OR!NwAeMGNhX1T#OA8&?Z`x#%lNboiU!GJ})(NE<je@1?~X4
zuZN|J&Z}j?re2G)<wcyDXD_94ze}J{rBkpD?C@>aSn7dn0xqw)dv~nSlbUID1|vCi
z8kxH#o-gnMM%`Y)$-`nU1N%Yjt6@W@xyLV4oM?hl@gb~ho}*&AYI$~y5i9ZAn2;7r
z?A%8+3Wg?Ii;E(cfYtQDnbKfrYXH|3D8jcxgd!-kfFmw~Bf?dzX&9KQG*WAP)?Z$<
z^pfl}ePqR;-O;_d+TDzHsV}mE4VkgCpl`ZSjG4hiRLo&UQ17E+nOX(NzQ!ghbq3do
z8!;5AB&=-vhnMIl_i6P-Xt=6X7P-fV!{@tcRPP<j_H8wOE`LfkT5Yn2n+2P#ZQdI&
zKI>W|Ch@$uF{$<$c}U&C-tGp21sC{&K{K?#>=aZ9=O>VH<2ZO!fjLnZ{QyOO8p?iV
zo|xBnS!boS8#IsNQ*{<}gHL-YuGQcwJjC$0?GCz(Qa*%)0a~lIh7ZZzibDxnJx@0H
zwnPi|2F%hBp_rzfN>O3p{En3j=4pUdd2YWwi-ZOSe&LH}IGyNG0h!GH=FU$X+%~hA
zcT3tg9m|Q?bHG-D$4Lcr5%}Z-j-KwimhW&?<tX6X9=AS$%&;%0Sw%V`s4}vq;<L2U
zJSZLSL@Gyu1yktbxUCrElWB*0T8gPy;vufh$dexL;4CektZO8Qqu5R41Ro7z=JS*i
zy1Nfv_EW>+nmp16+b7g?t@exCMt52E24JaHr=|K6>|sdha(vdC7!AQnmX1m_H9(`d
z#V0ys&_41t=YaL0gO|ke#u~Q-`q=72xwALnX~7Y`HCE+7P}k}Z6wqRr0YgeBG#P8|
zE<sKwXiCAFi3zP!E7>L`Q!?6%?}Sr(wG3S{<Bw{N%*GV~GCunf@o1@*tU^HkHYow;
z(=6pe`)gs|_Xyjl$3w;gt4k)A9OHdK*{ytVK@u&7{3+B7(fuuMr@=sCva;R=LwVoj
z!Rle4J|F8ZPP`&D@`fcAy+qp11^PwGrz7-8aM_`l9fp?D!<V}^lWI4_Up~cEeXiJk
zq+DB$e#0D#Q}jupB~ldh1SQirR3_wXc1%$Xg6uGeaK%G%jQb={qWkhFB{N)N^J9i5
zNn$_a-hDgU&Vj7NI>vz8OR1fgqi_wKbqluDem+q=%6Npe9M1c$<ROVv2&lqj!&@02
z8#Y@IE^c>-_X2$)Nim+h80Mr+TWFdnFQw<#M=hh#FoaFs5zgMpYDl=1Fk|uYqsfr&
zo0&vYUCiOnFt1C*Z$<gM1;NHz3=FrXiU3M5xd`*Nb!+MAW6p`Hb;_GRZp<+z{`_KQ
zR1v+D2DrRHiW8J%848Py|Av63>N64yM`*UZxAVwIx7?J~1tkljPheg7RC!HlqOImd
z8JMsWg#}D)%JJ;S9i5fuE!rP$_78MK2Ds<5<hSbwENKXFHz{a8eccy^^q?|s4^8lq
z%qo>U{rpY^>1wdNFj2HYk0^=XkTvAqLz14M!&R2R$O9K{*N%oB#k6ssdy#8>xrWfM
ztkGN0<V8iqkt9ES&L(qSu0)x6+-M^OzPI2Q9>6BW2bKBs!^~6hC-!5NTL-=x_jfy5
zhuW7%O(~?x@Z&I)C8BY}uLt;ax6Itb0^tZy0&89w0(l5K4c$Ae>=mw^#XKSP@uff7
zY<qlz=PD*1rBhUwy}O1M<Rcd37tl&1QIC|JCQqk>GRFKcqKtUjwqX0o**Dq|3n6I)
z)Azyy_#-EMy0t9{?#$j7d0#ot!e_`BJs(5R6YJE3>Do)g3_T$UF>wmMq20hVbS*)@
z*Ac;&#MdtZFH@>-TZ4H6R0rCC>oEP^JY-;AB&$m}_q7loOAznFyPyhT1S5<n(A)}p
zg;DwxDF`wRyl^jG)!CIyun@4P7KV+?tI*)(H@-izc4aRif$an`n#%lFU1T+7Mm4nf
zANBT85m@S_9bm1azSAGLp@#x`;u9^bD)%}9D*)Ufc;yLQUJsdFLCU0IWHHQhTl$B$
zKJo1dB1OR&adMgfbxZ|p$xo!Of56)hRW2c^tZ1u_H@?X&Y9Tr&cZ9>uAVRutI|xqr
zTtSN-Ousb!_6&qr-T{7tf{cD^s@?OL6ef-j$PpMveW*&1jMA;S(b!Z<=Aw6U14K#Z
z6Bs;vRUBLz&_y-^)uZW=E8yx78nKGt`%B~svEk|xL%HJQf)>hMXs*GIMkv78%R}2T
zH)ffs6IMf5kXo~XbtvEyXZfD8m6FJfnbJ?oUDt@#BxVwaD^k6&=V*hwc)zL+deswc
zc{T9mTucPAh1MX}(5yH3c)|2N7)_4d@Co9RRD5N{JM8OZb&r{F_r{IAK2A|6SgdB1
z-2IRd(TeQ1C*X!nInGmK?G;dk-A{$4wNHm5RGPBXZBkLD0#2rm;_mdeNy!Zamw<?V
zjNLW?EVBVYYN&LxPv<(VDx8N>bTlknA8}4HA7143?nNhr{Db%eyezXQfmbR)Fm%c|
z+!RSM%-u-&_FEFNQoDoZL*#y{JN0%)y+dU~UB@5Sb`c(II9&HKdUwbZ<!JVKZZuAy
zAyJy6l=hGY4MS57mNyn-H{r(Ophg)9l|;(6sUdeB3e%>*B1~)i$SpLd_f2MNl8sPI
zxa0&CN{6xU86-yGT3}f3pbf?fv>UG&XGkblo4FkDLT^h%&3o_=@Wbj*rZhgNmciIm
zt}AWLk29I_9F2Xw4`o)UOk@Iw0<#D*s&Tznm0>F?crh92gH!KK(o%WUq~6a2ZU*=C
z9-DR_$;|m`EwRt}6O!C|+9992d@xSZ!YOUKU25G?&~#U8)$QZGH`x$FwSjeHXmAyE
z;9_%Tnci^icq%zHme$}lslw7AnCqv0T#=072>3zR`axx#!UNQci3!jnTkIzzH{~X@
zQf?_MQ{SuTFG+)K=$oQ(f`r@p(4o>6og(^{)1cZ6`!?=cYBiG~<ETz_evi3Qc6olj
z1~+$K$lO46f+X3s%I;hb3E!Bp0tt+WLOioh;T*;Y3Zul4e0&!$wBXxPT@m%LQ@XcG
z`Ps0DIP)?iQgR6mvJ{OzOQDv*KVY9^$1m%!PKXOHO~cnA&yD0Fyylp&Z1Ztx`|d!J
zN0Nx8DQGiELXGP5Ui9$OH$I^=YluVf;nTSY1FlkQ1P@UZyTav>QVP)ZAPjJg>9_D0
zUrd4CP-m<V)>>Sm+Puaqlt-!*S@7n1nhzUpOF>cj?WfM;UI#CH_(0L2z-Ev!nF)1p
z1iO82xVe%B1npy4O6xV<3bJ;v#l%6Bt&P3YkXUyL!+D6k5G>RS6)u#W%{o*tDY|AC
zS(47R3H#95@dmp0z|1a-CAccodxa*@S{lOv-<WqXht?grQ*3&O-LW7QgO5JUNOD=G
zZuVDT68YZs(?Mc{%`8X;XDb=>Fba*+_jxJaC6(ld(P_ZDucNpN=F*zc=n$I37e4#?
zz7vB(F=P~<R$;3n4;mls+t)5P*W2fhBiV@7TiaVFCJbHbU+RV?zc|u7Sx8kc^v1la
zS6u9~Ze9oK-LjM?McEfXqryrdgCp#uh-1GDB14}Tf=l8BSE~Q!^ZfDW6|llO&uZ=3
z7h1(9p2q{b%XP?nI+L}>DCOEmCAaHeo+b(|PJHxssa+U`#?)<Fn?b_gn?9bA9rvz!
zQLCZn^n8A-3Ca$W|CEq*$a#gIjd>hHwi%M!Ovdh3zZ0iDVqajS{1JIe*Fp}rjWZHE
z3L#s=bJu{??pV%>PK4!?w7A^TLy;8==GUeB@f^32C|X4}o{ujE(f7owdqDO+vc1lu
z4t0B|w*<{OK$Z*dc|bBPOgb?Uj<e;MxxQXb4egAL8zatqTW62i?6Kc{u<jkaT4F)5
zS2hlWC>wN555envrxsn0ZhPPKlzjz_Wa9T4R|fC%eUNI^3P-Ng>DlE?Nq$d`hF!Rg
zw`F5y;#AiwZpZmR1ew*i6MqPV3TR8jTMVDIeC@WN-fC3TrZqb&8ehmE8isR-H{oGs
zT9qV+K%Z8Nc1?4-6t5Zd#jV_TD-@3iy!|q*Kk3fIB6Bp}3mH~q%bK?&OZHqvt<E5E
zUywn53E42!wbqD71MTbYa1Y|x9r=Svi$P5xA&bs~w<o)Z9UyiAn~wfiJvf^a+O(2|
zu;zh2FRQjmFqUa=@UBPI$PMfsJlh;ysb~;<oyy1!nIrLG?pdm7al@FUh&!0P81AcL
z<=gkt+S3V-k<i@=);5P8upk?iu?n~uu!cVpM6vS-Eb)I6|5S5|JAt9Dz8_S;$bTzq
z)QGaB$e0Ctlo>6hTg<N9S;iF)m7}sAGlRJJ6xWxfk!ndwnsOqOOjs@^lNRAf8rzFP
zhnd%?$*C^JX4yPha;Ud>6HdQvD6y})oQ_!G`fy*!jZ`tK0h`1}_}~;mY7t%3*Azkr
zs<#%VIz>5qLFuBbaXXKAL)*ewMRnMHo~o&-tM$}EiPbaM+D%8#^G&A1TWEsZ^W1#O
zF!ywQ+AjMQ?|bsb$zv>fsv|4(0Sm$OA*g3<f_S_E`J&OBhni1v9l_ur$hiDZpzLm7
z2rsP0>zSb(m@`d3nIMKh%P83R^i0;1p1xRmKlq@I|6!KE!3I5~%gSIbXF3)#EdN#(
zVI5b;D_2*x(CQf^pAV5+B(X`Jsv!1Yn5;+)c?TAJl!#QaJ&xo2vBXC66`|&)L?I4U
zniF|Duiz%4pk(mrQ+yp+szede42lxo-~JS@CocoTO|UUrJKt1mh2N7+99%w+WLL<6
z-ET6(0DbUT<w(%eKTWK%u)cqmSG=ImEGbAXx*nODe$)4$dwzbOie!fr212{;=zzP|
zed3TcHl0yaIh&ot<=9kcfQbWVG?4$H5X^p0t>Y)cs?U~pj!~6!$@Q9gAI%V+q%xz&
zgb&J2+BZr0ANfAUJ~$Px2j8M7D~e0XL-tECM{GgBFB*$Lw&#aUr*&?(+dH(7m|0oh
z(cZ>7&(#0{c?c}cfz|lG&xKc&HU_RHw*RL7{qG+W0nPq>ulePV?HtYM4eSk!%uVRc
zEnH20(HPhVxT^6P1v(=;TW3cL!+$BkfEd?Q2;<oo*=+*p!(l*P-C2Ig6>MCfWN&9f
z?`~t=r8ed`#)^DrSk0Q_j=7nzV#ViP2Xj;9u|)G_aU3P!#R!bHguR3hB)?8>%-tS8
zEDbEJ=gi&Zo~5NDuSfmMkkM;(=5c7{@iOKD-B5S9`8!pp=T@6u;MH&6-F@Gdx(GeF
z0WuB~(XS7eBdy?{O1cBzwj@_J5XiTzY>&G$MfqXsme%_oJE~^QxTvyQ;wBj^;x__(
z^E;63TT4r>li9~%CF(yQr439~Gf5%?%{ds)mPdQtksN(j;K@=2(akYpwf#gbt(4$t
z$7%%iz+3X3n5NGk!P-H^4>!-fqkKs}BWr4oV1D!hJqb6p&=(^hs0`NN%?N5N6zhPv
zmnuiO6di5!5M)-aYbT51LuKqIt=Z*V%FjpgW^JHJLM1jo!6Kku^IoC!&z9ogMQqy;
zb2B;^tgl_W*6zOoAzEqAfvbzy=HC~kx{=OzgiwDiNKJ>IB;?gpK`#=YFM5QqG=UK;
z(9QA-Koh~L6X0a;bZT^(!Z3>{MhzmN=Q$aSE8zOYD?XH-Yhu#UKuUt+tdnE9QC=fv
z>nlv&6*z4Fs0{Lz%^qI_Z$Lc+52}`v_)flHx0<MR(|9ypPl3;X?grV>cC(Bij^hHn
zMd3iZYZc0e+;_DKF9i?7K5oXgzsi2z_w)!%(~A)!)G^NRtvPq`UZ#EfLJFLsZ@B{`
zeUgO=rawz=xlLfTx;}aZ-gCSGRcV|`<GjjBgy&++*$xcs9PqN4_FMANuD47Fcft6W
zxFY?p(<&1p=O|Q_P4@=Uz3ev7cto=lP_37yIdiuZ2bQ$X>|Ks|ysgqR{MmhUq<9;5
zlLG1}GI1%@bc?7OP5}XPuqR+evE)?>?&d|Mwjx|YA&?LD$nJKCsi9dX`@REzNH_W}
z$}}905mzb8`IhxN*!bKUgL-(CtVLW%%4r!%fz+Gz7(1p`N)`lcGhPd@e1fdSQAC70
zZxv1t<dpc#Y4!FZdB)qf3qTN2bx!R9oT8J}#70v1ma$U#?%2&cbMzzRtZPT|Pi}LE
zc(?DnsM{);S}vF#XvnkG^?;4J*GB;<{oddk`#I=)=4j#V3X5?)N<N(fz0BZ7(2Q^^
zr$ja;R<+gTc{uwm77T@^)t$8BuW3~ZHkVYfu8kmV)KDh4I|dXKLchUBP}E7GIbJN>
z*cx<ywD;g~rz1*%y<QV-Xc7I54b#Kk4qq6g(I+-z0R#pcJM|_#H%=jsQVDW4@D4vm
zATETZJ+$at3g_RDw9qSv-{{|{Up6T-7j;k!DZC?(qhcAMw$uGVb?KHt>dj9Mq&xAw
zKsnkn$Sj3PrvjmgTT%Gep4x@9<I|`kM#)0<>bN+3pgB)W=`?B*ZuWN|vjGB!1MtK7
z2LZ(1(GEBt>1<)*gwfdu=YJCcq}QvCWXdBa-`{u2<|aN2;U0LY={qTE4Z78hiHOKy
zZsFFd4!><IvEt0$FpVt_s!u-XHE-RVo0m?Srk3DTR%4#+y;oR)QGMOcOM9ncXsg=+
ztulwYLA5{G=x}H<KH<4~(lms;6et)vI2^$6|Au3rkjd8v8D#A&i|ITno;QO5IopjT
z(62@40RA8m$ag~j+UV>f+m{mf_upVDBOogwDx#!JFC%(R0R*H2(BF5BKp_78&lxZ=
zKob}SfPj6@>g5*j7g`8?ZSiH`_+>-X3k|I=HZ(UdwzISUWqYg(?Nff({_DW?bMDW^
zy)Vz1ynR0>az1wboEYghTEMpWng9}zGkK=Gud4Ve(LWr0Bj#dk`b!aME)bOaJ%Znk
z|NhEo)E5XRTus>2!rH{t&e6ue`Iiiz<;8z_&NMLfpXeGIIGM1r{F3}dwi$p)!xaiP
z`%?1sDm^oO<N6=Nx&0@)zoTxl3-sHsrtjixVf{-@oE_Ep@|-Ce7U;HDXyPA=&S$st
z8J^wF_e<h8zR~_i9jaFow=r=xZ~_i*olO#d-aRhzocdqLn%mh~{hH^CDqg;txQnsD
zuRGt_toF-urea#4=<Zc2KBM?M?m+R4@IU%>y_&L<i6c;~_m|4OXtSE%rs>4|OR^W$
z3JRovU#SG3;_$D$;G$ZEev{(wsFmb`TEnlV44g##6{i=YP1kSH`(Hj`{uyK866^W*
zfPUZqwH(klR<nIEfN!qU>A7F22m{5N|EcE;oFWynvo$hs{&KqLT#_${N`>3w%0K|7
z0J?(EH#p!k+t&sdvi}HCz|qmbL&?SF$AEc#j8D{o0n`Qn6}SLMg7k-gqW_FIKZcXk
z6t@WlpjrX>L@&UN0xRD?k`Ji3<YMG3?`UEB6P=Rqm$;P!n4<vZSuKulPquFdFq`=!
zOaWu#AIsO;Ze&jiU^N0>!}bjb_{{bl0rGo)0}JpJMLRbo69*R)+i%5VE;~s911rcT
z5G!~PKtS{_fXHKC3G&^X;Op2e_|FioY~t?xCH4D@{m8z2{Uy0x&Cvol>HA}mZyUKh
z=m)sr01PO7ftxzQzv4#17O1E7D`#>)w(9f(IyfhQY;*zn6A%HfQ1-9%@E!73HAq|7
zntV~k_eBRU8|Iz2I~)^Wc@$tJAajACJz!G!7Yu!Gm>*k#6I6jo7g&}^0v;fF0Xha4
z!LNY+Zo>5~6n;Gg3Am%M;P)p2E{jYcS>aR%bgX6oTkHa^2C%BT0{8Ot=KLPwGlw%U
zwEkK>=`v{D&`K5~pl4mc__7y3Zv(ZO{u=Zj^M5O_dKtSo+B0?n*!02#7HSu5j3e~_
zfc-072l1Rx7y+@m1PCw}gQtPQUqF8m_v@jr%i?l|qtb=}cqf33FN#a0^cR@l_oZ)@
zHZDV_9TJ|Z0)tR5@t0zJ-wd7YOQiZ2(Eo`0<4%-40?%!Z0vr@(xg5%djsF7in=}6c
z*=bg}XAcOX<iLm|^-WOVGut=c^)HaWsQFhub)8MW(+RNeDq!D>z%Y0I8u(Yk21+=D
zxG~_&&jIj@fn)COUjToR^=B?cuI#A93QVL<alTx{@lDg&zD2%&4f<oBx?kn2FbBk{
zBw)tK@eS`E%Zi5o5#*0ya4hnxtN?Wm0P4KhPkp2R2<FRu@ZUSo&y-3iQ+ERufQ<lL
zNb`b(YB7He{Utb^4-68v#wPARRso7)KaC}zf^LA9ixZLQxc?_Uei9_X_p10*fZmG*
z#Ey%D@ALToALJiPJW}i8Zwp8~0Z1%&LGzBOe}VfgUj4c+-FsP1c;;~<0Qkj0Smx3H
z1NbjTC8>&MJ!(J}YdO{Bs703X7uesAN<Y>(tWm*i9U!n9U~T>j8lNk@68L<><;%{9
zvWb(kilVeQuubw~v<^tE417Sv-~gh)1vHT7SEBv1Rq|~*ry?xy?+VnyR`$nmPZYg6
z34RFYRq^L=2JSzB^J-^Kmj~eH0qKCb#J^9r?|$W<!vWHfok=SL{37CNVC`~VpXl-g
zqfcHoxeEZ#2aI`<fvW01XTZRnOvb<+Se07Xnq9#FLK95X83S(t1{D95|7_IXd->-K
zSlE&Q!Yi5EIXYi~AK4q)A_k02nLrIF^$YmS4S$R;;A-}d0lsc{0RO#$3&I6m9oZlH
z!DjOxa{-v(s~-TrT)}{gc6KlVV1^373>U}Fr>%d?z&AAjpGwXK&OcenkPe?vD*>`w
z0`M1uC<X9(=&F^>_gk0W_izLEGw)Xcwxh{@fz80w=XW+%{;&7Vcm56bKkok<-iO&R
z_T~VPeGr&8@L%vg(yqV3|8a1`EDoGGdo#ubNGyH<&7<d!(Zn3>Tz<KqXIgbo%myH4
z0y18l8G!WtF)HBO|JwH_Gm;~}^(WQ<rV+3caWS$y8u)*~y`oEnS|#4T0dyx3V6uHt
z+g>C8ANW7E@q9E>Gc#agJ3!+XJqqsqAEN>t_{V4yy;H<P0Ma-h=f&Y8dGe3Z0K=X|
zZ(%zZVE5w+C96`ZqRIhA?g7HL`30Arp7~=2zH{mC!}xb!U%}Nbv845nUB2SOA9MA+
z%QOAh9Cr(qc+LjXG{77R7qp?W{O7n|%pq-e<yfA-{c@-SfX@NoFZQ_TwLgddGMNyy
zwllbDEGIzDL~R1_;{km63*vWf{yF{^@g;1pLXU!mBK@)ShueRP{yWmYyVKP27Lfjb
z`y%81pW}bg{%`d~Z%V_HAA6tsr@v+3H@xp5C5+wv569B-<3ESDu>EmYb8N4A_+wv#
z1?s8(`K;w^=5V#Id0sQP_2X$f2l$_(f1Ne|&a_>B1LFfGurc`xc)xLRm~n%;l7X*p
zg@HG?O3c!BX2K@G7ZwKA7M?$QzoLhJS3Mmt?b%fAEK%y)lkK~L{$J6)yes>$QA?yp
zMWKN=DeWX6Agtd&fX{5-S<K(XQ8Y0#aaS^N6#mI{H$ql7P#ky><^u@AbpeBy;&(C5
z_kw-_borX?Z&=yBlU#oY^t0`+%a{9}Lk;p>3H5td{a#+UUcNN)9CAbCk0HOmLh>@;
z<%{ah0aqoi1iVl>_$Pz+<;%d%0hXly8^G0jIF~Q^Imf!A@ZYe$Y>@r$l|Pr&clk1r
zbHrek-$DGgi}S;)MJ|I~zIx&utWE8|fL$zy{^OW&`L=>{tW%Buf^|u-pLMm%wcXF*
z_;vmZ+(prThIY9`_&J)H-tVA&UoiZ#kuF!?KBsN2|2wq5zMci{f&57=?#qCeD+Zqf
zk^^sHt{Ad`9(<vG@MYqci|n2gmoxbt;{QtXV=ud0%krGO7_i@eHShnD|CyyPS06kF
z*aI?L{|msE=yRzS;bnDQuE}=}CkO22Uk&#q?EJ5;&Jv3+m%TfuEp7*To>=^)2;TRM
zoi)B(j_$ni+V_pWFHU!v#N{#*=Ool`e~16I=)`&J|IMZbrXpXTvrNA2(C=D*I~Q}F
zwRrg$4biu@|0L^O5Bs0SfR-<vlleKZi^*>~R};H%+T~ovAD^80#%Z?i<Nrwc>p7b5
s0e+NOe-5x1b)_CIuzm(`o@XyF1qtL^oMr4gfPAJ00V#?GW~Cti2l(aj-T(jq

literal 0
HcmV?d00001

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index f8908760cc897..984d97d27bf54 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -934,4 +934,32 @@ class SQLQuerySuite extends QueryTest {
       sql("set hive.exec.dynamic.partition.mode=strict")
     }
   }
+
+  test("Call add jar in a different thread (SPARK-8306)") {
+    @volatile var error: Option[Throwable] = None
+    val thread = new Thread {
+      override def run() {
+        // To make sure this test works, this jar should not be loaded in another place.
+        TestHive.sql(
+          s"ADD JAR ${TestHive.getHiveFile("hive-contrib-0.13.1.jar").getCanonicalPath()}")
+        try {
+          TestHive.sql(
+            """
+              |CREATE TEMPORARY FUNCTION example_max
+              |AS 'org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleMax'
+            """.stripMargin)
+        } catch {
+          case throwable: Throwable =>
+            error = Some(throwable)
+        }
+      }
+    }
+    thread.start()
+    thread.join()
+    error match {
+      case Some(throwable) =>
+        fail("CREATE TEMPORARY FUNCTION should not fail.", throwable)
+      case None => // OK
+    }
+  }
 }

From a06d9c8e76bb904d48764802aa3affff93b00baa Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 17 Jun 2015 15:00:03 -0700
Subject: [PATCH 515/525] [SPARK-8404] [STREAMING] [TESTS] Use thread-safe
 collections to make the tests more reliable

KafkaStreamSuite, DirectKafkaStreamSuite, JavaKafkaStreamSuite and JavaDirectKafkaStreamSuite use non-thread-safe collections to collect data in one thread and check it in another thread. It may fail the tests.

This PR changes them to thread-safe collections.

Note: I cannot reproduce the test failures in my environment. But at least, this PR should make the tests more reliable.

Author: zsxwing <zsxwing@gmail.com>

Closes #6852 from zsxwing/fix-KafkaStreamSuite and squashes the following commits:

d464211 [zsxwing] Use thread-safe collections to make the tests more reliable
---
 .../kafka/JavaDirectKafkaStreamSuite.java          |  6 ++----
 .../streaming/kafka/JavaKafkaStreamSuite.java      |  6 ++----
 .../streaming/kafka/DirectKafkaStreamSuite.scala   | 14 ++++++++------
 .../spark/streaming/kafka/KafkaStreamSuite.scala   |  7 ++-----
 4 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java
index 4c1d6a03eb2b8..c0669fb336657 100644
--- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java
+++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java
@@ -18,9 +18,7 @@
 package org.apache.spark.streaming.kafka;
 
 import java.io.Serializable;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Arrays;
+import java.util.*;
 
 import scala.Tuple2;
 
@@ -116,7 +114,7 @@ public String call(MessageAndMetadata<String, String> msgAndMd) throws Exception
     );
     JavaDStream<String> unifiedStream = stream1.union(stream2);
 
-    final HashSet<String> result = new HashSet<String>();
+    final Set<String> result = Collections.synchronizedSet(new HashSet<String>());
     unifiedStream.foreachRDD(
         new Function<JavaRDD<String>, Void>() {
           @Override
diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java
index 540f4ceabab47..e4c659215b767 100644
--- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java
+++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java
@@ -18,9 +18,7 @@
 package org.apache.spark.streaming.kafka;
 
 import java.io.Serializable;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Random;
+import java.util.*;
 
 import scala.Tuple2;
 
@@ -94,7 +92,7 @@ public void testKafkaStream() throws InterruptedException {
       topics,
       StorageLevel.MEMORY_ONLY_SER());
 
-    final HashMap<String, Long> result = new HashMap<String, Long>();
+    final Map<String, Long> result = Collections.synchronizedMap(new HashMap<String, Long>());
 
     JavaDStream<String> words = stream.map(
       new Function<Tuple2<String, String>, String>() {
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
index 47bbfb605850a..212eb35c61b66 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
@@ -99,7 +99,8 @@ class DirectKafkaStreamSuite
         ssc, kafkaParams, topics)
     }
 
-    val allReceived = new ArrayBuffer[(String, String)]
+    val allReceived =
+      new ArrayBuffer[(String, String)] with mutable.SynchronizedBuffer[(String, String)]
 
     stream.foreachRDD { rdd =>
     // Get the offset ranges in the RDD
@@ -162,7 +163,7 @@ class DirectKafkaStreamSuite
       "Start offset not from latest"
     )
 
-    val collectedData = new mutable.ArrayBuffer[String]()
+    val collectedData = new mutable.ArrayBuffer[String]() with mutable.SynchronizedBuffer[String]
     stream.map { _._2 }.foreachRDD { rdd => collectedData ++= rdd.collect() }
     ssc.start()
     val newData = Map("b" -> 10)
@@ -208,7 +209,7 @@ class DirectKafkaStreamSuite
       "Start offset not from latest"
     )
 
-    val collectedData = new mutable.ArrayBuffer[String]()
+    val collectedData = new mutable.ArrayBuffer[String]() with mutable.SynchronizedBuffer[String]
     stream.foreachRDD { rdd => collectedData ++= rdd.collect() }
     ssc.start()
     val newData = Map("b" -> 10)
@@ -324,7 +325,8 @@ class DirectKafkaStreamSuite
         ssc, kafkaParams, Set(topic))
     }
 
-    val allReceived = new ArrayBuffer[(String, String)]
+    val allReceived =
+      new ArrayBuffer[(String, String)] with mutable.SynchronizedBuffer[(String, String)]
 
     stream.foreachRDD { rdd => allReceived ++= rdd.collect() }
     ssc.start()
@@ -350,8 +352,8 @@ class DirectKafkaStreamSuite
 }
 
 object DirectKafkaStreamSuite {
-  val collectedData = new mutable.ArrayBuffer[String]()
-  var total = -1L
+  val collectedData = new mutable.ArrayBuffer[String]() with mutable.SynchronizedBuffer[String]
+  @volatile var total = -1L
 
   class InputInfoCollector extends StreamingListener {
     val numRecordsSubmitted = new AtomicLong(0L)
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala
index 8ee2cc660f849..797b07f80d8ee 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala
@@ -65,7 +65,7 @@ class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter
 
     val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
       ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)
-    val result = new mutable.HashMap[String, Long]()
+    val result = new mutable.HashMap[String, Long]() with mutable.SynchronizedMap[String, Long]
     stream.map(_._2).countByValue().foreachRDD { r =>
       val ret = r.collect()
       ret.toMap.foreach { kv =>
@@ -77,10 +77,7 @@ class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter
     ssc.start()
 
     eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
-      assert(sent.size === result.size)
-      sent.keys.foreach { k =>
-        assert(sent(k) === result(k).toInt)
-      }
+      assert(sent === result)
     }
   }
 }

From d1069cba4a2eb4f00fc3306993a49284efad00c7 Mon Sep 17 00:00:00 2001
From: Punya Biswal <pbiswal@palantir.com>
Date: Wed, 17 Jun 2015 15:29:39 -0700
Subject: [PATCH 516/525] [SPARK-8397] [SQL] Allow custom configuration for
 TestHive

We encourage people to use TestHive in unit tests, because it's
impossible to create more than one HiveContext within one process. The
current implementation locks people into using a local[2] SparkContext
underlying their HiveContext.  We should make it possible to override
this using a system property so that people can test against
local-cluster or remote spark clusters to make their tests more
realistic.

Author: Punya Biswal <pbiswal@palantir.com>

Closes #6844 from punya/feature/SPARK-8397 and squashes the following commits:

97ef394 [Punya Biswal] [SPARK-8397][SQL] Allow custom configuration for TestHive
---
 .../main/scala/org/apache/spark/sql/hive/test/TestHive.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 7c7afc824d7a6..92155096202b3 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -49,7 +49,7 @@ import scala.collection.JavaConversions._
 object TestHive
   extends TestHiveContext(
     new SparkContext(
-      "local[2]",
+      System.getProperty("spark.sql.test.master", "local[2]"),
       "TestSQLContext",
       new SparkConf()
         .set("spark.sql.test", "")

From 165f52f2f9d2d75a4b55b6443ca0354d5e66e14e Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Wed, 17 Jun 2015 19:02:25 -0700
Subject: [PATCH 517/525] [HOTFIX] [PROJECT-INFRA] Fix bug in dev/run-tests for
 MLlib-only PRs

---
 dev/run-tests.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index 04a7b45741963..c64c71f4f723f 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -391,7 +391,7 @@ def run_scala_tests_maven(test_profiles):
 
 def run_scala_tests_sbt(test_modules, test_profiles):
     # declare the variable for reference
-    sbt_test_goals = None
+    sbt_test_goals = []
 
     if "ALL" in test_modules:
         sbt_test_goals = ["test"]
@@ -399,12 +399,12 @@ def run_scala_tests_sbt(test_modules, test_profiles):
         # if we only have changes in SQL, MLlib, Streaming, or GraphX then build
         # a custom test list
         if "SQL" in test_modules and "CORE" not in test_modules:
-            sbt_test_goals = ["catalyst/test",
-                              "sql/test",
-                              "hive/test",
-                              "hive-thriftserver/test",
-                              "mllib/test",
-                              "examples/test"]
+            sbt_test_goals += ["catalyst/test",
+                               "sql/test",
+                               "hive/test",
+                               "hive-thriftserver/test",
+                               "mllib/test",
+                               "examples/test"]
         if "MLLIB" in test_modules and "CORE" not in test_modules:
             sbt_test_goals += ["mllib/test", "examples/test"]
         if "STREAMING" in test_modules and "CORE" not in test_modules:

From 4817ccdf50ef6ee24192800f9924d9ef3bb74e12 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 17 Jun 2015 22:07:16 -0700
Subject: [PATCH 518/525] [SPARK-8373] [PYSPARK] Remove PythonRDD.emptyRDD

This is a follow-up PR to remove unused `PythonRDD.emptyRDD` added by #6826

Author: zsxwing <zsxwing@gmail.com>

Closes #6867 from zsxwing/remove-PythonRDD-emptyRDD and squashes the following commits:

b66d363 [zsxwing] Remove PythonRDD.emptyRDD
---
 .../main/scala/org/apache/spark/api/python/PythonRDD.scala   | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 0103f6c6ab678..55a37f8c944b2 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -425,11 +425,6 @@ private[spark] object PythonRDD extends Logging {
     iter.foreach(write)
   }
 
-  /** Create an RDD that has no partitions or elements. */
-  def emptyRDD[T](sc: JavaSparkContext): JavaRDD[T] = {
-    sc.emptyRDD[T]
-  }
-
   /**
    * Create an RDD from a path using [[org.apache.hadoop.mapred.SequenceFileInputFormat]],
    * key and value class.

From 22732e1eca730929345e440ba831386ee7446b74 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Wed, 17 Jun 2015 22:08:38 -0700
Subject: [PATCH 519/525] [SPARK-7605] [MLLIB] [PYSPARK] Python API for
 ElementwiseProduct

Python API for org.apache.spark.mllib.feature.ElementwiseProduct

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6346 from MechCoder/spark-7605 and squashes the following commits:

79d1ef5 [MechCoder] Consistent and support list / array types
5f81d81 [MechCoder] [SPARK-7605] [MLlib] Python API for ElementwiseProduct
---
 docs/mllib-feature-extraction.md              | 22 +++++++++++
 .../mllib/api/python/PythonMLLibAPI.scala     |  8 ++++
 python/pyspark/mllib/feature.py               | 37 ++++++++++++++++++-
 python/pyspark/mllib/tests.py                 | 13 +++++++
 4 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 4fe470a8de810..1197dbbb8d982 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -558,6 +558,28 @@ JavaRDD<Vector> transformedData2 = data.map(
   }
 );
 
+{% endhighlight %}
+</div>
+
+<div data-lang="python">
+{% highlight python %}
+from pyspark import SparkContext
+from pyspark.mllib.linalg import Vectors
+from pyspark.mllib.feature import ElementwiseProduct
+
+# Load and parse the data
+sc = SparkContext()
+data = sc.textFile("data/mllib/kmeans_data.txt")
+parsedData = data.map(lambda x: [float(t) for t in x.split(" ")])
+
+# Create weight vector.
+transformingVector = Vectors.dense([0.0, 1.0, 2.0])
+transformer = ElementwiseProduct(transformingVector)
+
+# Batch transform.
+transformedData = transformer.transform(parsedData)
+transformedData2 = transformer.transform(parsedData.first())
+
 {% endhighlight %}
 </div>
 </div>
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 8f66bc808a007..1812b3ac7cc0e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -702,6 +702,14 @@ private[python] class PythonMLLibAPI extends Serializable {
     }
   }
 
+  def elementwiseProductVector(scalingVector: Vector, vector: Vector): Vector = {
+    new ElementwiseProduct(scalingVector).transform(vector)
+  }
+
+  def elementwiseProductVector(scalingVector: Vector, vector: JavaRDD[Vector]): JavaRDD[Vector] = {
+    new ElementwiseProduct(scalingVector).transform(vector)
+  }
+
   /**
    * Java stub for mllib Statistics.colStats(X: RDD[Vector]).
    * TODO figure out return type.
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index da90554f41437..cf5fdf2cf9788 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -33,12 +33,13 @@
 from pyspark import SparkContext
 from pyspark.rdd import RDD, ignore_unicode_prefix
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
-from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector, _convert_to_vector
+from pyspark.mllib.linalg import (
+    Vector, Vectors, DenseVector, SparseVector, _convert_to_vector)
 from pyspark.mllib.regression import LabeledPoint
 
 __all__ = ['Normalizer', 'StandardScalerModel', 'StandardScaler',
            'HashingTF', 'IDFModel', 'IDF', 'Word2Vec', 'Word2VecModel',
-           'ChiSqSelector', 'ChiSqSelectorModel']
+           'ChiSqSelector', 'ChiSqSelectorModel', 'ElementwiseProduct']
 
 
 class VectorTransformer(object):
@@ -520,6 +521,38 @@ def fit(self, data):
         return Word2VecModel(jmodel)
 
 
+class ElementwiseProduct(VectorTransformer):
+    """
+    .. note:: Experimental
+
+    Scales each column of the vector, with the supplied weight vector.
+    i.e the elementwise product.
+
+    >>> weight = Vectors.dense([1.0, 2.0, 3.0])
+    >>> eprod = ElementwiseProduct(weight)
+    >>> a = Vectors.dense([2.0, 1.0, 3.0])
+    >>> eprod.transform(a)
+    DenseVector([2.0, 2.0, 9.0])
+    >>> b = Vectors.dense([9.0, 3.0, 4.0])
+    >>> rdd = sc.parallelize([a, b])
+    >>> eprod.transform(rdd).collect()
+    [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])]
+    """
+    def __init__(self, scalingVector):
+        self.scalingVector = _convert_to_vector(scalingVector)
+
+    def transform(self, vector):
+        """
+        Computes the Hadamard product of the vector.
+        """
+        if isinstance(vector, RDD):
+            vector = vector.map(_convert_to_vector)
+
+        else:
+            vector = _convert_to_vector(vector)
+        return callMLlibFunc("elementwiseProductVector", self.scalingVector, vector)
+
+
 def _test():
     import doctest
     from pyspark import SparkContext
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index f4c997261ef4e..c482e6b0681e3 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -46,6 +46,7 @@
 from pyspark.mllib.feature import Word2Vec
 from pyspark.mllib.feature import IDF
 from pyspark.mllib.feature import StandardScaler
+from pyspark.mllib.feature import ElementwiseProduct
 from pyspark.serializers import PickleSerializer
 from pyspark.sql import SQLContext
 
@@ -850,6 +851,18 @@ def test_model_transform(self):
         self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
 
 
+class ElementwiseProductTests(MLlibTestCase):
+    def test_model_transform(self):
+        weight = Vectors.dense([3, 2, 1])
+
+        densevec = Vectors.dense([4, 5, 6])
+        sparsevec = Vectors.sparse(3, [0], [1])
+        eprod = ElementwiseProduct(weight)
+        self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
+        self.assertEqual(
+            eprod.transform(sparsevec), SparseVector(3, [0], [3]))
+
+
 if __name__ == "__main__":
     if not _have_scipy:
         print("NOTE: Skipping SciPy tests as it does not seem to be installed")

From e2cdb0568b14df29bbdb1ee9a13ee361c9ddad9c Mon Sep 17 00:00:00 2001
From: xutingjun <xutingjun@huawei.com>
Date: Wed, 17 Jun 2015 22:31:01 -0700
Subject: [PATCH 520/525] [SPARK-8392] RDDOperationGraph: getting cached nodes
 is slow

```def getAllNodes: Seq[RDDOperationNode] =
{ _childNodes ++ _childClusters.flatMap(_.childNodes) }```

when the ```_childClusters``` has so many nodes, the process will hang on. I think we can improve the efficiency here.

Author: xutingjun <xutingjun@huawei.com>

Closes #6839 from XuTingjun/DAGImprove and squashes the following commits:

53b03ea [xutingjun] change code to more concise and easier to read
f98728b [xutingjun] fix words: node -> nodes
f87c663 [xutingjun] put the filter inside
81f9fd2 [xutingjun] put the filter inside
---
 core/src/main/scala/org/apache/spark/ui/UIUtils.scala       | 2 +-
 .../scala/org/apache/spark/ui/scope/RDDOperationGraph.scala | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
index 65162f4fdcd62..7898039519201 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -362,7 +362,7 @@ private[spark] object UIUtils extends Logging {
               { g.incomingEdges.map { e => <div class="incoming-edge">{e.fromId},{e.toId}</div> } }
               { g.outgoingEdges.map { e => <div class="outgoing-edge">{e.fromId},{e.toId}</div> } }
               {
-                g.rootCluster.getAllNodes.filter(_.cached).map { n =>
+                g.rootCluster.getCachedNodes.map { n =>
                   <div class="cached-rdd">{n.id}</div>
                 }
               }
diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
index d6a5085db1efb..ffea9817c0b08 100644
--- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
+++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala
@@ -66,9 +66,9 @@ private[ui] class RDDOperationCluster(val id: String, private var _name: String)
     _childClusters += childCluster
   }
 
-  /** Return all the nodes container in this cluster, including ones nested in other clusters. */
-  def getAllNodes: Seq[RDDOperationNode] = {
-    _childNodes ++ _childClusters.flatMap(_.childNodes)
+  /** Return all the nodes which are cached. */
+  def getCachedNodes: Seq[RDDOperationNode] = {
+    _childNodes.filter(_.cached) ++ _childClusters.flatMap(_.getCachedNodes)
   }
 }
 

From 3b6107704fb946e9fcb8c1c9bc4ded1b88c571af Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Wed, 17 Jun 2015 22:33:37 -0700
Subject: [PATCH 521/525] [SPARK-8095] Resolve dependencies of --packages in
 local ivy cache

Dependencies of artifacts in the local ivy cache were not being resolved properly. The dependencies were not being picked up. Now they should be.

cc andrewor14

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #6788 from brkyvz/local-ivy-fix and squashes the following commits:

2875bf4 [Burak Yavuz] fix temp dir bug
48cc648 [Burak Yavuz] improve deletion
a69e3e6 [Burak Yavuz] delete cache before test as well
0037197 [Burak Yavuz] fix merge conflicts
f60772c [Burak Yavuz] use different folder for m2 cache during testing
b6ef038 [Burak Yavuz] [SPARK-8095] Resolve dependencies of Spark Packages in local ivy cache
---
 .../org/apache/spark/deploy/SparkSubmit.scala |  22 +++-
 .../apache/spark/deploy/IvyTestUtils.scala    | 124 +++++++++++++++---
 .../spark/deploy/SparkSubmitUtilsSuite.scala  |  22 ++--
 3 files changed, 135 insertions(+), 33 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index cfcc6d355801e..abf222757a95b 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -35,7 +35,8 @@ import org.apache.ivy.core.resolve.ResolveOptions
 import org.apache.ivy.core.retrieve.RetrieveOptions
 import org.apache.ivy.core.settings.IvySettings
 import org.apache.ivy.plugins.matcher.GlobPatternMatcher
-import org.apache.ivy.plugins.resolver.{ChainResolver, IBiblioResolver}
+import org.apache.ivy.plugins.repository.file.FileRepository
+import org.apache.ivy.plugins.resolver.{FileSystemResolver, ChainResolver, IBiblioResolver}
 import org.apache.spark.SPARK_VERSION
 import org.apache.spark.deploy.rest._
 import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils}
@@ -735,8 +736,14 @@ private[spark] object SparkSubmitUtils {
   }
 
   /** Path of the local Maven cache. */
-  private[spark] def m2Path: File = new File(System.getProperty("user.home"),
-    ".m2" + File.separator + "repository" + File.separator)
+  private[spark] def m2Path: File = {
+    if (Utils.isTesting) {
+      // test builds delete the maven cache, and this can cause flakiness
+      new File("dummy", ".m2" + File.separator + "repository")
+    } else {
+      new File(System.getProperty("user.home"), ".m2" + File.separator + "repository")
+    }
+  }
 
   /**
    * Extracts maven coordinates from a comma-delimited string
@@ -756,12 +763,13 @@ private[spark] object SparkSubmitUtils {
     localM2.setName("local-m2-cache")
     cr.add(localM2)
 
-    val localIvy = new IBiblioResolver
-    localIvy.setRoot(new File(ivySettings.getDefaultIvyUserDir,
-      "local" + File.separator).toURI.toString)
+    val localIvy = new FileSystemResolver
+    val localIvyRoot = new File(ivySettings.getDefaultIvyUserDir, "local")
+    localIvy.setLocal(true)
+    localIvy.setRepository(new FileRepository(localIvyRoot))
     val ivyPattern = Seq("[organisation]", "[module]", "[revision]", "[type]s",
       "[artifact](-[classifier]).[ext]").mkString(File.separator)
-    localIvy.setPattern(ivyPattern)
+    localIvy.addIvyPattern(localIvyRoot.getAbsolutePath + File.separator + ivyPattern)
     localIvy.setName("local-ivy-cache")
     cr.add(localIvy)
 
diff --git a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
index 7d39984424842..823050b0aabbe 100644
--- a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
@@ -24,6 +24,8 @@ import com.google.common.io.{Files, ByteStreams}
 
 import org.apache.commons.io.FileUtils
 
+import org.apache.ivy.core.settings.IvySettings
+
 import org.apache.spark.TestUtils.{createCompiledClass, JavaSourceFromString}
 import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
 
@@ -44,13 +46,30 @@ private[deploy] object IvyTestUtils {
       if (!useIvyLayout) {
         Seq(groupDirs, artifactDirs, artifact.version).mkString(File.separator)
       } else {
-        Seq(groupDirs, artifactDirs, artifact.version, ext + "s").mkString(File.separator)
+        Seq(artifact.groupId, artifactDirs, artifact.version, ext + "s").mkString(File.separator)
       }
     new File(prefix, artifactPath)
   }
 
-  private def artifactName(artifact: MavenCoordinate, ext: String = ".jar"): String = {
-    s"${artifact.artifactId}-${artifact.version}$ext"
+  /** Returns the artifact naming based on standard ivy or maven format. */
+  private def artifactName(
+      artifact: MavenCoordinate,
+      useIvyLayout: Boolean,
+      ext: String = ".jar"): String = {
+    if (!useIvyLayout) {
+      s"${artifact.artifactId}-${artifact.version}$ext"
+    } else {
+      s"${artifact.artifactId}$ext"
+    }
+  }
+
+  /** Returns the directory for the given groupId based on standard ivy or maven format. */
+  private def getBaseGroupDirectory(artifact: MavenCoordinate, useIvyLayout: Boolean): String = {
+    if (!useIvyLayout) {
+      artifact.groupId.replace(".", File.separator)
+    } else {
+      artifact.groupId
+    }
   }
 
   /** Write the contents to a file to the supplied directory. */
@@ -92,6 +111,22 @@ private[deploy] object IvyTestUtils {
     createCompiledClass(className, dir, sourceFile, Seq.empty)
   }
 
+  private def createDescriptor(
+      tempPath: File,
+      artifact: MavenCoordinate,
+      dependencies: Option[Seq[MavenCoordinate]],
+      useIvyLayout: Boolean): File = {
+    if (useIvyLayout) {
+      val ivyXmlPath = pathFromCoordinate(artifact, tempPath, "ivy", true)
+      Files.createParentDirs(new File(ivyXmlPath, "dummy"))
+      createIvyDescriptor(ivyXmlPath, artifact, dependencies)
+    } else {
+      val pomPath = pathFromCoordinate(artifact, tempPath, "pom", useIvyLayout)
+      Files.createParentDirs(new File(pomPath, "dummy"))
+      createPom(pomPath, artifact, dependencies)
+    }
+  }
+
   /** Helper method to write artifact information in the pom. */
   private def pomArtifactWriter(artifact: MavenCoordinate, tabCount: Int = 1): String = {
     var result = "\n" + "  " * tabCount + s"<groupId>${artifact.groupId}</groupId>"
@@ -121,15 +156,55 @@ private[deploy] object IvyTestUtils {
       "\n  <dependencies>\n" + inside + "\n  </dependencies>"
     }.getOrElse("")
     content += "\n</project>"
-    writeFile(dir, artifactName(artifact, ".pom"), content.trim)
+    writeFile(dir, artifactName(artifact, false, ".pom"), content.trim)
+  }
+
+  /** Helper method to write artifact information in the ivy.xml. */
+  private def ivyArtifactWriter(artifact: MavenCoordinate): String = {
+    s"""<dependency org="${artifact.groupId}" name="${artifact.artifactId}"
+       |            rev="${artifact.version}" force="true"
+       |            conf="compile->compile(*),master(*);runtime->runtime(*)"/>""".stripMargin
+  }
+
+  /** Create a pom file for this artifact. */
+  private def createIvyDescriptor(
+      dir: File,
+      artifact: MavenCoordinate,
+      dependencies: Option[Seq[MavenCoordinate]]): File = {
+    var content = s"""
+        |<?xml version="1.0" encoding="UTF-8"?>
+        |<ivy-module version="2.0" xmlns:m="http://ant.apache.org/ivy/maven">
+        |  <info organisation="${artifact.groupId}"
+        |        module="${artifact.artifactId}"
+        |        revision="${artifact.version}"
+        |        status="release" publication="20150405222456" />
+        |  <configurations>
+        |    <conf name="default" visibility="public" description="" extends="runtime,master"/>
+        |    <conf name="compile" visibility="public" description=""/>
+        |    <conf name="master" visibility="public" description=""/>
+        |    <conf name="runtime" visibility="public" description="" extends="compile"/>
+        |    <conf name="pom" visibility="public" description=""/>
+        |  </configurations>
+        |  <publications>
+        |     <artifact name="${artifactName(artifact, true, "")}" type="jar" ext="jar"
+        |               conf="master"/>
+        |  </publications>
+      """.stripMargin.trim
+    content += dependencies.map { deps =>
+      val inside = deps.map(ivyArtifactWriter).mkString("\n")
+      "\n  <dependencies>\n" + inside + "\n  </dependencies>"
+    }.getOrElse("")
+    content += "\n</ivy-module>"
+    writeFile(dir, "ivy.xml", content.trim)
   }
 
   /** Create the jar for the given maven coordinate, using the supplied files. */
   private def packJar(
       dir: File,
       artifact: MavenCoordinate,
-      files: Seq[(String, File)]): File = {
-    val jarFile = new File(dir, artifactName(artifact))
+      files: Seq[(String, File)],
+      useIvyLayout: Boolean): File = {
+    val jarFile = new File(dir, artifactName(artifact, useIvyLayout))
     val jarFileStream = new FileOutputStream(jarFile)
     val jarStream = new JarOutputStream(jarFileStream, new java.util.jar.Manifest())
 
@@ -187,12 +262,10 @@ private[deploy] object IvyTestUtils {
         } else {
           Seq(javaFile)
         }
-      val jarFile = packJar(jarPath, artifact, allFiles)
+      val jarFile = packJar(jarPath, artifact, allFiles, useIvyLayout)
       assert(jarFile.exists(), "Problem creating Jar file")
-      val pomPath = pathFromCoordinate(artifact, tempPath, "pom", useIvyLayout)
-      Files.createParentDirs(new File(pomPath, "dummy"))
-      val pomFile = createPom(pomPath, artifact, dependencies)
-      assert(pomFile.exists(), "Problem creating Pom file")
+      val descriptor = createDescriptor(tempPath, artifact, dependencies, useIvyLayout)
+      assert(descriptor.exists(), "Problem creating Pom file")
     } finally {
       FileUtils.deleteDirectory(root)
     }
@@ -237,7 +310,10 @@ private[deploy] object IvyTestUtils {
       dependencies: Option[String],
       rootDir: Option[File],
       useIvyLayout: Boolean = false,
-      withPython: Boolean = false)(f: String => Unit): Unit = {
+      withPython: Boolean = false,
+      ivySettings: IvySettings = new IvySettings)(f: String => Unit): Unit = {
+    val deps = dependencies.map(SparkSubmitUtils.extractMavenCoordinates)
+    purgeLocalIvyCache(artifact, deps, ivySettings)
     val repo = createLocalRepositoryForTests(artifact, dependencies, rootDir, useIvyLayout,
       withPython)
     try {
@@ -245,17 +321,29 @@ private[deploy] object IvyTestUtils {
     } finally {
       // Clean up
       if (repo.toString.contains(".m2") || repo.toString.contains(".ivy2")) {
-        FileUtils.deleteDirectory(new File(repo,
-          artifact.groupId.replace(".", File.separator) + File.separator + artifact.artifactId))
-        dependencies.map(SparkSubmitUtils.extractMavenCoordinates).foreach { seq =>
-          seq.foreach { dep =>
-            FileUtils.deleteDirectory(new File(repo,
-              dep.artifactId.replace(".", File.separator)))
+        val groupDir = getBaseGroupDirectory(artifact, useIvyLayout)
+        FileUtils.deleteDirectory(new File(repo, groupDir + File.separator + artifact.artifactId))
+        deps.foreach { _.foreach { dep =>
+            FileUtils.deleteDirectory(new File(repo, getBaseGroupDirectory(dep, useIvyLayout)))
           }
         }
       } else {
         FileUtils.deleteDirectory(repo)
       }
+      purgeLocalIvyCache(artifact, deps, ivySettings)
+    }
+  }
+
+  /** Deletes the test packages from the ivy cache */
+  private def purgeLocalIvyCache(
+      artifact: MavenCoordinate,
+      dependencies: Option[Seq[MavenCoordinate]],
+      ivySettings: IvySettings): Unit = {
+    // delete the artifact from the cache as well if it already exists
+    FileUtils.deleteDirectory(new File(ivySettings.getDefaultCache, artifact.groupId))
+    dependencies.foreach { _.foreach { dep =>
+        FileUtils.deleteDirectory(new File(ivySettings.getDefaultCache, dep.groupId))
+      }
     }
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index 3a8da9fb9ea17..12c40f0b7d658 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -24,7 +24,7 @@ import org.scalatest.BeforeAndAfterAll
 
 import org.apache.ivy.core.module.descriptor.MDArtifact
 import org.apache.ivy.core.settings.IvySettings
-import org.apache.ivy.plugins.resolver.IBiblioResolver
+import org.apache.ivy.plugins.resolver.{AbstractResolver, FileSystemResolver, IBiblioResolver}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
@@ -68,7 +68,7 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
     // should have central and spark-packages by default
     assert(res1.getResolvers.size() === 4)
     assert(res1.getResolvers.get(0).asInstanceOf[IBiblioResolver].getName === "local-m2-cache")
-    assert(res1.getResolvers.get(1).asInstanceOf[IBiblioResolver].getName === "local-ivy-cache")
+    assert(res1.getResolvers.get(1).asInstanceOf[FileSystemResolver].getName === "local-ivy-cache")
     assert(res1.getResolvers.get(2).asInstanceOf[IBiblioResolver].getName === "central")
     assert(res1.getResolvers.get(3).asInstanceOf[IBiblioResolver].getName === "spark-packages")
 
@@ -76,10 +76,10 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
     val resolver2 = SparkSubmitUtils.createRepoResolvers(Option(repos), settings)
     assert(resolver2.getResolvers.size() === 7)
     val expected = repos.split(",").map(r => s"$r/")
-    resolver2.getResolvers.toArray.zipWithIndex.foreach { case (resolver: IBiblioResolver, i) =>
+    resolver2.getResolvers.toArray.zipWithIndex.foreach { case (resolver: AbstractResolver, i) =>
       if (i > 3) {
         assert(resolver.getName === s"repo-${i - 3}")
-        assert(resolver.getRoot === expected(i - 4))
+        assert(resolver.asInstanceOf[IBiblioResolver].getRoot === expected(i - 4))
       }
     }
   }
@@ -112,28 +112,34 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
   }
 
   test("search for artifact at local repositories") {
-    val main = new MavenCoordinate("my.awesome.lib", "mylib", "0.1")
+    val main = new MavenCoordinate("my.great.lib", "mylib", "0.1")
+    val dep = "my.great.dep:mydep:0.5"
     // Local M2 repository
-    IvyTestUtils.withRepository(main, None, Some(SparkSubmitUtils.m2Path)) { repo =>
+    IvyTestUtils.withRepository(main, Some(dep), Some(SparkSubmitUtils.m2Path)) { repo =>
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None, None,
         isTest = true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
+      assert(jarPath.indexOf("mydep") >= 0, "should find dependency")
     }
     // Local Ivy Repository
     val settings = new IvySettings
     val ivyLocal = new File(settings.getDefaultIvyUserDir, "local" + File.separator)
-    IvyTestUtils.withRepository(main, None, Some(ivyLocal), true) { repo =>
+    IvyTestUtils.withRepository(main, Some(dep), Some(ivyLocal), useIvyLayout = true) { repo =>
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None, None,
         isTest = true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
+      assert(jarPath.indexOf("mydep") >= 0, "should find dependency")
     }
     // Local ivy repository with modified home
     val dummyIvyLocal = new File(tempIvyPath, "local" + File.separator)
-    IvyTestUtils.withRepository(main, None, Some(dummyIvyLocal), true) { repo =>
+    settings.setDefaultIvyUserDir(new File(tempIvyPath))
+    IvyTestUtils.withRepository(main, Some(dep), Some(dummyIvyLocal), useIvyLayout = true,
+      ivySettings = settings) { repo =>
       val jarPath = SparkSubmitUtils.resolveMavenCoordinates(main.toString, None,
         Some(tempIvyPath), isTest = true)
       assert(jarPath.indexOf("mylib") >= 0, "should find artifact")
       assert(jarPath.indexOf(tempIvyPath) >= 0, "should be in new ivy path")
+      assert(jarPath.indexOf("mydep") >= 0, "should find dependency")
     }
   }
 

From 9db73ec12412f6809030546cf69dcb32d2c8e0fe Mon Sep 17 00:00:00 2001
From: Lianhui Wang <lianhuiwang09@gmail.com>
Date: Wed, 17 Jun 2015 22:52:47 -0700
Subject: [PATCH 522/525] [SPARK-8381][SQL]reuse typeConvert when convert
 Seq[Row] to catalyst type

reuse-typeConvert when convert Seq[Row] to CatalystType

Author: Lianhui Wang <lianhuiwang09@gmail.com>

Closes #6831 from lianhuiwang/reuse-typeConvert and squashes the following commits:

1fec395 [Lianhui Wang] remove CatalystTypeConverters.convertToCatalyst
714462d [Lianhui Wang] add package[sql]
9d1fbf3 [Lianhui Wang] address JoshRosen's comments
768956f [Lianhui Wang] update scala style
4498c62 [Lianhui Wang] reuse typeConvert
---
 .../spark/sql/catalyst/CatalystTypeConverters.scala    | 10 ----------
 .../spark/sql/catalyst/ScalaReflectionSuite.scala      |  4 ++--
 .../main/scala/org/apache/spark/sql/DataFrame.scala    |  8 ++++----
 .../main/scala/org/apache/spark/sql/SQLContext.scala   |  8 ++++----
 .../org/apache/spark/sql/execution/commands.scala      |  4 ++--
 5 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 6175456c58214..620e8de83a96c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -335,16 +335,6 @@ object CatalystTypeConverters {
     override def toScalaImpl(row: InternalRow, column: Int): Double = row.getDouble(column)
   }
 
-  /**
-   * Converts Scala objects to catalyst rows / types. This method is slow, and for batch
-   * conversion you should be using converter produced by createToCatalystConverter.
-   * Note: This is always called after schemaFor has been called.
-   *       This ordering is important for UDT registration.
-   */
-  def convertToCatalyst(scalaValue: Any, dataType: DataType): Any = {
-    getConverterForType(dataType).toCatalyst(scalaValue)
-  }
-
   /**
    * Creates a converter function that will convert Scala objects to the specified Catalyst type.
    * Typical use case would be converting a collection of rows that have the same schema. You will
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
index c2d739b529295..b4b00f558463f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
@@ -258,7 +258,7 @@ class ScalaReflectionSuite extends SparkFunSuite {
     val data = PrimitiveData(1, 1, 1, 1, 1, 1, true)
     val convertedData = InternalRow(1, 1.toLong, 1.toDouble, 1.toFloat, 1.toShort, 1.toByte, true)
     val dataType = schemaFor[PrimitiveData].dataType
-    assert(CatalystTypeConverters.convertToCatalyst(data, dataType) === convertedData)
+    assert(CatalystTypeConverters.createToCatalystConverter(dataType)(data) === convertedData)
   }
 
   test("convert Option[Product] to catalyst") {
@@ -268,7 +268,7 @@ class ScalaReflectionSuite extends SparkFunSuite {
     val dataType = schemaFor[OptionalData].dataType
     val convertedData = InternalRow(2, 2.toLong, 2.toDouble, 2.toFloat, 2.toShort, 2.toByte, true,
       InternalRow(1, 1, 1, 1, 1, 1, true))
-    assert(CatalystTypeConverters.convertToCatalyst(data, dataType) === convertedData)
+    assert(CatalystTypeConverters.createToCatalystConverter(dataType)(data) === convertedData)
   }
 
   test("infer schema from case class with multiple constructors") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 444916bbadb48..466258e76f9f6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1029,10 +1029,10 @@ class DataFrame private[sql](
 
     val elementTypes = schema.toAttributes.map { attr => (attr.dataType, attr.nullable) }
     val names = schema.toAttributes.map(_.name)
+    val convert = CatalystTypeConverters.createToCatalystConverter(schema)
 
     val rowFunction =
-      f.andThen(_.map(CatalystTypeConverters.convertToCatalyst(_, schema)
-        .asInstanceOf[InternalRow]))
+      f.andThen(_.map(convert(_).asInstanceOf[InternalRow]))
     val generator = UserDefinedGenerator(elementTypes, rowFunction, input.map(_.expr))
 
     Generate(generator, join = true, outer = false,
@@ -1059,8 +1059,8 @@ class DataFrame private[sql](
     val names = attributes.map(_.name)
 
     def rowFunction(row: Row): TraversableOnce[InternalRow] = {
-      f(row(0).asInstanceOf[A]).map(o =>
-        InternalRow(CatalystTypeConverters.convertToCatalyst(o, dataType)))
+      val convert = CatalystTypeConverters.createToCatalystConverter(dataType)
+      f(row(0).asInstanceOf[A]).map(o => InternalRow(convert(o)))
     }
     val generator = UserDefinedGenerator(elementTypes, rowFunction, apply(inputColumn).expr :: Nil)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 9d1f89d6d7bd8..6b605f7130167 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -536,12 +536,12 @@ class SQLContext(@transient val sparkContext: SparkContext)
         Class.forName(className, true, Utils.getContextOrSparkClassLoader))
       val extractors =
         localBeanInfo.getPropertyDescriptors.filterNot(_.getName == "class").map(_.getReadMethod)
-
+      val methodsToConverts = extractors.zip(attributeSeq).map { case (e, attr) =>
+        (e, CatalystTypeConverters.createToCatalystConverter(attr.dataType))
+      }
       iter.map { row =>
         new GenericRow(
-          extractors.zip(attributeSeq).map { case (e, attr) =>
-            CatalystTypeConverters.convertToCatalyst(e.invoke(row), attr.dataType)
-          }.toArray[Any]
+          methodsToConverts.map { case (e, convert) => convert(e.invoke(row)) }.toArray[Any]
         ) : InternalRow
       }
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
index 653792ea2e537..c9dfcea5d051e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
@@ -65,8 +65,8 @@ private[sql] case class ExecutedCommand(cmd: RunnableCommand) extends SparkPlan
   override def executeTake(limit: Int): Array[Row] = sideEffectResult.take(limit).toArray
 
   protected override def doExecute(): RDD[InternalRow] = {
-    val converted = sideEffectResult.map(r =>
-      CatalystTypeConverters.convertToCatalyst(r, schema).asInstanceOf[InternalRow])
+    val convert = CatalystTypeConverters.createToCatalystConverter(schema)
+    val converted = sideEffectResult.map(convert(_).asInstanceOf[InternalRow])
     sqlContext.sparkContext.parallelize(converted, 1)
   }
 }

From 78a430ea4d2aef58a8bf38ce488553ca6acea428 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 17 Jun 2015 23:22:54 -0700
Subject: [PATCH 523/525] [SPARK-7961][SQL]Refactor SQLConf to display better
 error message

1. Add `SQLConfEntry` to store the information about a configuration. For those configurations that cannot be found in `sql-programming-guide.md`, I left the doc as `<TODO>`.
2. Verify the value when setting a configuration if this is in SQLConf.
3. Use `SET -v` to display all public configurations.

Author: zsxwing <zsxwing@gmail.com>

Closes #6747 from zsxwing/sqlconf and squashes the following commits:

7d09bad [zsxwing] Use SQLConfEntry in HiveContext
49f6213 [zsxwing] Add getConf, setConf to SQLContext and HiveContext
e014f53 [zsxwing] Merge branch 'master' into sqlconf
93dad8e [zsxwing] Fix the unit tests
cf950c1 [zsxwing] Fix the code style and tests
3c5f03e [zsxwing] Add unsetConf(SQLConfEntry) and fix the code style
a2f4add [zsxwing] getConf will return the default value if a config is not set
037b1db [zsxwing] Add schema to SetCommand
0520c3c [zsxwing] Merge branch 'master' into sqlconf
7afb0ec [zsxwing] Fix the configurations about HiveThriftServer
7e728e3 [zsxwing] Add doc for SQLConfEntry and fix 'toString'
5e95b10 [zsxwing] Add enumConf
c6ba76d [zsxwing] setRawString => setConfString, getRawString => getConfString
4abd807 [zsxwing] Fix the test for 'set -v'
6e47e56 [zsxwing] Fix the compilation error
8973ced [zsxwing] Remove floatConf
1fc3a8b [zsxwing] Remove the 'conf' command and use 'set -v' instead
99c9c16 [zsxwing] Fix tests that use SQLConfEntry as a string
88a03cc [zsxwing] Add new lines between confs and return types
ce7c6c8 [zsxwing] Remove seqConf
f3c1b33 [zsxwing] Refactor SQLConf to display better error message
---
 docs/sql-programming-guide.md                 |   4 +-
 .../scala/org/apache/spark/sql/SQLConf.scala  | 493 ++++++++++++++----
 .../org/apache/spark/sql/SQLContext.scala     |  25 +-
 .../org/apache/spark/sql/SparkSQLParser.scala |   4 +-
 .../apache/spark/sql/execution/commands.scala |  98 +++-
 .../spark/sql/execution/debug/package.scala   |   2 +-
 .../sql/parquet/ParquetTableOperations.scala  |   8 +-
 .../apache/spark/sql/parquet/newParquet.scala |   6 +-
 .../apache/spark/sql/sources/commands.scala   |   2 +-
 .../spark/sql/test/TestSQLContext.scala       |   2 +-
 .../spark/sql/DataFrameAggregateSuite.scala   |   4 +-
 .../org/apache/spark/sql/DataFrameSuite.scala |  14 +-
 .../org/apache/spark/sql/JoinSuite.scala      |  14 +-
 .../apache/spark/sql/SQLConfEntrySuite.scala  | 150 ++++++
 .../org/apache/spark/sql/SQLConfSuite.scala   |  10 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  42 +-
 .../columnar/PartitionBatchPruningSuite.scala |   8 +-
 .../spark/sql/execution/PlannerSuite.scala    |   8 +-
 .../org/apache/spark/sql/json/JsonSuite.scala |   4 +-
 .../sql/parquet/ParquetFilterSuite.scala      |  14 +-
 .../spark/sql/parquet/ParquetIOSuite.scala    |  16 +-
 .../spark/sql/parquet/ParquetQuerySuite.scala |   8 +-
 .../spark/sql/sources/DataSourceTest.scala    |   2 +-
 .../apache/spark/sql/test/SQLTestUtils.scala  |   6 +-
 .../hive/thriftserver/HiveThriftServer2.scala |   4 +-
 .../SparkExecuteStatementOperation.scala      |   2 +-
 .../HiveThriftServer2Suites.scala             |  22 +-
 .../execution/HiveCompatibilitySuite.scala    |   8 +-
 .../SortMergeCompatibilitySuite.scala         |   4 +-
 .../apache/spark/sql/hive/HiveContext.scala   |  88 +++-
 .../apache/spark/sql/hive/test/TestHive.scala |   5 +-
 .../spark/sql/hive/HiveParquetSuite.scala     |   4 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala  |  16 +-
 .../spark/sql/hive/StatisticsSuite.scala      |   8 +-
 .../sql/hive/execution/HiveQuerySuite.scala   |  12 +-
 .../sql/hive/execution/SQLQuerySuite.scala    |  20 +-
 .../apache/spark/sql/hive/parquetSuites.scala |  20 +-
 37 files changed, 861 insertions(+), 296 deletions(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/SQLConfEntrySuite.scala

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 61f9c5f02ac72..c6e6ec88a205f 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1220,7 +1220,7 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`
   <td>false</td>
   <td>
     Some other Parquet-producing systems, in particular Impala and older versions of Spark SQL, do
-    not differentiate between binary data and strings when writing out the Parquet schema.  This
+    not differentiate between binary data and strings when writing out the Parquet schema. This
     flag tells Spark SQL to interpret binary data as a string to provide compatibility with these systems.
   </td>
 </tr>
@@ -1237,7 +1237,7 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`
   <td><code>spark.sql.parquet.cacheMetadata</code></td>
   <td>true</td>
   <td>
-    Turns on caching of Parquet schema metadata.  Can speed up querying of static data.
+    Turns on caching of Parquet schema metadata. Can speed up querying of static data.
   </td>
 </tr>
 <tr>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 55ab6b3358e3c..16493c3d7c19c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -25,74 +25,333 @@ import scala.collection.JavaConversions._
 import org.apache.spark.sql.catalyst.CatalystConf
 
 private[spark] object SQLConf {
-  val COMPRESS_CACHED = "spark.sql.inMemoryColumnarStorage.compressed"
-  val COLUMN_BATCH_SIZE = "spark.sql.inMemoryColumnarStorage.batchSize"
-  val IN_MEMORY_PARTITION_PRUNING = "spark.sql.inMemoryColumnarStorage.partitionPruning"
-  val AUTO_BROADCASTJOIN_THRESHOLD = "spark.sql.autoBroadcastJoinThreshold"
-  val DEFAULT_SIZE_IN_BYTES = "spark.sql.defaultSizeInBytes"
-  val SHUFFLE_PARTITIONS = "spark.sql.shuffle.partitions"
-  val CODEGEN_ENABLED = "spark.sql.codegen"
-  val UNSAFE_ENABLED = "spark.sql.unsafe.enabled"
-  val DIALECT = "spark.sql.dialect"
-  val CASE_SENSITIVE = "spark.sql.caseSensitive"
-
-  val PARQUET_BINARY_AS_STRING = "spark.sql.parquet.binaryAsString"
-  val PARQUET_INT96_AS_TIMESTAMP = "spark.sql.parquet.int96AsTimestamp"
-  val PARQUET_CACHE_METADATA = "spark.sql.parquet.cacheMetadata"
-  val PARQUET_COMPRESSION = "spark.sql.parquet.compression.codec"
-  val PARQUET_FILTER_PUSHDOWN_ENABLED = "spark.sql.parquet.filterPushdown"
-  val PARQUET_USE_DATA_SOURCE_API = "spark.sql.parquet.useDataSourceApi"
-
-  val ORC_FILTER_PUSHDOWN_ENABLED = "spark.sql.orc.filterPushdown"
-
-  val HIVE_VERIFY_PARTITIONPATH = "spark.sql.hive.verifyPartitionPath"
-
-  val COLUMN_NAME_OF_CORRUPT_RECORD = "spark.sql.columnNameOfCorruptRecord"
-  val BROADCAST_TIMEOUT = "spark.sql.broadcastTimeout"
+
+  private val sqlConfEntries = java.util.Collections.synchronizedMap(
+    new java.util.HashMap[String, SQLConfEntry[_]]())
+
+  /**
+   * An entry contains all meta information for a configuration.
+   *
+   * @param key the key for the configuration
+   * @param defaultValue the default value for the configuration
+   * @param valueConverter how to convert a string to the value. It should throw an exception if the
+   *                       string does not have the required format.
+   * @param stringConverter how to convert a value to a string that the user can use it as a valid
+   *                        string value. It's usually `toString`. But sometimes, a custom converter
+   *                        is necessary. E.g., if T is List[String], `a, b, c` is better than
+   *                        `List(a, b, c)`.
+   * @param doc the document for the configuration
+   * @param isPublic if this configuration is public to the user. If it's `false`, this
+   *                 configuration is only used internally and we should not expose it to the user.
+   * @tparam T the value type
+   */
+  private[sql] class SQLConfEntry[T] private(
+      val key: String,
+      val defaultValue: Option[T],
+      val valueConverter: String => T,
+      val stringConverter: T => String,
+      val doc: String,
+      val isPublic: Boolean) {
+
+    def defaultValueString: String = defaultValue.map(stringConverter).getOrElse("<undefined>")
+
+    override def toString: String = {
+      s"SQLConfEntry(key = $key, defaultValue=$defaultValueString, doc=$doc, isPublic = $isPublic)"
+    }
+  }
+
+  private[sql] object SQLConfEntry {
+
+    private def apply[T](
+          key: String,
+          defaultValue: Option[T],
+          valueConverter: String => T,
+          stringConverter: T => String,
+          doc: String,
+          isPublic: Boolean): SQLConfEntry[T] =
+      sqlConfEntries.synchronized {
+        if (sqlConfEntries.containsKey(key)) {
+          throw new IllegalArgumentException(s"Duplicate SQLConfEntry. $key has been registered")
+        }
+        val entry =
+          new SQLConfEntry[T](key, defaultValue, valueConverter, stringConverter, doc, isPublic)
+        sqlConfEntries.put(key, entry)
+        entry
+      }
+
+    def intConf(
+          key: String,
+          defaultValue: Option[Int] = None,
+          doc: String = "",
+          isPublic: Boolean = true): SQLConfEntry[Int] =
+      SQLConfEntry(key, defaultValue, { v =>
+        try {
+          v.toInt
+        } catch {
+          case _: NumberFormatException =>
+            throw new IllegalArgumentException(s"$key should be int, but was $v")
+        }
+      }, _.toString, doc, isPublic)
+
+    def longConf(
+        key: String,
+        defaultValue: Option[Long] = None,
+        doc: String = "",
+        isPublic: Boolean = true): SQLConfEntry[Long] =
+      SQLConfEntry(key, defaultValue, { v =>
+        try {
+          v.toLong
+        } catch {
+          case _: NumberFormatException =>
+            throw new IllegalArgumentException(s"$key should be long, but was $v")
+        }
+      }, _.toString, doc, isPublic)
+
+    def doubleConf(
+        key: String,
+        defaultValue: Option[Double] = None,
+        doc: String = "",
+        isPublic: Boolean = true): SQLConfEntry[Double] =
+      SQLConfEntry(key, defaultValue, { v =>
+        try {
+          v.toDouble
+        } catch {
+          case _: NumberFormatException =>
+            throw new IllegalArgumentException(s"$key should be double, but was $v")
+        }
+      }, _.toString, doc, isPublic)
+
+    def booleanConf(
+        key: String,
+        defaultValue: Option[Boolean] = None,
+        doc: String = "",
+        isPublic: Boolean = true): SQLConfEntry[Boolean] =
+      SQLConfEntry(key, defaultValue, { v =>
+        try {
+          v.toBoolean
+        } catch {
+          case _: IllegalArgumentException =>
+            throw new IllegalArgumentException(s"$key should be boolean, but was $v")
+        }
+      }, _.toString, doc, isPublic)
+
+    def stringConf(
+        key: String,
+        defaultValue: Option[String] = None,
+        doc: String = "",
+        isPublic: Boolean = true): SQLConfEntry[String] =
+      SQLConfEntry(key, defaultValue, v => v, v => v, doc, isPublic)
+
+    def enumConf[T](
+        key: String,
+        valueConverter: String => T,
+        validValues: Set[T],
+        defaultValue: Option[T] = None,
+        doc: String = "",
+        isPublic: Boolean = true): SQLConfEntry[T] =
+      SQLConfEntry(key, defaultValue, v => {
+        val _v = valueConverter(v)
+        if (!validValues.contains(_v)) {
+          throw new IllegalArgumentException(
+            s"The value of $key should be one of ${validValues.mkString(", ")}, but was $v")
+        }
+        _v
+      }, _.toString, doc, isPublic)
+
+    def seqConf[T](
+        key: String,
+        valueConverter: String => T,
+        defaultValue: Option[Seq[T]] = None,
+        doc: String = "",
+        isPublic: Boolean = true): SQLConfEntry[Seq[T]] = {
+      SQLConfEntry(
+        key, defaultValue, _.split(",").map(valueConverter), _.mkString(","), doc, isPublic)
+    }
+
+    def stringSeqConf(
+        key: String,
+        defaultValue: Option[Seq[String]] = None,
+        doc: String = "",
+        isPublic: Boolean = true): SQLConfEntry[Seq[String]] = {
+      seqConf(key, s => s, defaultValue, doc, isPublic)
+    }
+  }
+
+  import SQLConfEntry._
+
+  val COMPRESS_CACHED = booleanConf("spark.sql.inMemoryColumnarStorage.compressed",
+    defaultValue = Some(true),
+    doc = "When set to true Spark SQL will automatically select a compression codec for each " +
+      "column based on statistics of the data.")
+
+  val COLUMN_BATCH_SIZE = intConf("spark.sql.inMemoryColumnarStorage.batchSize",
+    defaultValue = Some(10000),
+    doc = "Controls the size of batches for columnar caching.  Larger batch sizes can improve " +
+      "memory utilization and compression, but risk OOMs when caching data.")
+
+  val IN_MEMORY_PARTITION_PRUNING =
+    booleanConf("spark.sql.inMemoryColumnarStorage.partitionPruning",
+      defaultValue = Some(false),
+      doc = "<TODO>")
+
+  val AUTO_BROADCASTJOIN_THRESHOLD = intConf("spark.sql.autoBroadcastJoinThreshold",
+    defaultValue = Some(10 * 1024 * 1024),
+    doc = "Configures the maximum size in bytes for a table that will be broadcast to all worker " +
+      "nodes when performing a join.  By setting this value to -1 broadcasting can be disabled. " +
+      "Note that currently statistics are only supported for Hive Metastore tables where the " +
+      "command<code>ANALYZE TABLE &lt;tableName&gt; COMPUTE STATISTICS noscan</code> has been run.")
+
+  val DEFAULT_SIZE_IN_BYTES = longConf("spark.sql.defaultSizeInBytes", isPublic = false)
+
+  val SHUFFLE_PARTITIONS = intConf("spark.sql.shuffle.partitions",
+    defaultValue = Some(200),
+    doc = "Configures the number of partitions to use when shuffling data for joins or " +
+      "aggregations.")
+
+  val CODEGEN_ENABLED = booleanConf("spark.sql.codegen",
+    defaultValue = Some(true),
+    doc = "When true, code will be dynamically generated at runtime for expression evaluation in" +
+      " a specific query. For some queries with complicated expression this option can lead to " +
+      "significant speed-ups. However, for simple queries this can actually slow down query " +
+      "execution.")
+
+  val UNSAFE_ENABLED = booleanConf("spark.sql.unsafe.enabled",
+    defaultValue = Some(false),
+    doc = "<TDDO>")
+
+  val DIALECT = stringConf("spark.sql.dialect", defaultValue = Some("sql"), doc = "<TODO>")
+
+  val CASE_SENSITIVE = booleanConf("spark.sql.caseSensitive",
+    defaultValue = Some(true),
+    doc = "<TODO>")
+
+  val PARQUET_BINARY_AS_STRING = booleanConf("spark.sql.parquet.binaryAsString",
+    defaultValue = Some(false),
+    doc = "Some other Parquet-producing systems, in particular Impala and older versions of " +
+      "Spark SQL, do not differentiate between binary data and strings when writing out the " +
+      "Parquet schema. This flag tells Spark SQL to interpret binary data as a string to provide " +
+      "compatibility with these systems.")
+
+  val PARQUET_INT96_AS_TIMESTAMP = booleanConf("spark.sql.parquet.int96AsTimestamp",
+    defaultValue = Some(true),
+    doc = "Some Parquet-producing systems, in particular Impala, store Timestamp into INT96. " +
+      "Spark would also store Timestamp as INT96 because we need to avoid precision lost of the " +
+      "nanoseconds field. This flag tells Spark SQL to interpret INT96 data as a timestamp to " +
+      "provide compatibility with these systems.")
+
+  val PARQUET_CACHE_METADATA = booleanConf("spark.sql.parquet.cacheMetadata",
+    defaultValue = Some(true),
+    doc = "Turns on caching of Parquet schema metadata. Can speed up querying of static data.")
+
+  val PARQUET_COMPRESSION = enumConf("spark.sql.parquet.compression.codec",
+    valueConverter = v => v.toLowerCase,
+    validValues = Set("uncompressed", "snappy", "gzip", "lzo"),
+    defaultValue = Some("gzip"),
+    doc = "Sets the compression codec use when writing Parquet files. Acceptable values include: " +
+      "uncompressed, snappy, gzip, lzo.")
+
+  val PARQUET_FILTER_PUSHDOWN_ENABLED = booleanConf("spark.sql.parquet.filterPushdown",
+    defaultValue = Some(false),
+    doc = "Turn on Parquet filter pushdown optimization. This feature is turned off by default" +
+      " because of a known bug in Paruet 1.6.0rc3 " +
+      "(<a href=\"https://issues.apache.org/jira/browse/PARQUET-136\">PARQUET-136</a>). However, " +
+      "if your table doesn't contain any nullable string or binary columns, it's still safe to " +
+      "turn this feature on.")
+
+  val PARQUET_USE_DATA_SOURCE_API = booleanConf("spark.sql.parquet.useDataSourceApi",
+    defaultValue = Some(true),
+    doc = "<TODO>")
+
+  val ORC_FILTER_PUSHDOWN_ENABLED = booleanConf("spark.sql.orc.filterPushdown",
+    defaultValue = Some(false),
+    doc = "<TODO>")
+
+  val HIVE_VERIFY_PARTITIONPATH = booleanConf("spark.sql.hive.verifyPartitionPath",
+    defaultValue = Some(true),
+    doc = "<TODO>")
+
+  val COLUMN_NAME_OF_CORRUPT_RECORD = stringConf("spark.sql.columnNameOfCorruptRecord",
+    defaultValue = Some("_corrupt_record"),
+    doc = "<TODO>")
+
+  val BROADCAST_TIMEOUT = intConf("spark.sql.broadcastTimeout",
+    defaultValue = Some(5 * 60),
+    doc = "<TODO>")
 
   // Options that control which operators can be chosen by the query planner.  These should be
   // considered hints and may be ignored by future versions of Spark SQL.
-  val EXTERNAL_SORT = "spark.sql.planner.externalSort"
-  val SORTMERGE_JOIN = "spark.sql.planner.sortMergeJoin"
+  val EXTERNAL_SORT = booleanConf("spark.sql.planner.externalSort",
+    defaultValue = Some(true),
+    doc = "When true, performs sorts spilling to disk as needed otherwise sort each partition in" +
+      " memory.")
+
+  val SORTMERGE_JOIN = booleanConf("spark.sql.planner.sortMergeJoin",
+    defaultValue = Some(false),
+    doc = "<TODO>")
 
   // This is only used for the thriftserver
-  val THRIFTSERVER_POOL = "spark.sql.thriftserver.scheduler.pool"
-  val THRIFTSERVER_UI_STATEMENT_LIMIT = "spark.sql.thriftserver.ui.retainedStatements"
-  val THRIFTSERVER_UI_SESSION_LIMIT = "spark.sql.thriftserver.ui.retainedSessions"
+  val THRIFTSERVER_POOL = stringConf("spark.sql.thriftserver.scheduler.pool",
+    doc = "Set a Fair Scheduler pool for a JDBC client session")
+
+  val THRIFTSERVER_UI_STATEMENT_LIMIT = intConf("spark.sql.thriftserver.ui.retainedStatements",
+    defaultValue = Some(200),
+    doc = "<TODO>")
+
+  val THRIFTSERVER_UI_SESSION_LIMIT = intConf("spark.sql.thriftserver.ui.retainedSessions",
+    defaultValue = Some(200),
+    doc = "<TODO>")
 
   // This is used to set the default data source
-  val DEFAULT_DATA_SOURCE_NAME = "spark.sql.sources.default"
+  val DEFAULT_DATA_SOURCE_NAME = stringConf("spark.sql.sources.default",
+    defaultValue = Some("org.apache.spark.sql.parquet"),
+    doc = "<TODO>")
+
   // This is used to control the when we will split a schema's JSON string to multiple pieces
   // in order to fit the JSON string in metastore's table property (by default, the value has
   // a length restriction of 4000 characters). We will split the JSON string of a schema
   // to its length exceeds the threshold.
-  val SCHEMA_STRING_LENGTH_THRESHOLD = "spark.sql.sources.schemaStringLengthThreshold"
+  val SCHEMA_STRING_LENGTH_THRESHOLD = intConf("spark.sql.sources.schemaStringLengthThreshold",
+    defaultValue = Some(4000),
+    doc = "<TODO>")
 
   // Whether to perform partition discovery when loading external data sources.  Default to true.
-  val PARTITION_DISCOVERY_ENABLED = "spark.sql.sources.partitionDiscovery.enabled"
+  val PARTITION_DISCOVERY_ENABLED = booleanConf("spark.sql.sources.partitionDiscovery.enabled",
+    defaultValue = Some(true),
+    doc = "<TODO>")
 
   // Whether to perform partition column type inference. Default to true.
-  val PARTITION_COLUMN_TYPE_INFERENCE = "spark.sql.sources.partitionColumnTypeInference.enabled"
+  val PARTITION_COLUMN_TYPE_INFERENCE =
+    booleanConf("spark.sql.sources.partitionColumnTypeInference.enabled",
+      defaultValue = Some(true),
+      doc = "<TODO>")
 
   // The output committer class used by FSBasedRelation. The specified class needs to be a
   // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
   // NOTE: This property should be set in Hadoop `Configuration` rather than Spark `SQLConf`
-  val OUTPUT_COMMITTER_CLASS = "spark.sql.sources.outputCommitterClass"
+  val OUTPUT_COMMITTER_CLASS =
+    stringConf("spark.sql.sources.outputCommitterClass", isPublic = false)
 
   // Whether to perform eager analysis when constructing a dataframe.
   // Set to false when debugging requires the ability to look at invalid query plans.
-  val DATAFRAME_EAGER_ANALYSIS = "spark.sql.eagerAnalysis"
+  val DATAFRAME_EAGER_ANALYSIS = booleanConf("spark.sql.eagerAnalysis",
+    defaultValue = Some(true),
+    doc = "<TODO>")
 
   // Whether to automatically resolve ambiguity in join conditions for self-joins.
   // See SPARK-6231.
-  val DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY = "spark.sql.selfJoinAutoResolveAmbiguity"
+  val DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY =
+    booleanConf("spark.sql.selfJoinAutoResolveAmbiguity", defaultValue = Some(true), doc = "<TODO>")
 
   // Whether to retain group by columns or not in GroupedData.agg.
-  val DATAFRAME_RETAIN_GROUP_COLUMNS = "spark.sql.retainGroupColumns"
+  val DATAFRAME_RETAIN_GROUP_COLUMNS = booleanConf("spark.sql.retainGroupColumns",
+    defaultValue = Some(true),
+    doc = "<TODO>")
 
-  val USE_SQL_SERIALIZER2 = "spark.sql.useSerializer2"
+  val USE_SQL_SERIALIZER2 = booleanConf("spark.sql.useSerializer2",
+    defaultValue = Some(true), doc = "<TODO>")
 
-  val USE_JACKSON_STREAMING_API = "spark.sql.json.useJacksonStreamingAPI"
+  val USE_JACKSON_STREAMING_API = booleanConf("spark.sql.json.useJacksonStreamingAPI",
+    defaultValue = Some(true), doc = "<TODO>")
 
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
@@ -131,56 +390,54 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
    * Note that the choice of dialect does not affect things like what tables are available or
    * how query execution is performed.
    */
-  private[spark] def dialect: String = getConf(DIALECT, "sql")
+  private[spark] def dialect: String = getConf(DIALECT)
 
   /** When true tables cached using the in-memory columnar caching will be compressed. */
-  private[spark] def useCompression: Boolean = getConf(COMPRESS_CACHED, "true").toBoolean
+  private[spark] def useCompression: Boolean = getConf(COMPRESS_CACHED)
 
   /** The compression codec for writing to a Parquetfile */
-  private[spark] def parquetCompressionCodec: String = getConf(PARQUET_COMPRESSION, "gzip")
+  private[spark] def parquetCompressionCodec: String = getConf(PARQUET_COMPRESSION)
+
+  private[spark] def parquetCacheMetadata: Boolean = getConf(PARQUET_CACHE_METADATA)
 
   /** The number of rows that will be  */
-  private[spark] def columnBatchSize: Int = getConf(COLUMN_BATCH_SIZE, "10000").toInt
+  private[spark] def columnBatchSize: Int = getConf(COLUMN_BATCH_SIZE)
 
   /** Number of partitions to use for shuffle operators. */
-  private[spark] def numShufflePartitions: Int = getConf(SHUFFLE_PARTITIONS, "200").toInt
+  private[spark] def numShufflePartitions: Int = getConf(SHUFFLE_PARTITIONS)
 
   /** When true predicates will be passed to the parquet record reader when possible. */
-  private[spark] def parquetFilterPushDown =
-    getConf(PARQUET_FILTER_PUSHDOWN_ENABLED, "false").toBoolean
+  private[spark] def parquetFilterPushDown: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_ENABLED)
 
   /** When true uses Parquet implementation based on data source API */
-  private[spark] def parquetUseDataSourceApi =
-    getConf(PARQUET_USE_DATA_SOURCE_API, "true").toBoolean
+  private[spark] def parquetUseDataSourceApi: Boolean = getConf(PARQUET_USE_DATA_SOURCE_API)
 
-  private[spark] def orcFilterPushDown =
-    getConf(ORC_FILTER_PUSHDOWN_ENABLED, "false").toBoolean
+  private[spark] def orcFilterPushDown: Boolean = getConf(ORC_FILTER_PUSHDOWN_ENABLED)
 
   /** When true uses verifyPartitionPath to prune the path which is not exists. */
-  private[spark] def verifyPartitionPath =
-    getConf(HIVE_VERIFY_PARTITIONPATH, "true").toBoolean
+  private[spark] def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITIONPATH)
 
   /** When true the planner will use the external sort, which may spill to disk. */
-  private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT, "true").toBoolean
+  private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT)
 
   /**
    * Sort merge join would sort the two side of join first, and then iterate both sides together
    * only once to get all matches. Using sort merge join can save a lot of memory usage compared
    * to HashJoin.
    */
-  private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN, "false").toBoolean
+  private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN)
 
   /**
    * When set to true, Spark SQL will use the Janino at runtime to generate custom bytecode
    * that evaluates expressions found in queries.  In general this custom code runs much faster
    * than interpreted evaluation, but there are some start-up costs (5-10ms) due to compilation.
    */
-  private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED, "true").toBoolean
+  private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED)
 
   /**
    * caseSensitive analysis true by default
    */
-  def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, "true").toBoolean
+  def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE)
 
   /**
    * When set to true, Spark SQL will use managed memory for certain operations.  This option only
@@ -188,15 +445,14 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
    *
    * Defaults to false as this feature is currently experimental.
    */
-  private[spark] def unsafeEnabled: Boolean = getConf(UNSAFE_ENABLED, "false").toBoolean
+  private[spark] def unsafeEnabled: Boolean = getConf(UNSAFE_ENABLED)
 
-  private[spark] def useSqlSerializer2: Boolean = getConf(USE_SQL_SERIALIZER2, "true").toBoolean
+  private[spark] def useSqlSerializer2: Boolean = getConf(USE_SQL_SERIALIZER2)
 
   /**
    * Selects between the new (true) and old (false) JSON handlers, to be removed in Spark 1.5.0
    */
-  private[spark] def useJacksonStreamingAPI: Boolean =
-    getConf(USE_JACKSON_STREAMING_API, "true").toBoolean
+  private[spark] def useJacksonStreamingAPI: Boolean = getConf(USE_JACKSON_STREAMING_API)
 
   /**
    * Upper bound on the sizes (in bytes) of the tables qualified for the auto conversion to
@@ -205,8 +461,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
    *
    * Hive setting: hive.auto.convert.join.noconditionaltask.size, whose default value is 10000.
    */
-  private[spark] def autoBroadcastJoinThreshold: Int =
-    getConf(AUTO_BROADCASTJOIN_THRESHOLD, (10 * 1024 * 1024).toString).toInt
+  private[spark] def autoBroadcastJoinThreshold: Int = getConf(AUTO_BROADCASTJOIN_THRESHOLD)
 
   /**
    * The default size in bytes to assign to a logical operator's estimation statistics.  By default,
@@ -215,82 +470,116 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
    * in joins.
    */
   private[spark] def defaultSizeInBytes: Long =
-    getConf(DEFAULT_SIZE_IN_BYTES, (autoBroadcastJoinThreshold + 1).toString).toLong
+    getConf(DEFAULT_SIZE_IN_BYTES, autoBroadcastJoinThreshold + 1L)
 
   /**
    * When set to true, we always treat byte arrays in Parquet files as strings.
    */
-  private[spark] def isParquetBinaryAsString: Boolean =
-    getConf(PARQUET_BINARY_AS_STRING, "false").toBoolean
+  private[spark] def isParquetBinaryAsString: Boolean = getConf(PARQUET_BINARY_AS_STRING)
 
   /**
    * When set to true, we always treat INT96Values in Parquet files as timestamp.
    */
-  private[spark] def isParquetINT96AsTimestamp: Boolean =
-    getConf(PARQUET_INT96_AS_TIMESTAMP, "true").toBoolean
+  private[spark] def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP)
 
   /**
    * When set to true, partition pruning for in-memory columnar tables is enabled.
    */
-  private[spark] def inMemoryPartitionPruning: Boolean =
-    getConf(IN_MEMORY_PARTITION_PRUNING, "false").toBoolean
+  private[spark] def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING)
 
-  private[spark] def columnNameOfCorruptRecord: String =
-    getConf(COLUMN_NAME_OF_CORRUPT_RECORD, "_corrupt_record")
+  private[spark] def columnNameOfCorruptRecord: String = getConf(COLUMN_NAME_OF_CORRUPT_RECORD)
 
   /**
    * Timeout in seconds for the broadcast wait time in hash join
    */
-  private[spark] def broadcastTimeout: Int =
-    getConf(BROADCAST_TIMEOUT, (5 * 60).toString).toInt
+  private[spark] def broadcastTimeout: Int = getConf(BROADCAST_TIMEOUT)
 
-  private[spark] def defaultDataSourceName: String =
-    getConf(DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.parquet")
+  private[spark] def defaultDataSourceName: String = getConf(DEFAULT_DATA_SOURCE_NAME)
 
-  private[spark] def partitionDiscoveryEnabled() =
-    getConf(SQLConf.PARTITION_DISCOVERY_ENABLED, "true").toBoolean
+  private[spark] def partitionDiscoveryEnabled(): Boolean =
+    getConf(SQLConf.PARTITION_DISCOVERY_ENABLED)
 
-  private[spark] def partitionColumnTypeInferenceEnabled() =
-    getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE, "true").toBoolean
+  private[spark] def partitionColumnTypeInferenceEnabled(): Boolean =
+    getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE)
 
   // Do not use a value larger than 4000 as the default value of this property.
   // See the comments of SCHEMA_STRING_LENGTH_THRESHOLD above for more information.
-  private[spark] def schemaStringLengthThreshold: Int =
-    getConf(SCHEMA_STRING_LENGTH_THRESHOLD, "4000").toInt
+  private[spark] def schemaStringLengthThreshold: Int = getConf(SCHEMA_STRING_LENGTH_THRESHOLD)
 
-  private[spark] def dataFrameEagerAnalysis: Boolean =
-    getConf(DATAFRAME_EAGER_ANALYSIS, "true").toBoolean
+  private[spark] def dataFrameEagerAnalysis: Boolean = getConf(DATAFRAME_EAGER_ANALYSIS)
 
   private[spark] def dataFrameSelfJoinAutoResolveAmbiguity: Boolean =
-    getConf(DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY, "true").toBoolean
+    getConf(DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY)
 
-  private[spark] def dataFrameRetainGroupColumns: Boolean =
-    getConf(DATAFRAME_RETAIN_GROUP_COLUMNS, "true").toBoolean
+  private[spark] def dataFrameRetainGroupColumns: Boolean = getConf(DATAFRAME_RETAIN_GROUP_COLUMNS)
 
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */
   def setConf(props: Properties): Unit = settings.synchronized {
-    props.foreach { case (k, v) => settings.put(k, v) }
+    props.foreach { case (k, v) => setConfString(k, v) }
   }
 
-  /** Set the given Spark SQL configuration property. */
-  def setConf(key: String, value: String): Unit = {
+  /** Set the given Spark SQL configuration property using a `string` value. */
+  def setConfString(key: String, value: String): Unit = {
     require(key != null, "key cannot be null")
     require(value != null, s"value cannot be null for key: $key")
+    val entry = sqlConfEntries.get(key)
+    if (entry != null) {
+      // Only verify configs in the SQLConf object
+      entry.valueConverter(value)
+    }
     settings.put(key, value)
   }
 
+  /** Set the given Spark SQL configuration property. */
+  def setConf[T](entry: SQLConfEntry[T], value: T): Unit = {
+    require(entry != null, "entry cannot be null")
+    require(value != null, s"value cannot be null for key: ${entry.key}")
+    require(sqlConfEntries.get(entry.key) == entry, s"$entry is not registered")
+    settings.put(entry.key, entry.stringConverter(value))
+  }
+
   /** Return the value of Spark SQL configuration property for the given key. */
-  def getConf(key: String): String = {
-    Option(settings.get(key)).getOrElse(throw new NoSuchElementException(key))
+  def getConfString(key: String): String = {
+    Option(settings.get(key)).
+      orElse {
+        // Try to use the default value
+        Option(sqlConfEntries.get(key)).map(_.defaultValueString)
+      }.
+      getOrElse(throw new NoSuchElementException(key))
+  }
+
+  /**
+   * Return the value of Spark SQL configuration property for the given key. If the key is not set
+   * yet, return `defaultValue`. This is useful when `defaultValue` in SQLConfEntry is not the
+   * desired one.
+   */
+  def getConf[T](entry: SQLConfEntry[T], defaultValue: T): T = {
+    require(sqlConfEntries.get(entry.key) == entry, s"$entry is not registered")
+    Option(settings.get(entry.key)).map(entry.valueConverter).getOrElse(defaultValue)
   }
 
   /**
    * Return the value of Spark SQL configuration property for the given key. If the key is not set
-   * yet, return `defaultValue`.
+   * yet, return `defaultValue` in [[SQLConfEntry]].
+   */
+  def getConf[T](entry: SQLConfEntry[T]): T = {
+    require(sqlConfEntries.get(entry.key) == entry, s"$entry is not registered")
+    Option(settings.get(entry.key)).map(entry.valueConverter).orElse(entry.defaultValue).
+      getOrElse(throw new NoSuchElementException(entry.key))
+  }
+
+  /**
+   * Return the `string` value of Spark SQL configuration property for the given key. If the key is
+   * not set yet, return `defaultValue`.
    */
-  def getConf(key: String, defaultValue: String): String = {
+  def getConfString(key: String, defaultValue: String): String = {
+    val entry = sqlConfEntries.get(key)
+    if (entry != null && defaultValue != "<undefined>") {
+      // Only verify configs in the SQLConf object
+      entry.valueConverter(defaultValue)
+    }
     Option(settings.get(key)).getOrElse(defaultValue)
   }
 
@@ -300,11 +589,25 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
    */
   def getAllConfs: immutable.Map[String, String] = settings.synchronized { settings.toMap }
 
-  private[spark] def unsetConf(key: String) {
+  /**
+   * Return all the configuration definitions that have been defined in [[SQLConf]]. Each
+   * definition contains key, defaultValue and doc.
+   */
+  def getAllDefinedConfs: Seq[(String, String, String)] = sqlConfEntries.synchronized {
+    sqlConfEntries.values.filter(_.isPublic).map { entry =>
+      (entry.key, entry.defaultValueString, entry.doc)
+    }.toSeq
+  }
+
+  private[spark] def unsetConf(key: String): Unit = {
     settings -= key
   }
 
-  private[spark] def clear() {
+  private[spark] def unsetConf(entry: SQLConfEntry[_]): Unit = {
+    settings -= entry.key
+  }
+
+  private[spark] def clear(): Unit = {
     settings.clear()
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 6b605f7130167..04fc798bf3738 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -31,6 +31,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLConf.SQLConfEntry
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.errors.DialectException
 import org.apache.spark.sql.catalyst.expressions._
@@ -79,13 +80,16 @@ class SQLContext(@transient val sparkContext: SparkContext)
    */
   def setConf(props: Properties): Unit = conf.setConf(props)
 
+  /** Set the given Spark SQL configuration property. */
+  private[sql] def setConf[T](entry: SQLConfEntry[T], value: T): Unit = conf.setConf(entry, value)
+
   /**
    * Set the given Spark SQL configuration property.
    *
    * @group config
    * @since 1.0.0
    */
-  def setConf(key: String, value: String): Unit = conf.setConf(key, value)
+  def setConf(key: String, value: String): Unit = conf.setConfString(key, value)
 
   /**
    * Return the value of Spark SQL configuration property for the given key.
@@ -93,7 +97,22 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group config
    * @since 1.0.0
    */
-  def getConf(key: String): String = conf.getConf(key)
+  def getConf(key: String): String = conf.getConfString(key)
+
+  /**
+   * Return the value of Spark SQL configuration property for the given key. If the key is not set
+   * yet, return `defaultValue` in [[SQLConfEntry]].
+   */
+  private[sql] def getConf[T](entry: SQLConfEntry[T]): T = conf.getConf(entry)
+
+  /**
+   * Return the value of Spark SQL configuration property for the given key. If the key is not set
+   * yet, return `defaultValue`. This is useful when `defaultValue` in SQLConfEntry is not the
+   * desired one.
+   */
+  private[sql] def getConf[T](entry: SQLConfEntry[T], defaultValue: T): T = {
+    conf.getConf(entry, defaultValue)
+  }
 
   /**
    * Return the value of Spark SQL configuration property for the given key. If the key is not set
@@ -102,7 +121,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group config
    * @since 1.0.0
    */
-  def getConf(key: String, defaultValue: String): String = conf.getConf(key, defaultValue)
+  def getConf(key: String, defaultValue: String): String = conf.getConfString(key, defaultValue)
 
   /**
    * Return all the configuration properties that have been set (i.e. not the default).
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
index 305b306a79871..e59fa6e162900 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
@@ -44,8 +44,8 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr
 
     private val pair: Parser[LogicalPlan] =
       (key ~ ("=".r ~> value).?).? ^^ {
-        case None => SetCommand(None, output)
-        case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim)), output)
+        case None => SetCommand(None)
+        case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim)))
       }
 
     def apply(input: String): LogicalPlan = parseAll(pair, input) match {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
index c9dfcea5d051e..5e9951f248ff2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution
 
+import java.util.NoSuchElementException
+
 import org.apache.spark.Logging
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
@@ -75,48 +77,92 @@ private[sql] case class ExecutedCommand(cmd: RunnableCommand) extends SparkPlan
  * :: DeveloperApi ::
  */
 @DeveloperApi
-case class SetCommand(
-    kv: Option[(String, Option[String])],
-    override val output: Seq[Attribute])
-  extends RunnableCommand with Logging {
+case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableCommand with Logging {
+
+  private def keyValueOutput: Seq[Attribute] = {
+    val schema = StructType(
+      StructField("key", StringType, false) ::
+        StructField("value", StringType, false) :: Nil)
+    schema.toAttributes
+  }
 
-  override def run(sqlContext: SQLContext): Seq[Row] = kv match {
+  private val (_output, runFunc): (Seq[Attribute], SQLContext => Seq[Row]) = kv match {
     // Configures the deprecated "mapred.reduce.tasks" property.
     case Some((SQLConf.Deprecated.MAPRED_REDUCE_TASKS, Some(value))) =>
-      logWarning(
-        s"Property ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} is deprecated, " +
-          s"automatically converted to ${SQLConf.SHUFFLE_PARTITIONS} instead.")
-      if (value.toInt < 1) {
-        val msg = s"Setting negative ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} for automatically " +
-          "determining the number of reducers is not supported."
-        throw new IllegalArgumentException(msg)
-      } else {
-        sqlContext.setConf(SQLConf.SHUFFLE_PARTITIONS, value)
-        Seq(Row(s"${SQLConf.SHUFFLE_PARTITIONS}=$value"))
+      val runFunc = (sqlContext: SQLContext) => {
+        logWarning(
+          s"Property ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} is deprecated, " +
+            s"automatically converted to ${SQLConf.SHUFFLE_PARTITIONS.key} instead.")
+        if (value.toInt < 1) {
+          val msg =
+            s"Setting negative ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} for automatically " +
+              "determining the number of reducers is not supported."
+          throw new IllegalArgumentException(msg)
+        } else {
+          sqlContext.setConf(SQLConf.SHUFFLE_PARTITIONS.key, value)
+          Seq(Row(SQLConf.SHUFFLE_PARTITIONS.key, value))
+        }
       }
+      (keyValueOutput, runFunc)
 
     // Configures a single property.
     case Some((key, Some(value))) =>
-      sqlContext.setConf(key, value)
-      Seq(Row(s"$key=$value"))
+      val runFunc = (sqlContext: SQLContext) => {
+        sqlContext.setConf(key, value)
+        Seq(Row(key, value))
+      }
+      (keyValueOutput, runFunc)
 
-    // Queries all key-value pairs that are set in the SQLConf of the sqlContext.
-    // Notice that different from Hive, here "SET -v" is an alias of "SET".
     // (In Hive, "SET" returns all changed properties while "SET -v" returns all properties.)
-    case Some(("-v", None)) | None =>
-      sqlContext.getAllConfs.map { case (k, v) => Row(s"$k=$v") }.toSeq
+    // Queries all key-value pairs that are set in the SQLConf of the sqlContext.
+    case None =>
+      val runFunc = (sqlContext: SQLContext) => {
+        sqlContext.getAllConfs.map { case (k, v) => Row(k, v) }.toSeq
+      }
+      (keyValueOutput, runFunc)
+
+    // Queries all properties along with their default values and docs that are defined in the
+    // SQLConf of the sqlContext.
+    case Some(("-v", None)) =>
+      val runFunc = (sqlContext: SQLContext) => {
+        sqlContext.conf.getAllDefinedConfs.map { case (key, defaultValue, doc) =>
+          Row(key, defaultValue, doc)
+        }
+      }
+      val schema = StructType(
+        StructField("key", StringType, false) ::
+          StructField("default", StringType, false) ::
+          StructField("meaning", StringType, false) :: Nil)
+      (schema.toAttributes, runFunc)
 
     // Queries the deprecated "mapred.reduce.tasks" property.
     case Some((SQLConf.Deprecated.MAPRED_REDUCE_TASKS, None)) =>
-      logWarning(
-        s"Property ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} is deprecated, " +
-          s"showing ${SQLConf.SHUFFLE_PARTITIONS} instead.")
-      Seq(Row(s"${SQLConf.SHUFFLE_PARTITIONS}=${sqlContext.conf.numShufflePartitions}"))
+      val runFunc = (sqlContext: SQLContext) => {
+        logWarning(
+          s"Property ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} is deprecated, " +
+            s"showing ${SQLConf.SHUFFLE_PARTITIONS.key} instead.")
+        Seq(Row(SQLConf.SHUFFLE_PARTITIONS.key, sqlContext.conf.numShufflePartitions.toString))
+      }
+      (keyValueOutput, runFunc)
 
     // Queries a single property.
     case Some((key, None)) =>
-      Seq(Row(s"$key=${sqlContext.getConf(key, "<undefined>")}"))
+      val runFunc = (sqlContext: SQLContext) => {
+        val value =
+          try {
+            sqlContext.getConf(key)
+          } catch {
+            case _: NoSuchElementException => "<undefined>"
+          }
+        Seq(Row(key, value))
+      }
+      (keyValueOutput, runFunc)
   }
+
+  override val output: Seq[Attribute] = _output
+
+  override def run(sqlContext: SQLContext): Seq[Row] = runFunc(sqlContext)
+
 }
 
 /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index 3ee4033baee2e..2964edac1aba2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -48,7 +48,7 @@ package object debug {
    */
   implicit class DebugSQLContext(sqlContext: SQLContext) {
     def debug(): Unit = {
-      sqlContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "false")
+      sqlContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, false)
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 39360e13313a3..65ecad9878f8e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -113,12 +113,12 @@ private[sql] case class ParquetTableScan(
       .foreach(ParquetInputFormat.setFilterPredicate(conf, _))
 
     // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
-    conf.set(
-      SQLConf.PARQUET_CACHE_METADATA,
-      sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true"))
+    conf.setBoolean(
+      SQLConf.PARQUET_CACHE_METADATA.key,
+      sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, true))
 
     // Use task side metadata in parquet
-    conf.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true);
+    conf.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
 
     val baseRDD =
       new org.apache.spark.rdd.NewHadoopRDD(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index bba6f1ec96aa8..4c702c3b0d43f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -220,7 +220,7 @@ private[sql] class ParquetRelation2(
     }
 
     conf.setClass(
-      SQLConf.OUTPUT_COMMITTER_CLASS,
+      SQLConf.OUTPUT_COMMITTER_CLASS.key,
       committerClass,
       classOf[ParquetOutputCommitter])
 
@@ -259,7 +259,7 @@ private[sql] class ParquetRelation2(
       filters: Array[Filter],
       inputFiles: Array[FileStatus],
       broadcastedConf: Broadcast[SerializableWritable[Configuration]]): RDD[Row] = {
-    val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean
+    val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA)
     val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown
     // Create the function to set variable Parquet confs at both driver and executor side.
     val initLocalJobFuncOpt =
@@ -498,7 +498,7 @@ private[sql] object ParquetRelation2 extends Logging {
       ParquetTypesConverter.convertToString(dataSchema.toAttributes))
 
     // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
-    conf.set(SQLConf.PARQUET_CACHE_METADATA, useMetadataCache.toString)
+    conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache)
   }
 
   /** This closure sets input paths at the driver side. */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index 3dbe6faabf453..d39a20b388375 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -323,7 +323,7 @@ private[sql] abstract class BaseWriterContainer(
 
   private def newOutputCommitter(context: TaskAttemptContext): OutputCommitter = {
     val committerClass = context.getConfiguration.getClass(
-      SQLConf.OUTPUT_COMMITTER_CLASS, null, classOf[OutputCommitter])
+      SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter])
 
     Option(committerClass).map { clazz =>
       logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala
index 356a6100d2cf5..9fa394525d65c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala
@@ -38,7 +38,7 @@ class LocalSQLContext
   protected[sql] class SQLSession extends super.SQLSession {
     protected[sql] override lazy val conf: SQLConf = new SQLConf {
       /** Fewer partitions to speed up testing. */
-      override def numShufflePartitions: Int = this.getConf(SQLConf.SHUFFLE_PARTITIONS, "5").toInt
+      override def numShufflePartitions: Int = this.getConf(SQLConf.SHUFFLE_PARTITIONS, 5)
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 790b405c72697..b26d3ab253a1d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -68,12 +68,12 @@ class DataFrameAggregateSuite extends QueryTest {
       Seq(Row(1, 3), Row(2, 3), Row(3, 3))
     )
 
-    ctx.conf.setConf("spark.sql.retainGroupColumns", "false")
+    ctx.conf.setConf(SQLConf.DATAFRAME_RETAIN_GROUP_COLUMNS, false)
     checkAnswer(
       testData2.groupBy("a").agg(sum($"b")),
       Seq(Row(3), Row(3), Row(3))
     )
-    ctx.conf.setConf("spark.sql.retainGroupColumns", "true")
+    ctx.conf.setConf(SQLConf.DATAFRAME_RETAIN_GROUP_COLUMNS, true)
   }
 
   test("agg without groups") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index fa98e23e3d147..ba1d020f22f11 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -33,7 +33,7 @@ class DataFrameSuite extends QueryTest {
   test("analysis error should be eagerly reported") {
     val oldSetting = ctx.conf.dataFrameEagerAnalysis
     // Eager analysis.
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "true")
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, true)
 
     intercept[Exception] { testData.select('nonExistentName) }
     intercept[Exception] {
@@ -47,11 +47,11 @@ class DataFrameSuite extends QueryTest {
     }
 
     // No more eager analysis once the flag is turned off
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "false")
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, false)
     testData.select('nonExistentName)
 
     // Set the flag back to original value before this test.
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString)
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting)
   }
 
   test("dataframe toString") {
@@ -70,7 +70,7 @@ class DataFrameSuite extends QueryTest {
 
   test("invalid plan toString, debug mode") {
     val oldSetting = ctx.conf.dataFrameEagerAnalysis
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "true")
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, true)
 
     // Turn on debug mode so we can see invalid query plans.
     import org.apache.spark.sql.execution.debug._
@@ -83,7 +83,7 @@ class DataFrameSuite extends QueryTest {
         badPlan.toString)
 
     // Set the flag back to original value before this test.
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString)
+    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting)
   }
 
   test("access complex data") {
@@ -556,13 +556,13 @@ class DataFrameSuite extends QueryTest {
 
   test("SPARK-6899") {
     val originalValue = ctx.conf.codegenEnabled
-    ctx.setConf(SQLConf.CODEGEN_ENABLED, "true")
+    ctx.setConf(SQLConf.CODEGEN_ENABLED, true)
     try{
       checkAnswer(
         decimalData.agg(avg('a)),
         Row(new java.math.BigDecimal(2.0)))
     } finally {
-      ctx.setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
+      ctx.setConf(SQLConf.CODEGEN_ENABLED, originalValue)
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index ffd26c4f5a7c2..20390a5544304 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -95,14 +95,14 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         classOf[BroadcastNestedLoopJoin])
     ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
     try {
-      ctx.conf.setConf("spark.sql.planner.sortMergeJoin", "true")
+      ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, true)
       Seq(
         ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[SortMergeJoin]),
         ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[SortMergeJoin]),
         ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[SortMergeJoin])
       ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
     } finally {
-      ctx.conf.setConf("spark.sql.planner.sortMergeJoin", SORTMERGEJOIN_ENABLED.toString)
+      ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, SORTMERGEJOIN_ENABLED)
     }
   }
 
@@ -118,7 +118,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         classOf[BroadcastHashJoin])
     ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
     try {
-      ctx.conf.setConf("spark.sql.planner.sortMergeJoin", "true")
+      ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, true)
       Seq(
         ("SELECT * FROM testData join testData2 ON key = a", classOf[BroadcastHashJoin]),
         ("SELECT * FROM testData join testData2 ON key = a and key = 2",
@@ -127,7 +127,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
           classOf[BroadcastHashJoin])
       ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
     } finally {
-      ctx.conf.setConf("spark.sql.planner.sortMergeJoin", SORTMERGEJOIN_ENABLED.toString)
+      ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, SORTMERGEJOIN_ENABLED)
     }
 
     ctx.sql("UNCACHE TABLE testData")
@@ -416,7 +416,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
     ctx.sql("CACHE TABLE testData")
     val tmp = ctx.conf.autoBroadcastJoinThreshold
 
-    ctx.sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=1000000000")
+    ctx.sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=1000000000")
     Seq(
       ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a",
         classOf[BroadcastLeftSemiJoinHash])
@@ -424,7 +424,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       case (query, joinClass) => assertJoin(query, joinClass)
     }
 
-    ctx.sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1")
+    ctx.sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=-1")
 
     Seq(
       ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash])
@@ -432,7 +432,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       case (query, joinClass) => assertJoin(query, joinClass)
     }
 
-    ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, tmp.toString)
+    ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, tmp)
     ctx.sql("UNCACHE TABLE testData")
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfEntrySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfEntrySuite.scala
new file mode 100644
index 0000000000000..2e33777f14adc
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfEntrySuite.scala
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.SQLConf._
+
+class SQLConfEntrySuite extends SparkFunSuite {
+
+  val conf = new SQLConf
+
+  test("intConf") {
+    val key = "spark.sql.SQLConfEntrySuite.int"
+    val confEntry = SQLConfEntry.intConf(key)
+    assert(conf.getConf(confEntry, 5) === 5)
+
+    conf.setConf(confEntry, 10)
+    assert(conf.getConf(confEntry, 5) === 10)
+
+    conf.setConfString(key, "20")
+    assert(conf.getConfString(key, "5") === "20")
+    assert(conf.getConfString(key) === "20")
+    assert(conf.getConf(confEntry, 5) === 20)
+
+    val e = intercept[IllegalArgumentException] {
+      conf.setConfString(key, "abc")
+    }
+    assert(e.getMessage === s"$key should be int, but was abc")
+  }
+
+  test("longConf") {
+    val key = "spark.sql.SQLConfEntrySuite.long"
+    val confEntry = SQLConfEntry.longConf(key)
+    assert(conf.getConf(confEntry, 5L) === 5L)
+
+    conf.setConf(confEntry, 10L)
+    assert(conf.getConf(confEntry, 5L) === 10L)
+
+    conf.setConfString(key, "20")
+    assert(conf.getConfString(key, "5") === "20")
+    assert(conf.getConfString(key) === "20")
+    assert(conf.getConf(confEntry, 5L) === 20L)
+
+    val e = intercept[IllegalArgumentException] {
+      conf.setConfString(key, "abc")
+    }
+    assert(e.getMessage === s"$key should be long, but was abc")
+  }
+
+  test("booleanConf") {
+    val key = "spark.sql.SQLConfEntrySuite.boolean"
+    val confEntry = SQLConfEntry.booleanConf(key)
+    assert(conf.getConf(confEntry, false) === false)
+
+    conf.setConf(confEntry, true)
+    assert(conf.getConf(confEntry, false) === true)
+
+    conf.setConfString(key, "true")
+    assert(conf.getConfString(key, "false") === "true")
+    assert(conf.getConfString(key) === "true")
+    assert(conf.getConf(confEntry, false) === true)
+
+    val e = intercept[IllegalArgumentException] {
+      conf.setConfString(key, "abc")
+    }
+    assert(e.getMessage === s"$key should be boolean, but was abc")
+  }
+
+  test("doubleConf") {
+    val key = "spark.sql.SQLConfEntrySuite.double"
+    val confEntry = SQLConfEntry.doubleConf(key)
+    assert(conf.getConf(confEntry, 5.0) === 5.0)
+
+    conf.setConf(confEntry, 10.0)
+    assert(conf.getConf(confEntry, 5.0) === 10.0)
+
+    conf.setConfString(key, "20.0")
+    assert(conf.getConfString(key, "5.0") === "20.0")
+    assert(conf.getConfString(key) === "20.0")
+    assert(conf.getConf(confEntry, 5.0) === 20.0)
+
+    val e = intercept[IllegalArgumentException] {
+      conf.setConfString(key, "abc")
+    }
+    assert(e.getMessage === s"$key should be double, but was abc")
+  }
+
+  test("stringConf") {
+    val key = "spark.sql.SQLConfEntrySuite.string"
+    val confEntry = SQLConfEntry.stringConf(key)
+    assert(conf.getConf(confEntry, "abc") === "abc")
+
+    conf.setConf(confEntry, "abcd")
+    assert(conf.getConf(confEntry, "abc") === "abcd")
+
+    conf.setConfString(key, "abcde")
+    assert(conf.getConfString(key, "abc") === "abcde")
+    assert(conf.getConfString(key) === "abcde")
+    assert(conf.getConf(confEntry, "abc") === "abcde")
+  }
+
+  test("enumConf") {
+    val key = "spark.sql.SQLConfEntrySuite.enum"
+    val confEntry = SQLConfEntry.enumConf(key, v => v, Set("a", "b", "c"), defaultValue = Some("a"))
+    assert(conf.getConf(confEntry) === "a")
+
+    conf.setConf(confEntry, "b")
+    assert(conf.getConf(confEntry) === "b")
+
+    conf.setConfString(key, "c")
+    assert(conf.getConfString(key, "a") === "c")
+    assert(conf.getConfString(key) === "c")
+    assert(conf.getConf(confEntry) === "c")
+
+    val e = intercept[IllegalArgumentException] {
+      conf.setConfString(key, "d")
+    }
+    assert(e.getMessage === s"The value of $key should be one of a, b, c, but was d")
+  }
+
+  test("stringSeqConf") {
+    val key = "spark.sql.SQLConfEntrySuite.stringSeq"
+    val confEntry = SQLConfEntry.stringSeqConf("spark.sql.SQLConfEntrySuite.stringSeq",
+      defaultValue = Some(Nil))
+    assert(conf.getConf(confEntry, Seq("a", "b", "c")) === Seq("a", "b", "c"))
+
+    conf.setConf(confEntry, Seq("a", "b", "c", "d"))
+    assert(conf.getConf(confEntry, Seq("a", "b", "c")) === Seq("a", "b", "c", "d"))
+
+    conf.setConfString(key, "a,b,c,d,e")
+    assert(conf.getConfString(key, "a,b,c") === "a,b,c,d,e")
+    assert(conf.getConfString(key) === "a,b,c,d,e")
+    assert(conf.getConf(confEntry, Seq("a", "b", "c")) === Seq("a", "b", "c", "d", "e"))
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
index 76d0dd1744a41..75791e9d53c20 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala
@@ -75,6 +75,14 @@ class SQLConfSuite extends QueryTest {
   test("deprecated property") {
     ctx.conf.clear()
     ctx.sql(s"set ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS}=10")
-    assert(ctx.getConf(SQLConf.SHUFFLE_PARTITIONS) === "10")
+    assert(ctx.conf.numShufflePartitions === 10)
+  }
+
+  test("invalid conf value") {
+    ctx.conf.clear()
+    val e = intercept[IllegalArgumentException] {
+      ctx.sql(s"set ${SQLConf.CASE_SENSITIVE.key}=10")
+    }
+    assert(e.getMessage === s"${SQLConf.CASE_SENSITIVE.key} should be boolean, but was 10")
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 30db840166ca6..82f3fdb48b557 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -190,7 +190,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
 
   test("aggregation with codegen") {
     val originalValue = sqlContext.conf.codegenEnabled
-    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, "true")
+    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true)
     // Prepare a table that we can group some rows.
     sqlContext.table("testData")
       .unionAll(sqlContext.table("testData"))
@@ -287,7 +287,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
         Row(0, null, 0) :: Nil)
     } finally {
       sqlContext.dropTempTable("testData3x")
-      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString)
+      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue)
     }
   }
 
@@ -480,41 +480,41 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
 
   test("sorting") {
     val before = sqlContext.conf.externalSortEnabled
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, "false")
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, false)
     sortTest()
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, before.toString)
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, before)
   }
 
   test("external sorting") {
     val before = sqlContext.conf.externalSortEnabled
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, "true")
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, true)
     sortTest()
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, before.toString)
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, before)
   }
 
   test("SPARK-6927 sorting with codegen on") {
     val externalbefore = sqlContext.conf.externalSortEnabled
     val codegenbefore = sqlContext.conf.codegenEnabled
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, "false")
-    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, "true")
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, false)
+    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true)
     try{
       sortTest()
     } finally {
-      sqlContext.setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
-      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
+      sqlContext.setConf(SQLConf.EXTERNAL_SORT, externalbefore)
+      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, codegenbefore)
     }
   }
 
   test("SPARK-6927 external sorting with codegen on") {
     val externalbefore = sqlContext.conf.externalSortEnabled
     val codegenbefore = sqlContext.conf.codegenEnabled
-    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, "true")
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, "true")
+    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true)
+    sqlContext.setConf(SQLConf.EXTERNAL_SORT, true)
     try {
       sortTest()
     } finally {
-      sqlContext.setConf(SQLConf.EXTERNAL_SORT, externalbefore.toString)
-      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, codegenbefore.toString)
+      sqlContext.setConf(SQLConf.EXTERNAL_SORT, externalbefore)
+      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, codegenbefore)
     }
   }
 
@@ -908,25 +908,25 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     sql(s"SET $testKey=$testVal")
     checkAnswer(
       sql("SET"),
-      Row(s"$testKey=$testVal")
+      Row(testKey, testVal)
     )
 
     sql(s"SET ${testKey + testKey}=${testVal + testVal}")
     checkAnswer(
       sql("set"),
       Seq(
-        Row(s"$testKey=$testVal"),
-        Row(s"${testKey + testKey}=${testVal + testVal}"))
+        Row(testKey, testVal),
+        Row(testKey + testKey, testVal + testVal))
     )
 
     // "set key"
     checkAnswer(
       sql(s"SET $testKey"),
-      Row(s"$testKey=$testVal")
+      Row(testKey, testVal)
     )
     checkAnswer(
       sql(s"SET $nonexistentKey"),
-      Row(s"$nonexistentKey=<undefined>")
+      Row(nonexistentKey, "<undefined>")
     )
     sqlContext.conf.clear()
   }
@@ -1340,12 +1340,12 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SPARK-4699 case sensitivity SQL query") {
-    sqlContext.setConf(SQLConf.CASE_SENSITIVE, "false")
+    sqlContext.setConf(SQLConf.CASE_SENSITIVE, false)
     val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil
     val rdd = sqlContext.sparkContext.parallelize((0 to 1).map(i => data(i)))
     rdd.toDF().registerTempTable("testTable1")
     checkAnswer(sql("SELECT VALUE FROM TESTTABLE1 where KEY = 1"), Row("val_1"))
-    sqlContext.setConf(SQLConf.CASE_SENSITIVE, "true")
+    sqlContext.setConf(SQLConf.CASE_SENSITIVE, true)
   }
 
   test("SPARK-6145: ORDER BY test for nested fields") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
index 6545c6b314a4c..2c0879927a129 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
@@ -32,7 +32,7 @@ class PartitionBatchPruningSuite extends SparkFunSuite with BeforeAndAfterAll wi
 
   override protected def beforeAll(): Unit = {
     // Make a table with 5 partitions, 2 batches per partition, 10 elements per batch
-    ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, "10")
+    ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, 10)
 
     val pruningData = ctx.sparkContext.makeRDD((1 to 100).map { key =>
       val string = if (((key - 1) / 10) % 2 == 0) null else key.toString
@@ -41,14 +41,14 @@ class PartitionBatchPruningSuite extends SparkFunSuite with BeforeAndAfterAll wi
     pruningData.registerTempTable("pruningData")
 
     // Enable in-memory partition pruning
-    ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, "true")
+    ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true)
     // Enable in-memory table scan accumulators
     ctx.setConf("spark.sql.inMemoryTableScanStatistics.enable", "true")
   }
 
   override protected def afterAll(): Unit = {
-    ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize.toString)
-    ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning.toString)
+    ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize)
+    ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning)
   }
 
   before {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 3e27f58a92d01..5854ab48db552 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -63,7 +63,7 @@ class PlannerSuite extends SparkFunSuite {
 
   test("sizeInBytes estimation of limit operator for broadcast hash join optimization") {
     def checkPlan(fieldTypes: Seq[DataType], newThreshold: Int): Unit = {
-      setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, newThreshold.toString)
+      setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, newThreshold)
       val fields = fieldTypes.zipWithIndex.map {
         case (dataType, index) => StructField(s"c${index}", dataType, true)
       } :+ StructField("key", IntegerType, true)
@@ -119,12 +119,12 @@ class PlannerSuite extends SparkFunSuite {
 
     checkPlan(complexTypes, newThreshold = 901617)
 
-    setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold.toString)
+    setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold)
   }
 
   test("InMemoryRelation statistics propagation") {
     val origThreshold = conf.autoBroadcastJoinThreshold
-    setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, 81920.toString)
+    setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, 81920)
 
     testData.limit(3).registerTempTable("tiny")
     sql("CACHE TABLE tiny")
@@ -139,6 +139,6 @@ class PlannerSuite extends SparkFunSuite {
     assert(broadcastHashJoins.size === 1, "Should use broadcast hash join")
     assert(shuffledHashJoins.isEmpty, "Should not use shuffled hash join")
 
-    setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold.toString)
+    setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index fca24364fe6ec..945d4375035fd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -1077,14 +1077,14 @@ class JsonSuite extends QueryTest with TestJsonData {
   }
 
   test("SPARK-7565 MapType in JsonRDD") {
-    val useStreaming = ctx.getConf(SQLConf.USE_JACKSON_STREAMING_API, "true")
+    val useStreaming = ctx.conf.useJacksonStreamingAPI
     val oldColumnNameOfCorruptRecord = ctx.conf.columnNameOfCorruptRecord
     ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed")
 
     val schemaWithSimpleMap = StructType(
       StructField("map", MapType(StringType, IntegerType, true), false) :: Nil)
     try{
-      for (useStreaming <- List("true", "false")) {
+      for (useStreaming <- List(true, false)) {
         ctx.setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
         val temp = Utils.createTempDir().getPath
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
index fa5d4eca05d9f..a2763c78b6450 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
@@ -51,7 +51,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest {
       expected: Seq[Row]): Unit = {
     val output = predicate.collect { case a: Attribute => a }.distinct
 
-    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED -> "true") {
+    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
       val query = df
         .select(output.map(e => Column(e)): _*)
         .where(Column(predicate))
@@ -314,17 +314,17 @@ class ParquetDataSourceOnFilterSuite extends ParquetFilterSuiteBase with BeforeA
   lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
+    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
   }
 
   override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 
   test("SPARK-6554: don't push down predicates which reference partition columns") {
     import sqlContext.implicits._
 
-    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED -> "true") {
+    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
       withTempPath { dir =>
         val path = s"${dir.getCanonicalPath}/part=1"
         (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path)
@@ -343,17 +343,17 @@ class ParquetDataSourceOffFilterSuite extends ParquetFilterSuiteBase with Before
   lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
   }
 
   override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 
   test("SPARK-6742: don't push down predicates which reference partition columns") {
     import sqlContext.implicits._
 
-    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED -> "true") {
+    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
       withTempPath { dir =>
         val path = s"${dir.getCanonicalPath}/part=1"
         (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index fc827bc4ca11b..284d99d4938d1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -94,8 +94,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     val data = (1 to 4).map(i => Tuple1(i.toString))
     // Property spark.sql.parquet.binaryAsString shouldn't affect Parquet files written by Spark SQL
     // as we store Spark SQL schema in the extra metadata.
-    withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "false")(checkParquetFile(data))
-    withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "true")(checkParquetFile(data))
+    withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "false")(checkParquetFile(data))
+    withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "true")(checkParquetFile(data))
   }
 
   test("fixed-length decimals") {
@@ -231,7 +231,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
     val data = (0 until 10).map(i => (i, i.toString))
 
     def checkCompressionCodec(codec: CompressionCodecName): Unit = {
-      withSQLConf(SQLConf.PARQUET_COMPRESSION -> codec.name()) {
+      withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> codec.name()) {
         withParquetFile(data) { path =>
           assertResult(sqlContext.conf.parquetCompressionCodec.toUpperCase) {
             compressionCodecFor(path)
@@ -408,7 +408,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
       val clonedConf = new Configuration(configuration)
 
       configuration.set(
-        SQLConf.OUTPUT_COMMITTER_CLASS, classOf[ParquetOutputCommitter].getCanonicalName)
+        SQLConf.OUTPUT_COMMITTER_CLASS.key, classOf[ParquetOutputCommitter].getCanonicalName)
 
       configuration.set(
         "spark.sql.parquet.output.committer.class",
@@ -440,11 +440,11 @@ class ParquetDataSourceOnIOSuite extends ParquetIOSuiteBase with BeforeAndAfterA
   private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
+    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
   }
 
   override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API.key, originalConf.toString)
   }
 
   test("SPARK-6330 regression test") {
@@ -464,10 +464,10 @@ class ParquetDataSourceOffIOSuite extends ParquetIOSuiteBase with BeforeAndAfter
   private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
   }
 
   override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
index be3b34d5b9b70..fafad67dde3a7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -128,11 +128,11 @@ class ParquetDataSourceOnQuerySuite extends ParquetQuerySuiteBase with BeforeAnd
   private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
+    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
   }
 
   override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 }
 
@@ -140,10 +140,10 @@ class ParquetDataSourceOffQuerySuite extends ParquetQuerySuiteBase with BeforeAn
   private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
 
   override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
   }
 
   override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala
index 3f77960d09246..00cc7d5ea580f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala
@@ -27,7 +27,7 @@ abstract class DataSourceTest extends QueryTest with BeforeAndAfter {
   // We want to test some edge cases.
   protected implicit lazy val caseInsensitiveContext = {
     val ctx = new SQLContext(TestSQLContext.sparkContext)
-    ctx.setConf(SQLConf.CASE_SENSITIVE, "false")
+    ctx.setConf(SQLConf.CASE_SENSITIVE, false)
     ctx
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
index ac4a00a6f3dac..fa01823e9417c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
@@ -37,11 +37,11 @@ trait SQLTestUtils {
    */
   protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = {
     val (keys, values) = pairs.unzip
-    val currentValues = keys.map(key => Try(sqlContext.conf.getConf(key)).toOption)
-    (keys, values).zipped.foreach(sqlContext.conf.setConf)
+    val currentValues = keys.map(key => Try(sqlContext.conf.getConfString(key)).toOption)
+    (keys, values).zipped.foreach(sqlContext.conf.setConfString)
     try f finally {
       keys.zip(currentValues).foreach {
-        case (key, Some(value)) => sqlContext.conf.setConf(key, value)
+        case (key, Some(value)) => sqlContext.conf.setConfString(key, value)
         case (key, None) => sqlContext.conf.unsetConf(key)
       }
     }
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
index c9da25253e13f..700d994bb6a83 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
@@ -153,9 +153,9 @@ object HiveThriftServer2 extends Logging {
     val sessionList = new mutable.LinkedHashMap[String, SessionInfo]
     val executionList = new mutable.LinkedHashMap[String, ExecutionInfo]
     val retainedStatements =
-      conf.getConf(SQLConf.THRIFTSERVER_UI_STATEMENT_LIMIT, "200").toInt
+      conf.getConf(SQLConf.THRIFTSERVER_UI_STATEMENT_LIMIT)
     val retainedSessions =
-      conf.getConf(SQLConf.THRIFTSERVER_UI_SESSION_LIMIT, "200").toInt
+      conf.getConf(SQLConf.THRIFTSERVER_UI_SESSION_LIMIT)
     var totalRunning = 0
 
     override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
index e071103df925c..e8758887ff3a2 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
@@ -219,7 +219,7 @@ private[hive] class SparkExecuteStatementOperation(
       result = hiveContext.sql(statement)
       logDebug(result.queryExecution.toString())
       result.queryExecution.logical match {
-        case SetCommand(Some((SQLConf.THRIFTSERVER_POOL, Some(value))), _) =>
+        case SetCommand(Some((SQLConf.THRIFTSERVER_POOL.key, Some(value)))) =>
           sessionToActivePool(parentSession.getSessionHandle) = value
           logInfo(s"Setting spark.scheduler.pool=$value for future statements in this session.")
         case _ =>
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
index 178bd1f5cb164..301aa5a6411e2 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
@@ -113,8 +113,8 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest {
     withJdbcStatement { statement =>
       val resultSet = statement.executeQuery("SET spark.sql.hive.version")
       resultSet.next()
-      assert(resultSet.getString(1) ===
-        s"spark.sql.hive.version=${HiveContext.hiveExecutionVersion}")
+      assert(resultSet.getString(1) === "spark.sql.hive.version")
+      assert(resultSet.getString(2) === HiveContext.hiveExecutionVersion)
     }
   }
 
@@ -238,7 +238,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest {
       // first session, we get the default value of the session status
       { statement =>
 
-        val rs1 = statement.executeQuery(s"SET ${SQLConf.SHUFFLE_PARTITIONS}")
+        val rs1 = statement.executeQuery(s"SET ${SQLConf.SHUFFLE_PARTITIONS.key}")
         rs1.next()
         defaultV1 = rs1.getString(1)
         assert(defaultV1 != "200")
@@ -256,19 +256,21 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest {
       { statement =>
 
         val queries = Seq(
-            s"SET ${SQLConf.SHUFFLE_PARTITIONS}=291",
+            s"SET ${SQLConf.SHUFFLE_PARTITIONS.key}=291",
             "SET hive.cli.print.header=true"
             )
 
         queries.map(statement.execute)
-        val rs1 = statement.executeQuery(s"SET ${SQLConf.SHUFFLE_PARTITIONS}")
+        val rs1 = statement.executeQuery(s"SET ${SQLConf.SHUFFLE_PARTITIONS.key}")
         rs1.next()
-        assert("spark.sql.shuffle.partitions=291" === rs1.getString(1))
+        assert("spark.sql.shuffle.partitions" === rs1.getString(1))
+        assert("291" === rs1.getString(2))
         rs1.close()
 
         val rs2 = statement.executeQuery("SET hive.cli.print.header")
         rs2.next()
-        assert("hive.cli.print.header=true" === rs2.getString(1))
+        assert("hive.cli.print.header" === rs2.getString(1))
+        assert("true" === rs2.getString(2))
         rs2.close()
       },
 
@@ -276,7 +278,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest {
       // default value
       { statement =>
 
-        val rs1 = statement.executeQuery(s"SET ${SQLConf.SHUFFLE_PARTITIONS}")
+        val rs1 = statement.executeQuery(s"SET ${SQLConf.SHUFFLE_PARTITIONS.key}")
         rs1.next()
         assert(defaultV1 === rs1.getString(1))
         rs1.close()
@@ -404,8 +406,8 @@ class HiveThriftHttpServerSuite extends HiveThriftJdbcTest {
     withJdbcStatement { statement =>
       val resultSet = statement.executeQuery("SET spark.sql.hive.version")
       resultSet.next()
-      assert(resultSet.getString(1) ===
-        s"spark.sql.hive.version=${HiveContext.hiveExecutionVersion}")
+      assert(resultSet.getString(1) === "spark.sql.hive.version")
+      assert(resultSet.getString(2) === HiveContext.hiveExecutionVersion)
     }
   }
 }
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 82c0b494598a8..432de2564d080 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -47,17 +47,17 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     // Add Locale setting
     Locale.setDefault(Locale.US)
     // Set a relatively small column batch size for testing purposes
-    TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, "5")
+    TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, 5)
     // Enable in-memory partition pruning for testing purposes
-    TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, "true")
+    TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true)
   }
 
   override def afterAll() {
     TestHive.cacheTables = false
     TimeZone.setDefault(originalTimeZone)
     Locale.setDefault(originalLocale)
-    TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize.toString)
-    TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning.toString)
+    TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize)
+    TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning)
   }
 
   /** A list of tests deemed out of scope currently and thus completely disregarded. */
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
index 65d070bd3cbde..f458567e5d7ea 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
@@ -26,11 +26,11 @@ import org.apache.spark.sql.hive.test.TestHive
 class SortMergeCompatibilitySuite extends HiveCompatibilitySuite {
   override def beforeAll() {
     super.beforeAll()
-    TestHive.setConf(SQLConf.SORTMERGE_JOIN, "true")
+    TestHive.setConf(SQLConf.SORTMERGE_JOIN, true)
   }
 
   override def afterAll() {
-    TestHive.setConf(SQLConf.SORTMERGE_JOIN, "false")
+    TestHive.setConf(SQLConf.SORTMERGE_JOIN, false)
     super.afterAll()
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index c50835dd8f11d..4a66d6508ae0a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -21,15 +21,13 @@ import java.io.File
 import java.net.{URL, URLClassLoader}
 import java.sql.Timestamp
 
-import org.apache.hadoop.hive.common.StatsSetupConst
-import org.apache.hadoop.hive.common.`type`.HiveDecimal
-import org.apache.spark.sql.catalyst.ParserDialect
-
 import scala.collection.JavaConversions._
 import scala.collection.mutable.HashMap
 import scala.language.implicitConversions
 
 import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.hive.common.StatsSetupConst
+import org.apache.hadoop.hive.common.`type`.HiveDecimal
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.ql.metadata.Table
 import org.apache.hadoop.hive.ql.parse.VariableSubstitution
@@ -39,6 +37,9 @@ import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable}
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
+import org.apache.spark.sql.SQLConf.SQLConfEntry
+import org.apache.spark.sql.SQLConf.SQLConfEntry._
+import org.apache.spark.sql.catalyst.ParserDialect
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, SetCommand}
@@ -69,13 +70,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
 
   import HiveContext._
 
+  println("create HiveContext")
+
   /**
    * When true, enables an experimental feature where metastore tables that use the parquet SerDe
    * are automatically converted to use the Spark SQL parquet table scan, instead of the Hive
    * SerDe.
    */
-  protected[sql] def convertMetastoreParquet: Boolean =
-    getConf("spark.sql.hive.convertMetastoreParquet", "true") == "true"
+  protected[sql] def convertMetastoreParquet: Boolean = getConf(CONVERT_METASTORE_PARQUET)
 
   /**
    * When true, also tries to merge possibly different but compatible Parquet schemas in different
@@ -84,7 +86,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    * This configuration is only effective when "spark.sql.hive.convertMetastoreParquet" is true.
    */
   protected[sql] def convertMetastoreParquetWithSchemaMerging: Boolean =
-    getConf("spark.sql.hive.convertMetastoreParquet.mergeSchema", "false") == "true"
+    getConf(CONVERT_METASTORE_PARQUET_WITH_SCHEMA_MERGING)
 
   /**
    * When true, a table created by a Hive CTAS statement (no USING clause) will be
@@ -98,8 +100,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    *   - The CTAS statement specifies SequenceFile (STORED AS SEQUENCEFILE) as the file format
    *     and no SerDe is specified (no ROW FORMAT SERDE clause).
    */
-  protected[sql] def convertCTAS: Boolean =
-    getConf("spark.sql.hive.convertCTAS", "false").toBoolean
+  protected[sql] def convertCTAS: Boolean = getConf(CONVERT_CTAS)
 
   /**
    * The version of the hive client that will be used to communicate with the metastore.  Note that
@@ -117,8 +118,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    *              option is only valid when using the execution version of Hive.
    *  - maven - download the correct version of hive on demand from maven.
    */
-  protected[hive] def hiveMetastoreJars: String =
-    getConf(HIVE_METASTORE_JARS, "builtin")
+  protected[hive] def hiveMetastoreJars: String = getConf(HIVE_METASTORE_JARS)
 
   /**
    * A comma separated list of class prefixes that should be loaded using the classloader that
@@ -128,11 +128,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    * custom appenders that are used by log4j.
    */
   protected[hive] def hiveMetastoreSharedPrefixes: Seq[String] =
-    getConf("spark.sql.hive.metastore.sharedPrefixes", jdbcPrefixes)
-      .split(",").filterNot(_ == "")
-
-  private def jdbcPrefixes = Seq(
-    "com.mysql.jdbc", "org.postgresql", "com.microsoft.sqlserver", "oracle.jdbc").mkString(",")
+    getConf(HIVE_METASTORE_SHARED_PREFIXES).filterNot(_ == "")
 
   /**
    * A comma separated list of class prefixes that should explicitly be reloaded for each version
@@ -140,14 +136,12 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    * prefix that typically would be shared (i.e. org.apache.spark.*)
    */
   protected[hive] def hiveMetastoreBarrierPrefixes: Seq[String] =
-    getConf("spark.sql.hive.metastore.barrierPrefixes", "")
-      .split(",").filterNot(_ == "")
+    getConf(HIVE_METASTORE_BARRIER_PREFIXES).filterNot(_ == "")
 
   /*
    * hive thrift server use background spark sql thread pool to execute sql queries
    */
-  protected[hive] def hiveThriftServerAsync: Boolean =
-    getConf("spark.sql.hive.thriftServer.async", "true").toBoolean
+  protected[hive] def hiveThriftServerAsync: Boolean = getConf(HIVE_THRIFT_SERVER_ASYNC)
 
   @transient
   protected[sql] lazy val substitutor = new VariableSubstitution()
@@ -364,7 +358,11 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
     hiveconf.set(key, value)
   }
 
-  /* A catalyst metadata catalog that points to the Hive Metastore. */
+  private[sql] override def setConf[T](entry: SQLConfEntry[T], value: T): Unit = {
+    setConf(entry.key, entry.stringConverter(value))
+  }
+
+    /* A catalyst metadata catalog that points to the Hive Metastore. */
   @transient
   override protected[sql] lazy val catalog =
     new HiveMetastoreCatalog(metadataHive, this) with OverrideCatalog
@@ -402,8 +400,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
   protected[hive] class SQLSession extends super.SQLSession {
     protected[sql] override lazy val conf: SQLConf = new SQLConf {
       override def dialect: String = getConf(SQLConf.DIALECT, "hiveql")
-      override def caseSensitiveAnalysis: Boolean =
-        getConf(SQLConf.CASE_SENSITIVE, "false").toBoolean
+      override def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, false)
     }
 
     /**
@@ -519,7 +516,50 @@ private[hive] object HiveContext {
   val hiveExecutionVersion: String = "0.13.1"
 
   val HIVE_METASTORE_VERSION: String = "spark.sql.hive.metastore.version"
-  val HIVE_METASTORE_JARS: String = "spark.sql.hive.metastore.jars"
+  val HIVE_METASTORE_JARS = stringConf("spark.sql.hive.metastore.jars",
+    defaultValue = Some("builtin"),
+    doc = "Location of the jars that should be used to instantiate the HiveMetastoreClient. This" +
+      " property can be one of three options: " +
+      "1. \"builtin\" Use Hive 0.13.1, which is bundled with the Spark assembly jar when " +
+      "<code>-Phive</code> is enabled. When this option is chosen, " +
+      "spark.sql.hive.metastore.version must be either <code>0.13.1</code> or not defined. " +
+      "2. \"maven\" Use Hive jars of specified version downloaded from Maven repositories." +
+      "3. A classpath in the standard format for both Hive and Hadoop.")
+
+  val CONVERT_METASTORE_PARQUET = booleanConf("spark.sql.hive.convertMetastoreParquet",
+    defaultValue = Some(true),
+    doc = "When set to false, Spark SQL will use the Hive SerDe for parquet tables instead of " +
+      "the built in support.")
+
+  val CONVERT_METASTORE_PARQUET_WITH_SCHEMA_MERGING = booleanConf(
+    "spark.sql.hive.convertMetastoreParquet.mergeSchema",
+    defaultValue = Some(false),
+    doc = "TODO")
+
+  val CONVERT_CTAS = booleanConf("spark.sql.hive.convertCTAS",
+    defaultValue = Some(false),
+    doc = "TODO")
+
+  val HIVE_METASTORE_SHARED_PREFIXES = stringSeqConf("spark.sql.hive.metastore.sharedPrefixes",
+    defaultValue = Some(jdbcPrefixes),
+    doc = "A comma separated list of class prefixes that should be loaded using the classloader " +
+      "that is shared between Spark SQL and a specific version of Hive. An example of classes " +
+      "that should be shared is JDBC drivers that are needed to talk to the metastore. Other " +
+      "classes that need to be shared are those that interact with classes that are already " +
+      "shared. For example, custom appenders that are used by log4j.")
+
+  private def jdbcPrefixes = Seq(
+    "com.mysql.jdbc", "org.postgresql", "com.microsoft.sqlserver", "oracle.jdbc")
+
+  val HIVE_METASTORE_BARRIER_PREFIXES = stringSeqConf("spark.sql.hive.metastore.barrierPrefixes",
+    defaultValue = Some(Seq()),
+    doc = "A comma separated list of class prefixes that should explicitly be reloaded for each " +
+      "version of Hive that Spark SQL is communicating with. For example, Hive UDFs that are " +
+      "declared in a prefix that typically would be shared (i.e. <code>org.apache.spark.*</code>).")
+
+  val HIVE_THRIFT_SERVER_ASYNC = booleanConf("spark.sql.hive.thriftServer.async",
+    defaultValue = Some(true),
+    doc = "TODO")
 
   /** Constructs a configuration for hive, where the metastore is located in a temp directory. */
   def newTemporaryConfiguration(): Map[String, String] = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 92155096202b3..f901bd8171508 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -112,12 +112,11 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
   protected[hive] class SQLSession extends super.SQLSession {
     /** Fewer partitions to speed up testing. */
     protected[sql] override lazy val conf: SQLConf = new SQLConf {
-      override def numShufflePartitions: Int = getConf(SQLConf.SHUFFLE_PARTITIONS, "5").toInt
+      override def numShufflePartitions: Int = getConf(SQLConf.SHUFFLE_PARTITIONS, 5)
       // TODO as in unit test, conf.clear() probably be called, all of the value will be cleared.
       // The super.getConf(SQLConf.DIALECT) is "sql" by default, we need to set it as "hiveql"
       override def dialect: String = super.getConf(SQLConf.DIALECT, "hiveql")
-      override def caseSensitiveAnalysis: Boolean =
-        getConf(SQLConf.CASE_SENSITIVE, "false").toBoolean
+      override def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, false)
     }
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
index a0d80dc39c108..af68615e8e9d6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
@@ -81,11 +81,11 @@ class HiveParquetSuite extends QueryTest with ParquetTest {
     }
   }
 
-  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") {
+  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API.key -> "true") {
     run("Parquet data source enabled")
   }
 
-  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") {
+  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API.key -> "false") {
     run("Parquet data source disabled")
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 79a85b24d2f60..cc294bc3e8bc3 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -456,7 +456,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
       withTable("savedJsonTable") {
         val df = (1 to 10).map(i => i -> s"str$i").toDF("a", "b")
 
-        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "json") {
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "json") {
           // Save the df as a managed table (by not specifying the path).
           df.write.saveAsTable("savedJsonTable")
 
@@ -484,7 +484,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
         }
 
         // Create an external table by specifying the path.
-        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "not a source name") {
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "not a source name") {
           df.write
             .format("org.apache.spark.sql.json")
             .mode(SaveMode.Append)
@@ -508,7 +508,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
           s"""{ "a": $i, "b": "str$i" }"""
         }))
 
-        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "not a source name") {
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "not a source name") {
           df.write
             .format("json")
             .mode(SaveMode.Append)
@@ -516,7 +516,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
             .saveAsTable("savedJsonTable")
         }
 
-        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "json") {
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "json") {
           createExternalTable("createdJsonTable", tempPath.toString)
           assert(table("createdJsonTable").schema === df.schema)
           checkAnswer(sql("SELECT * FROM createdJsonTable"), df)
@@ -533,7 +533,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
         checkAnswer(read.json(tempPath.toString), df)
 
         // Try to specify the schema.
-        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME -> "not a source name") {
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "not a source name") {
           val schema = StructType(StructField("b", StringType, true) :: Nil)
           createExternalTable(
             "createdJsonTable",
@@ -563,8 +563,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
 
   test("scan a parquet table created through a CTAS statement") {
     withSQLConf(
-      "spark.sql.hive.convertMetastoreParquet" -> "true",
-      SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") {
+      HiveContext.CONVERT_METASTORE_PARQUET.key -> "true",
+      SQLConf.PARQUET_USE_DATA_SOURCE_API.key -> "true") {
 
       withTempTable("jt") {
         (1 to 10).map(i => i -> s"str$i").toDF("a", "b").registerTempTable("jt")
@@ -706,7 +706,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
   }
 
   test("SPARK-6024 wide schema support") {
-    withSQLConf(SQLConf.SCHEMA_STRING_LENGTH_THRESHOLD -> "4000") {
+    withSQLConf(SQLConf.SCHEMA_STRING_LENGTH_THRESHOLD.key -> "4000") {
       withTable("wide_schema") {
         // We will need 80 splits for this schema if the threshold is 4000.
         val schema = StructType((1 to 5000).map(i => StructField(s"c_$i", StringType, true)))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 78c94e6490e36..f067ea0d4fc75 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -167,7 +167,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
       ctx.conf.settings.synchronized {
         val tmp = ctx.conf.autoBroadcastJoinThreshold
 
-        sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1""")
+        sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=-1""")
         df = sql(query)
         bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastHashJoin => j }
         assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off")
@@ -176,7 +176,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
         assert(shj.size === 1,
           "ShuffledHashJoin should be planned when BroadcastHashJoin is turned off")
 
-        sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=$tmp""")
+        sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=$tmp""")
       }
 
       after()
@@ -225,7 +225,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
     ctx.conf.settings.synchronized {
       val tmp = ctx.conf.autoBroadcastJoinThreshold
 
-      sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1")
+      sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=-1")
       df = sql(leftSemiJoinQuery)
       bhj = df.queryExecution.sparkPlan.collect {
         case j: BroadcastLeftSemiJoinHash => j
@@ -238,7 +238,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
       assert(shj.size === 1,
         "LeftSemiJoinHash should be planned when BroadcastHashJoin is turned off")
 
-      sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=$tmp")
+      sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=$tmp")
     }
 
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 6d8d99ebc8164..51dabc67fa7c1 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -1084,14 +1084,16 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
     val testKey = "spark.sql.key.usedfortestonly"
     val testVal = "test.val.0"
     val nonexistentKey = "nonexistent"
-    val KV = "([^=]+)=([^=]*)".r
-    def collectResults(df: DataFrame): Set[(String, String)] =
+    def collectResults(df: DataFrame): Set[Any] =
       df.collect().map {
         case Row(key: String, value: String) => key -> value
-        case Row(KV(key, value)) => key -> value
+        case Row(key: String, defaultValue: String, doc: String) => (key, defaultValue, doc)
       }.toSet
     conf.clear()
 
+    val expectedConfs = conf.getAllDefinedConfs.toSet
+    assertResult(expectedConfs)(collectResults(sql("SET -v")))
+
     // "SET" itself returns all config variables currently specified in SQLConf.
     // TODO: Should we be listing the default here always? probably...
     assert(sql("SET").collect().size == 0)
@@ -1102,16 +1104,12 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
 
     assert(hiveconf.get(testKey, "") == testVal)
     assertResult(Set(testKey -> testVal))(collectResults(sql("SET")))
-    assertResult(Set(testKey -> testVal))(collectResults(sql("SET -v")))
 
     sql(s"SET ${testKey + testKey}=${testVal + testVal}")
     assert(hiveconf.get(testKey + testKey, "") == testVal + testVal)
     assertResult(Set(testKey -> testVal, (testKey + testKey) -> (testVal + testVal))) {
       collectResults(sql("SET"))
     }
-    assertResult(Set(testKey -> testVal, (testKey + testKey) -> (testVal + testVal))) {
-      collectResults(sql("SET -v"))
-    }
 
     // "SET key"
     assertResult(Set(testKey -> testVal)) {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 984d97d27bf54..e1c9926bed524 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
-import org.apache.spark.sql.hive.{HiveQLDialect, MetastoreRelation}
+import org.apache.spark.sql.hive.{HiveContext, HiveQLDialect, MetastoreRelation}
 import org.apache.spark.sql.parquet.ParquetRelation2
 import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.types._
@@ -191,9 +191,9 @@ class SQLQuerySuite extends QueryTest {
       }
     }
 
-    val originalConf = getConf("spark.sql.hive.convertCTAS", "false")
+    val originalConf = convertCTAS
 
-    setConf("spark.sql.hive.convertCTAS", "true")
+    setConf(HiveContext.CONVERT_CTAS, true)
 
     sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value")
     sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value")
@@ -235,7 +235,7 @@ class SQLQuerySuite extends QueryTest {
     checkRelation("ctas1", false)
     sql("DROP TABLE ctas1")
 
-    setConf("spark.sql.hive.convertCTAS", originalConf)
+    setConf(HiveContext.CONVERT_CTAS, originalConf)
   }
 
   test("SQL Dialect Switching") {
@@ -332,7 +332,7 @@ class SQLQuerySuite extends QueryTest {
 
     val origUseParquetDataSource = conf.parquetUseDataSourceApi
     try {
-      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
       sql(
         """CREATE TABLE ctas5
           | STORED AS parquet AS
@@ -348,7 +348,7 @@ class SQLQuerySuite extends QueryTest {
         "MANAGED_TABLE"
       )
 
-      val default = getConf("spark.sql.hive.convertMetastoreParquet", "true")
+      val default = convertMetastoreParquet
       // use the Hive SerDe for parquet tables
       sql("set spark.sql.hive.convertMetastoreParquet = false")
       checkAnswer(
@@ -356,7 +356,7 @@ class SQLQuerySuite extends QueryTest {
         sql("SELECT key, value FROM src ORDER BY key, value").collect().toSeq)
       sql(s"set spark.sql.hive.convertMetastoreParquet = $default")
     } finally {
-      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, origUseParquetDataSource.toString)
+      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, origUseParquetDataSource)
     }
   }
 
@@ -603,8 +603,8 @@ class SQLQuerySuite extends QueryTest {
     // generates an invalid query plan.
     val rdd = sparkContext.makeRDD((1 to 5).map(i => s"""{"a":[$i, ${i + 1}]}"""))
     read.json(rdd).registerTempTable("data")
-    val originalConf = getConf("spark.sql.hive.convertCTAS", "false")
-    setConf("spark.sql.hive.convertCTAS", "false")
+    val originalConf = convertCTAS
+    setConf(HiveContext.CONVERT_CTAS, false)
 
     sql("CREATE TABLE explodeTest (key bigInt)")
     table("explodeTest").queryExecution.analyzed match {
@@ -621,7 +621,7 @@ class SQLQuerySuite extends QueryTest {
 
     sql("DROP TABLE explodeTest")
     dropTempTable("data")
-    setConf("spark.sql.hive.convertCTAS", originalConf)
+    setConf(HiveContext.CONVERT_CTAS, originalConf)
   }
 
   test("sanity test for SPARK-6618") {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 3864349cdbd89..c2e09800933b5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -153,7 +153,7 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
     val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":[$i, null]}"""))
     read.json(rdd2).registerTempTable("jt_array")
 
-    setConf("spark.sql.hive.convertMetastoreParquet", "true")
+    setConf(HiveContext.CONVERT_METASTORE_PARQUET, true)
   }
 
   override def afterAll(): Unit = {
@@ -164,7 +164,7 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
     sql("DROP TABLE normal_parquet")
     sql("DROP TABLE IF EXISTS jt")
     sql("DROP TABLE IF EXISTS jt_array")
-    setConf("spark.sql.hive.convertMetastoreParquet", "false")
+    setConf(HiveContext.CONVERT_METASTORE_PARQUET, false)
   }
 
   test(s"conversion is working") {
@@ -199,14 +199,14 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
         |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
       """.stripMargin)
 
-    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
+    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
   }
 
   override def afterAll(): Unit = {
     super.afterAll()
     sql("DROP TABLE IF EXISTS test_parquet")
 
-    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 
   test("scan an empty parquet table") {
@@ -546,12 +546,12 @@ class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase {
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
   }
 
   override def afterAll(): Unit = {
     super.afterAll()
-    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 
   test("MetastoreRelation in InsertIntoTable will not be converted") {
@@ -692,12 +692,12 @@ class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
+    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
   }
 
   override def afterAll(): Unit = {
     super.afterAll()
-    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 
   test("values in arrays and maps stored in parquet are always nullable") {
@@ -750,12 +750,12 @@ class ParquetDataSourceOffSourceSuite extends ParquetSourceSuiteBase {
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
   }
 
   override def afterAll(): Unit = {
     super.afterAll()
-    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
+    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
   }
 }
 

From fee3438a32136a8edbca71efb566965587a88826 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Wed, 17 Jun 2015 23:31:30 -0700
Subject: [PATCH 524/525] [SPARK-8218][SQL] Add binary log math function

JIRA: https://issues.apache.org/jira/browse/SPARK-8218

Because there is already `log` unary function defined, the binary log function is called `logarithm` for now.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6725 from viirya/expr_binary_log and squashes the following commits:

bf96bd9 [Liang-Chi Hsieh] Compare log result in string.
102070d [Liang-Chi Hsieh] Round log result to better comparing in python test.
fd01863 [Liang-Chi Hsieh] For comments.
beed631 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into expr_binary_log
6089d11 [Liang-Chi Hsieh] Remove unnecessary override.
8cf37b7 [Liang-Chi Hsieh] For comments.
bc89597 [Liang-Chi Hsieh] For comments.
db7dc38 [Liang-Chi Hsieh] Use ctor instead of companion object.
0634ef7 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into expr_binary_log
1750034 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into expr_binary_log
3d75bfc [Liang-Chi Hsieh] Fix scala style.
5b39c02 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into expr_binary_log
23c54a3 [Liang-Chi Hsieh] Fix scala style.
ebc9929 [Liang-Chi Hsieh] Let Logarithm accept one parameter too.
605574d [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into expr_binary_log
21c3bfd [Liang-Chi Hsieh] Fix scala style.
c6c187f [Liang-Chi Hsieh] For comments.
c795342 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into expr_binary_log
f373bac [Liang-Chi Hsieh] Add binary log expression.
---
 python/pyspark/sql/functions.py               | 18 ++++++++++++++++-
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../spark/sql/catalyst/expressions/math.scala | 20 +++++++++++++++++++
 .../expressions/MathFunctionsSuite.scala      | 18 +++++++++++++++++
 .../org/apache/spark/sql/functions.scala      | 16 +++++++++++++++
 .../spark/sql/MathExpressionsSuite.scala      | 13 ++++++++++++
 6 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index bbf465aca8d4d..177fc196e0834 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -18,6 +18,7 @@
 """
 A collections of builtin functions
 """
+import math
 import sys
 
 if sys.version < "3":
@@ -143,7 +144,7 @@ def _():
     'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' +
              'polar coordinates (r, theta).',
     'hypot': 'Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.',
-    'pow': 'Returns the value of the first argument raised to the power of the second argument.'
+    'pow': 'Returns the value of the first argument raised to the power of the second argument.',
 }
 
 _window_functions = {
@@ -403,6 +404,21 @@ def when(condition, value):
     return Column(jc)
 
 
+@since(1.4)
+def log(col, base=math.e):
+    """Returns the first argument-based logarithm of the second argument.
+
+    >>> df.select(log(df.age, 10.0).alias('ten')).map(lambda l: str(l.ten)[:7]).collect()
+    ['0.30102', '0.69897']
+
+    >>> df.select(log(df.age).alias('e')).map(lambda l: str(l.e)[:7]).collect()
+    ['0.69314', '1.60943']
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.log(base, _to_java_column(col))
+    return Column(jc)
+
+
 @since(1.4)
 def lag(col, count=1, default=None):
     """
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 97b123ec2f6d9..13b2bb05f5280 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -112,6 +112,7 @@ object FunctionRegistry {
     expression[Expm1]("expm1"),
     expression[Floor]("floor"),
     expression[Hypot]("hypot"),
+    expression[Logarithm]("log"),
     expression[Log]("ln"),
     expression[Log10]("log10"),
     expression[Log1p]("log1p"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 42c596b5b31ab..67cb0b508ca9e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -255,3 +255,23 @@ case class Pow(left: Expression, right: Expression)
       """
   }
 }
+
+case class Logarithm(left: Expression, right: Expression)
+  extends BinaryMathExpression((c1, c2) => math.log(c2) / math.log(c1), "LOG") {
+  def this(child: Expression) = {
+    this(EulerNumber(), child)
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val logCode = if (left.isInstanceOf[EulerNumber]) {
+      defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.log($c2)")
+    } else {
+      defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.log($c2) / java.lang.Math.log($c1)")
+    }
+    logCode + s"""
+      if (Double.valueOf(${ev.primitive}).isNaN()) {
+        ${ev.isNull} = true;
+      }
+    """
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 864c954ee82cb..0050ad3fe8302 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -204,4 +204,22 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     testBinary(Atan2, math.atan2)
   }
 
+  test("binary log") {
+    val f = (c1: Double, c2: Double) => math.log(c2) / math.log(c1)
+    val domain = (1 to 20).map(v => (v * 0.1, v * 0.2))
+
+    domain.foreach { case (v1, v2) =>
+      checkEvaluation(Logarithm(Literal(v1), Literal(v2)), f(v1 + 0.0, v2 + 0.0), EmptyRow)
+      checkEvaluation(Logarithm(Literal(v2), Literal(v1)), f(v2 + 0.0, v1 + 0.0), EmptyRow)
+      checkEvaluation(new Logarithm(Literal(v1)), f(math.E, v1 + 0.0), EmptyRow)
+    }
+    checkEvaluation(
+      Logarithm(Literal.create(null, DoubleType), Literal(1.0)),
+      null,
+      create_row(null))
+    checkEvaluation(
+      Logarithm(Literal(1.0), Literal.create(null, DoubleType)),
+      null,
+      create_row(null))
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index c5b77724aae17..dff0932c450a8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1083,6 +1083,22 @@ object functions {
    */
   def log(columnName: String): Column = log(Column(columnName))
 
+  /**
+   * Returns the first argument-base logarithm of the second argument.
+   *
+   * @group math_funcs
+   * @since 1.4.0
+   */
+  def log(base: Double, a: Column): Column = Logarithm(lit(base).expr, a.expr)
+
+  /**
+   * Returns the first argument-base logarithm of the second argument.
+   *
+   * @group math_funcs
+   * @since 1.4.0
+   */
+  def log(base: Double, columnName: String): Column = log(base, Column(columnName))
+
   /**
    * Computes the logarithm of the given value in base 10.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index e2daaf6b730c5..7c9c121b956bb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -236,6 +236,19 @@ class MathExpressionsSuite extends QueryTest {
     testOneToOneNonNegativeMathFunction(log1p, math.log1p)
   }
 
+  test("binary log") {
+    val df = Seq[(Integer, Integer)]((123, null)).toDF("a", "b")
+    checkAnswer(
+      df.select(org.apache.spark.sql.functions.log("a"),
+        org.apache.spark.sql.functions.log(2.0, "a"),
+        org.apache.spark.sql.functions.log("b")),
+      Row(math.log(123), math.log(123) / math.log(2), null))
+
+    checkAnswer(
+      df.selectExpr("log(a)", "log(2.0, a)", "log(b)"),
+      Row(math.log(123), math.log(123) / math.log(2), null))
+  }
+
   test("abs") {
     val input =
       Seq[(java.lang.Double, java.lang.Double)]((null, null), (0.0, 0.0), (1.5, 1.5), (-2.5, 2.5))

From e86fbdb1e6f1538f65ef78d90bbc41604f6bd580 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Wed, 17 Jun 2015 23:46:57 -0700
Subject: [PATCH 525/525] [SPARK-8283][SQL] Resolve udf_struct test failure in
 HiveCompatibilitySuite

This PR aimed to resolve udf_struct test failure in HiveCompatibilitySuite.

Currently, this is done by loosening CreateStruct's children type from NamedExpression to Expression and automatically generating StructField name for non-NamedExpression children.

The naming convention for unnamed children follows the udf's counterpart in Hive:
`col1, col2, col3, ...`

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #6828 from yijieshen/SPARK-8283 and squashes the following commits:

6052b73 [Yijie Shen] Doc fix
677e0b7 [Yijie Shen] Resolve udf_struct test failure by automatically generate structField name for non-NamedExpression children
---
 .../sql/catalyst/expressions/complexTypes.scala     | 13 +++++++++----
 .../sql/hive/execution/HiveCompatibilitySuite.scala |  2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
index 1aaf9b309efc3..72fdcebb4cbc8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala
@@ -53,7 +53,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
  * Returns a Row containing the evaluation of all children expressions.
  * TODO: [[CreateStruct]] does not support codegen.
  */
-case class CreateStruct(children: Seq[NamedExpression]) extends Expression {
+case class CreateStruct(children: Seq[Expression]) extends Expression {
 
   override def foldable: Boolean = children.forall(_.foldable)
 
@@ -62,9 +62,14 @@ case class CreateStruct(children: Seq[NamedExpression]) extends Expression {
   override lazy val dataType: StructType = {
     assert(resolved,
       s"CreateStruct contains unresolvable children: ${children.filterNot(_.resolved)}.")
-    val fields = children.map { child =>
-      StructField(child.name, child.dataType, child.nullable, child.metadata)
-    }
+      val fields = children.zipWithIndex.map { case (child, idx) =>
+        child match {
+          case ne: NamedExpression =>
+            StructField(ne.name, ne.dataType, ne.nullable, ne.metadata)
+          case _ =>
+            StructField(s"col${idx + 1}", child.dataType, child.nullable, Metadata.empty)
+        }
+      }
     StructType(fields)
   }
 
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 432de2564d080..f88e62763ca70 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -933,7 +933,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_stddev_pop",
     "udf_stddev_samp",
     "udf_string",
-    // "udf_struct",  TODO: FIX THIS and enable it.
+    "udf_struct",
     "udf_substring",
     "udf_subtract",
     "udf_sum",

MLlib model	PMML model
KMeansModel	ClusteringModel
LinearRegressionModel	RegressionModel (functionName="regression")
RidgeRegressionModel	RegressionModel (functionName="regression")
LassoModel	RegressionModel (functionName="regression")
SVMModel	RegressionModel (functionName="classification" normalizationMethod="none")
Binary LogisticRegressionModel	RegressionModel (functionName="classification" normalizationMethod="logit")
Data type	Value type in R	API to access or create a data type
ByteType	+ integer + Note: Numbers will be converted to 1-byte signed integer numbers at runtime. + Please make sure that numbers are within the range of -128 to 127. +	+ "byte" +
ShortType	+ integer + Note: Numbers will be converted to 2-byte signed integer numbers at runtime. + Please make sure that numbers are within the range of -32768 to 32767. +	+ "short" +
IntegerType	integer	+ "integer" +
LongType	+ integer + Note: Numbers will be converted to 8-byte signed integer numbers at runtime. + Please make sure that numbers are within the range of + -9223372036854775808 to 9223372036854775807. + Otherwise, please convert data to decimal.Decimal and use DecimalType. +	+ "long" +
FloatType	+ numeric + Note: Numbers will be converted to 4-byte single-precision floating + point numbers at runtime. +	+ "float" +
DoubleType	numeric	+ "double" +
DecimalType	Not supported	+ Not supported +
StringType	character	+ "string" +
BinaryType	raw	+ "binary" +
BooleanType	logical	+ "bool" +
TimestampType	POSIXct	+ "timestamp" +
DateType	Date	+ "date" +
ArrayType	vector or list	+ list(type="array", elementType=elementType, containsNull=[containsNull]) + Note: The default value of containsNull is True. +
MapType	enviroment	+ list(type="map", keyType=keyType, valueType=valueType, valueContainsNull=[valueContainsNull]) + Note: The default value of valueContainsNull is True. +
StructType	named list	+ list(type="struct", fields=fields) + Note: fields is a Seq of StructFields. Also, two fields with the same + name are not allowed. +
StructField	The value type in R of the data type of this field + (For example, integer for a StructField with the data type IntegerType)	+ list(name=name, type=dataType, nullable=nullable) +
Property Name	Meaning
`spark.sql.hive.metastore.version`	+ The version of the hive client that will be used to communicate with the metastore. Available + options are `0.12.0` and `0.13.1`. Defaults to `0.13.1`. +
`spark.sql.hive.metastore.jars`	+ The location of the jars that should be used to instantiate the HiveMetastoreClient. This + property can be one of three options: + + `builtin` + Use Hive 0.13.1, which is bundled with the Spark assembly jar when `-Phive` is + enabled. When this option is chosen, `spark.sql.hive.metastore.version` must be + either `0.13.1` or not defined. + `maven` + Use Hive jars of specified version downloaded from Maven repositories. + A classpath in the standard format for both Hive and Hadoop. + + Defaults to `builtin`. +
`spark.sql.hive.metastore.sharedPrefixes`	+ + A comma separated list of class prefixes that should be loaded using the classloader that is + shared between Spark SQL and a specific version of Hive. An example of classes that should + be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need + to be shared are those that interact with classes that are already shared. For example, + custom appenders that are used by log4j. + + + Defaults to `com.mysql.jdbc,org.postgresql,com.microsoft.sqlserver,oracle.jdbc`. + +
`spark.sql.hive.metastore.barrierPrefixes`	+ + A comma separated list of class prefixes that should explicitly be reloaded for each version + of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a + prefix that typically would be shared (i.e. `org.apache.spark.*`). + + Defaults to empty. +
Property Name	Meaning
Property Name	Default	Meaning
`spark.sql.hive.metastore.version`	`0.13.1`	- The version of the hive client that will be used to communicate with the metastore. Available - options are `0.12.0` and `0.13.1`. Defaults to `0.13.1`. + Version of the Hive metastore. Available + options are `0.12.0` and `0.13.1`. Support for more versions is coming in the future.
`spark.sql.hive.metastore.jars`	`builtin`	- The location of the jars that should be used to instantiate the HiveMetastoreClient. This + Location of the jars that should be used to instantiate the HiveMetastoreClient. This property can be one of three options: `builtin` Use Hive 0.13.1, which is bundled with the Spark assembly jar when `-Phive` is - enabled. When this option is chosen, `spark.sql.hive.metastore.version` must be + enabled. When this option is chosen, `spark.sql.hive.metastore.version` must be either `0.13.1` or not defined. `maven` Use Hive jars of specified version downloaded from Maven repositories. A classpath in the standard format for both Hive and Hadoop. - Defaults to `builtin`.
`spark.sql.hive.metastore.sharedPrefixes`	`com.mysql.jdbc, org.postgresql, com.microsoft.sqlserver, oracle.jdbc`	A comma separated list of class prefixes that should be loaded using the classloader that is shared between Spark SQL and a specific version of Hive. An example of classes that should be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need - to be shared are those that interact with classes that are already shared. For example, + to be shared are those that interact with classes that are already shared. For example, custom appenders that are used by log4j. - - Defaults to `com.mysql.jdbc,org.postgresql,com.microsoft.sqlserver,oracle.jdbc`. -
`spark.sql.hive.metastore.barrierPrefixes`	`(empty)`	A comma separated list of class prefixes that should explicitly be reloaded for each version - of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a + of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a prefix that typically would be shared (i.e. `org.apache.spark.*`). - Defaults to empty.