From 4f7fbefb8db56ecaab66bb0ac2ab124416fefe58 Mon Sep 17 00:00:00 2001
From: lockwobr <lockwobr@gmail.com>
Date: Wed, 24 Jun 2015 02:48:56 +0900
Subject: [PATCH 1/7] [SQL] [DOCS] updated the documentation for explode

the syntax was incorrect in the example in explode

Author: lockwobr <lockwobr@gmail.com>

Closes #6943 from lockwobr/master and squashes the following commits:

3d864d1 [lockwobr] updated the documentation for explode
---
 sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 492a3321bc0bc..f3f0f5305318e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1049,7 +1049,7 @@ class DataFrame private[sql](
    * columns of the input row are implicitly joined with each value that is output by the function.
    *
    * {{{
-   *   df.explode("words", "word")(words: String => words.split(" "))
+   *   df.explode("words", "word"){words: String => words.split(" ")}
    * }}}
    * @group dfops
    * @since 1.3.0

From 7b1450b666f88452e7fe969a6d59e8b24842ea39 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Tue, 23 Jun 2015 10:52:17 -0700
Subject: [PATCH 2/7] [SPARK-7235] [SQL] Refactor the grouping sets

The logical plan `Expand` takes the `output` as constructor argument, which break the references chain. We need to refactor the code, as well as the column pruning.

Author: Cheng Hao <hao.cheng@intel.com>

Closes #5780 from chenghao-intel/expand and squashes the following commits:

76e4aa4 [Cheng Hao] revert the change for case insenstive
7c10a83 [Cheng Hao] refactor the grouping sets
---
 .../sql/catalyst/analysis/Analyzer.scala      | 55 ++----------
 .../expressions/namedExpressions.scala        |  2 +-
 .../sql/catalyst/optimizer/Optimizer.scala    |  4 +
 .../plans/logical/basicOperators.scala        | 84 ++++++++++++++-----
 .../spark/sql/execution/SparkStrategies.scala |  4 +-
 5 files changed, 78 insertions(+), 71 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 6311784422a91..0a3f5a7b5cade 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -192,49 +192,17 @@ class Analyzer(
       Seq.tabulate(1 << c.groupByExprs.length)(i => i)
     }
 
-    /**
-     * Create an array of Projections for the child projection, and replace the projections'
-     * expressions which equal GroupBy expressions with Literal(null), if those expressions
-     * are not set for this grouping set (according to the bit mask).
-     */
-    private[this] def expand(g: GroupingSets): Seq[Seq[Expression]] = {
-      val result = new scala.collection.mutable.ArrayBuffer[Seq[Expression]]
-
-      g.bitmasks.foreach { bitmask =>
-        // get the non selected grouping attributes according to the bit mask
-        val nonSelectedGroupExprs = ArrayBuffer.empty[Expression]
-        var bit = g.groupByExprs.length - 1
-        while (bit >= 0) {
-          if (((bitmask >> bit) & 1) == 0) nonSelectedGroupExprs += g.groupByExprs(bit)
-          bit -= 1
-        }
-
-        val substitution = (g.child.output :+ g.gid).map(expr => expr transformDown {
-          case x: Expression if nonSelectedGroupExprs.find(_ semanticEquals x).isDefined =>
-            // if the input attribute in the Invalid Grouping Expression set of for this group
-            // replace it with constant null
-            Literal.create(null, expr.dataType)
-          case x if x == g.gid =>
-            // replace the groupingId with concrete value (the bit mask)
-            Literal.create(bitmask, IntegerType)
-        })
-
-        result += substitution
-      }
-
-      result.toSeq
-    }
-
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-      case a: Cube if a.resolved =>
-        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid)
-      case a: Rollup if a.resolved =>
-        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid)
-      case x: GroupingSets if x.resolved =>
+      case a: Cube =>
+        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
+      case a: Rollup =>
+        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
+      case x: GroupingSets =>
+        val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
         Aggregate(
-          x.groupByExprs :+ x.gid,
+          x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
           x.aggregations,
-          Expand(expand(x), x.child.output :+ x.gid, x.child))
+          Expand(x.bitmasks, x.groupByExprs, gid, x.child))
     }
   }
 
@@ -368,12 +336,7 @@ class Analyzer(
 
       case q: LogicalPlan =>
         logTrace(s"Attempting to resolve ${q.simpleString}")
-        q transformExpressionsUp {
-          case u @ UnresolvedAttribute(nameParts) if nameParts.length == 1 &&
-            resolver(nameParts(0), VirtualColumn.groupingIdName) &&
-            q.isInstanceOf[GroupingAnalytics] =>
-            // Resolve the virtual column GROUPING__ID for the operator GroupingAnalytics
-            q.asInstanceOf[GroupingAnalytics].gid
+        q transformExpressionsUp  {
           case u @ UnresolvedAttribute(nameParts) =>
             // Leave unchanged if resolution fails.  Hopefully will be resolved next round.
             val result =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 58dbeaf89cad5..9cacdceb13837 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -262,5 +262,5 @@ case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[E
 
 object VirtualColumn {
   val groupingIdName: String = "grouping__id"
-  def newGroupingId: AttributeReference = AttributeReference(groupingIdName, IntegerType, false)()
+  val groupingIdAttribute: UnresolvedAttribute = UnresolvedAttribute(groupingIdName)
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 9132a786f77a7..98b4476076854 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -121,6 +121,10 @@ object UnionPushdown extends Rule[LogicalPlan] {
  */
 object ColumnPruning extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case a @ Aggregate(_, _, e @ Expand(_, groupByExprs, _, child))
+      if (child.outputSet -- AttributeSet(groupByExprs) -- a.references).nonEmpty =>
+      a.copy(child = e.copy(child = prunedChild(child, AttributeSet(groupByExprs) ++ a.references)))
+
     // Eliminate attributes that are not needed to calculate the specified aggregates.
     case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty =>
       a.copy(child = Project(a.references.toSeq, child))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 7814e51628db6..fae339808c233 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.types._
+import org.apache.spark.util.collection.OpenHashSet
 
 case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = projectList.map(_.toAttribute)
@@ -228,24 +229,76 @@ case class Window(
 /**
  * Apply the all of the GroupExpressions to every input row, hence we will get
  * multiple output rows for a input row.
- * @param projections The group of expressions, all of the group expressions should
- *                    output the same schema specified by the parameter `output`
- * @param output      The output Schema
+ * @param bitmasks The bitmask set represents the grouping sets
+ * @param groupByExprs The grouping by expressions
  * @param child       Child operator
  */
 case class Expand(
-    projections: Seq[Seq[Expression]],
-    output: Seq[Attribute],
+    bitmasks: Seq[Int],
+    groupByExprs: Seq[Expression],
+    gid: Attribute,
     child: LogicalPlan) extends UnaryNode {
   override def statistics: Statistics = {
     val sizeInBytes = child.statistics.sizeInBytes * projections.length
     Statistics(sizeInBytes = sizeInBytes)
   }
+
+  val projections: Seq[Seq[Expression]] = expand()
+
+  /**
+   * Extract attribute set according to the grouping id
+   * @param bitmask bitmask to represent the selected of the attribute sequence
+   * @param exprs the attributes in sequence
+   * @return the attributes of non selected specified via bitmask (with the bit set to 1)
+   */
+  private def buildNonSelectExprSet(bitmask: Int, exprs: Seq[Expression])
+  : OpenHashSet[Expression] = {
+    val set = new OpenHashSet[Expression](2)
+
+    var bit = exprs.length - 1
+    while (bit >= 0) {
+      if (((bitmask >> bit) & 1) == 0) set.add(exprs(bit))
+      bit -= 1
+    }
+
+    set
+  }
+
+  /**
+   * Create an array of Projections for the child projection, and replace the projections'
+   * expressions which equal GroupBy expressions with Literal(null), if those expressions
+   * are not set for this grouping set (according to the bit mask).
+   */
+  private[this] def expand(): Seq[Seq[Expression]] = {
+    val result = new scala.collection.mutable.ArrayBuffer[Seq[Expression]]
+
+    bitmasks.foreach { bitmask =>
+      // get the non selected grouping attributes according to the bit mask
+      val nonSelectedGroupExprSet = buildNonSelectExprSet(bitmask, groupByExprs)
+
+      val substitution = (child.output :+ gid).map(expr => expr transformDown {
+        case x: Expression if nonSelectedGroupExprSet.contains(x) =>
+          // if the input attribute in the Invalid Grouping Expression set of for this group
+          // replace it with constant null
+          Literal.create(null, expr.dataType)
+        case x if x == gid =>
+          // replace the groupingId with concrete value (the bit mask)
+          Literal.create(bitmask, IntegerType)
+      })
+
+      result += substitution
+    }
+
+    result.toSeq
+  }
+
+  override def output: Seq[Attribute] = {
+    child.output :+ gid
+  }
 }
 
 trait GroupingAnalytics extends UnaryNode {
   self: Product =>
-  def gid: AttributeReference
   def groupByExprs: Seq[Expression]
   def aggregations: Seq[NamedExpression]
 
@@ -266,17 +319,12 @@ trait GroupingAnalytics extends UnaryNode {
  * @param child        Child operator
  * @param aggregations The Aggregation expressions, those non selected group by expressions
  *                     will be considered as constant null if it appears in the expressions
- * @param gid          The attribute represents the virtual column GROUPING__ID, and it's also
- *                     the bitmask indicates the selected GroupBy Expressions for each
- *                     aggregating output row.
- *                     The associated output will be one of the value in `bitmasks`
  */
 case class GroupingSets(
     bitmasks: Seq[Int],
     groupByExprs: Seq[Expression],
     child: LogicalPlan,
-    aggregations: Seq[NamedExpression],
-    gid: AttributeReference = VirtualColumn.newGroupingId) extends GroupingAnalytics {
+    aggregations: Seq[NamedExpression]) extends GroupingAnalytics {
 
   def withNewAggs(aggs: Seq[NamedExpression]): GroupingAnalytics =
     this.copy(aggregations = aggs)
@@ -290,15 +338,11 @@ case class GroupingSets(
  * @param child        Child operator
  * @param aggregations The Aggregation expressions, those non selected group by expressions
  *                     will be considered as constant null if it appears in the expressions
- * @param gid          The attribute represents the virtual column GROUPING__ID, and it's also
- *                     the bitmask indicates the selected GroupBy Expressions for each
- *                     aggregating output row.
  */
 case class Cube(
     groupByExprs: Seq[Expression],
     child: LogicalPlan,
-    aggregations: Seq[NamedExpression],
-    gid: AttributeReference = VirtualColumn.newGroupingId) extends GroupingAnalytics {
+    aggregations: Seq[NamedExpression]) extends GroupingAnalytics {
 
   def withNewAggs(aggs: Seq[NamedExpression]): GroupingAnalytics =
     this.copy(aggregations = aggs)
@@ -313,15 +357,11 @@ case class Cube(
  * @param child        Child operator
  * @param aggregations The Aggregation expressions, those non selected group by expressions
  *                     will be considered as constant null if it appears in the expressions
- * @param gid          The attribute represents the virtual column GROUPING__ID, and it's also
- *                     the bitmask indicates the selected GroupBy Expressions for each
- *                     aggregating output row.
  */
 case class Rollup(
     groupByExprs: Seq[Expression],
     child: LogicalPlan,
-    aggregations: Seq[NamedExpression],
-    gid: AttributeReference = VirtualColumn.newGroupingId) extends GroupingAnalytics {
+    aggregations: Seq[NamedExpression]) extends GroupingAnalytics {
 
   def withNewAggs(aggs: Seq[NamedExpression]): GroupingAnalytics =
     this.copy(aggregations = aggs)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 5c420eb9d761f..1ff1cc224de8c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -308,8 +308,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         execution.Project(projectList, planLater(child)) :: Nil
       case logical.Filter(condition, child) =>
         execution.Filter(condition, planLater(child)) :: Nil
-      case logical.Expand(projections, output, child) =>
-        execution.Expand(projections, output, planLater(child)) :: Nil
+      case e @ logical.Expand(_, _, _, child) =>
+        execution.Expand(e.projections, e.output, planLater(child)) :: Nil
       case logical.Aggregate(group, agg, child) =>
         execution.Aggregate(partial = false, group, agg, planLater(child)) :: Nil
       case logical.Window(projectList, windowExpressions, spec, child) =>

From 6f4cadf5ee81467d077febc53d36571dd232295d Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Tue, 23 Jun 2015 11:55:47 -0700
Subject: [PATCH 3/7] [SPARK-8432] [SQL] fix hashCode() and equals() of
 BinaryType in Row

Also added more tests in LiteralExpressionSuite

Author: Davies Liu <davies@databricks.com>

Closes #6876 from davies/fix_hashcode and squashes the following commits:

429c2c0 [Davies Liu] Merge branch 'master' of github.com:apache/spark into fix_hashcode
32d9811 [Davies Liu] fix test
a0626ed [Davies Liu] Merge branch 'master' of github.com:apache/spark into fix_hashcode
89c2432 [Davies Liu] fix style
bd20780 [Davies Liu] check with catalyst types
41caec6 [Davies Liu] change for to while
d96929b [Davies Liu] address comment
6ad2a90 [Davies Liu] fix style
5819d33 [Davies Liu] unify equals() and hashCode()
0fff25d [Davies Liu] fix style
53c38b1 [Davies Liu] fix hashCode() and equals() of BinaryType in Row
---
 .../java/org/apache/spark/sql/BaseRow.java    | 21 ------
 .../main/scala/org/apache/spark/sql/Row.scala | 32 ---------
 .../spark/sql/catalyst/InternalRow.scala      | 67 ++++++++++++++++++-
 .../codegen/GenerateProjection.scala          |  1 +
 .../spark/sql/catalyst/expressions/rows.scala | 52 --------------
 .../expressions/ExpressionEvalHelper.scala    | 27 ++++++--
 .../expressions/LiteralExpressionSuite.scala  | 61 ++++++++++++++---
 .../expressions/StringFunctionsSuite.scala    |  5 +-
 .../apache/spark/unsafe/types/UTF8String.java |  6 +-
 .../spark/unsafe/types/UTF8StringSuite.java   |  2 -
 10 files changed, 139 insertions(+), 135 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
index 611e02d8fb666..6a2356f1f9c6f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
@@ -155,27 +155,6 @@ public int fieldIndex(String name) {
     throw new UnsupportedOperationException();
   }
 
-  /**
-   * A generic version of Row.equals(Row), which is used for tests.
-   */
-  @Override
-  public boolean equals(Object other) {
-    if (other instanceof Row) {
-      Row row = (Row) other;
-      int n = size();
-      if (n != row.size()) {
-        return false;
-      }
-      for (int i = 0; i < n; i ++) {
-        if (isNullAt(i) != row.isNullAt(i) || (!isNullAt(i) && !get(i).equals(row.get(i)))) {
-          return false;
-        }
-      }
-      return true;
-    }
-    return false;
-  }
-
   @Override
   public InternalRow copy() {
     final int n = size();
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 8aaf5d7d89154..e99d5c87a44fe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql
 
-import scala.util.hashing.MurmurHash3
-
 import org.apache.spark.sql.catalyst.expressions.GenericRow
 import org.apache.spark.sql.types.StructType
 
@@ -365,36 +363,6 @@ trait Row extends Serializable {
     false
   }
 
-  override def equals(that: Any): Boolean = that match {
-    case null => false
-    case that: Row =>
-      if (this.length != that.length) {
-        return false
-      }
-      var i = 0
-      val len = this.length
-      while (i < len) {
-        if (apply(i) != that.apply(i)) {
-          return false
-        }
-        i += 1
-      }
-      true
-    case _ => false
-  }
-
-  override def hashCode: Int = {
-    // Using Scala's Seq hash code implementation.
-    var n = 0
-    var h = MurmurHash3.seqSeed
-    val len = length
-    while (n < len) {
-      h = MurmurHash3.mix(h, apply(n).##)
-      n += 1
-    }
-    MurmurHash3.finalizeHash(h, n)
-  }
-
   /* ---------------------- utility methods for Scala ---------------------- */
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index e3c2cc243310b..d7b537a9fe3bc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst
 
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.expressions.GenericRow
+import org.apache.spark.sql.catalyst.expressions._
 
 /**
  * An abstract class for row used internal in Spark SQL, which only contain the columns as
@@ -26,7 +26,70 @@ import org.apache.spark.sql.catalyst.expressions.GenericRow
  */
 abstract class InternalRow extends Row {
   // A default implementation to change the return type
-  override def copy(): InternalRow = {this}
+  override def copy(): InternalRow = this
+
+  override def equals(o: Any): Boolean = {
+    if (!o.isInstanceOf[Row]) {
+      return false
+    }
+
+    val other = o.asInstanceOf[Row]
+    if (length != other.length) {
+      return false
+    }
+
+    var i = 0
+    while (i < length) {
+      if (isNullAt(i) != other.isNullAt(i)) {
+        return false
+      }
+      if (!isNullAt(i)) {
+        val o1 = apply(i)
+        val o2 = other.apply(i)
+        if (o1.isInstanceOf[Array[Byte]]) {
+          // handle equality of Array[Byte]
+          val b1 = o1.asInstanceOf[Array[Byte]]
+          if (!o2.isInstanceOf[Array[Byte]] ||
+            !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
+            return false
+          }
+        } else if (o1 != o2) {
+          return false
+        }
+      }
+      i += 1
+    }
+    true
+  }
+
+  // Custom hashCode function that matches the efficient code generated version.
+  override def hashCode: Int = {
+    var result: Int = 37
+    var i = 0
+    while (i < length) {
+      val update: Int =
+        if (isNullAt(i)) {
+          0
+        } else {
+          apply(i) match {
+            case b: Boolean => if (b) 0 else 1
+            case b: Byte => b.toInt
+            case s: Short => s.toInt
+            case i: Int => i
+            case l: Long => (l ^ (l >>> 32)).toInt
+            case f: Float => java.lang.Float.floatToIntBits(f)
+            case d: Double =>
+              val b = java.lang.Double.doubleToLongBits(d)
+              (b ^ (b >>> 32)).toInt
+            case a: Array[Byte] => java.util.Arrays.hashCode(a)
+            case other => other.hashCode()
+          }
+        }
+      result = 37 * result + update
+      i += 1
+    }
+    result
+  }
 }
 
 object InternalRow {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 2e20eda1a3002..e362625469e29 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -127,6 +127,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
         case FloatType => s"Float.floatToIntBits($col)"
         case DoubleType =>
             s"(int)(Double.doubleToLongBits($col) ^ (Double.doubleToLongBits($col) >>> 32))"
+        case BinaryType => s"java.util.Arrays.hashCode($col)"
         case _ => s"$col.hashCode()"
       }
       s"isNullAt($i) ? 0 : ($nonNull)"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index 1098962ddc018..0d4c9ace5e124 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -121,58 +121,6 @@ class GenericRow(protected[sql] val values: Array[Any]) extends InternalRow {
     }
   }
 
-  // TODO(davies): add getDate and getDecimal
-
-  // Custom hashCode function that matches the efficient code generated version.
-  override def hashCode: Int = {
-    var result: Int = 37
-
-    var i = 0
-    while (i < values.length) {
-      val update: Int =
-        if (isNullAt(i)) {
-          0
-        } else {
-          apply(i) match {
-            case b: Boolean => if (b) 0 else 1
-            case b: Byte => b.toInt
-            case s: Short => s.toInt
-            case i: Int => i
-            case l: Long => (l ^ (l >>> 32)).toInt
-            case f: Float => java.lang.Float.floatToIntBits(f)
-            case d: Double =>
-              val b = java.lang.Double.doubleToLongBits(d)
-              (b ^ (b >>> 32)).toInt
-            case other => other.hashCode()
-          }
-        }
-      result = 37 * result + update
-      i += 1
-    }
-    result
-  }
-
-  override def equals(o: Any): Boolean = o match {
-    case other: InternalRow =>
-      if (values.length != other.length) {
-        return false
-      }
-
-      var i = 0
-      while (i < values.length) {
-        if (isNullAt(i) != other.isNullAt(i)) {
-          return false
-        }
-        if (apply(i) != other.apply(i)) {
-          return false
-        }
-        i += 1
-      }
-      true
-
-    case _ => false
-  }
-
   override def copy(): InternalRow = this
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 12d2da8b33986..158f54af13802 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -38,10 +38,23 @@ trait ExpressionEvalHelper {
 
   protected def checkEvaluation(
       expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = {
-    checkEvaluationWithoutCodegen(expression, expected, inputRow)
-    checkEvaluationWithGeneratedMutableProjection(expression, expected, inputRow)
-    checkEvaluationWithGeneratedProjection(expression, expected, inputRow)
-    checkEvaluationWithOptimization(expression, expected, inputRow)
+    val catalystValue = CatalystTypeConverters.convertToCatalyst(expected)
+    checkEvaluationWithoutCodegen(expression, catalystValue, inputRow)
+    checkEvaluationWithGeneratedMutableProjection(expression, catalystValue, inputRow)
+    checkEvaluationWithGeneratedProjection(expression, catalystValue, inputRow)
+    checkEvaluationWithOptimization(expression, catalystValue, inputRow)
+  }
+
+  /**
+   * Check the equality between result of expression and expected value, it will handle
+   * Array[Byte].
+   */
+  protected def checkResult(result: Any, expected: Any): Boolean = {
+    (result, expected) match {
+      case (result: Array[Byte], expected: Array[Byte]) =>
+        java.util.Arrays.equals(result, expected)
+      case _ => result == expected
+    }
   }
 
   protected def evaluate(expression: Expression, inputRow: InternalRow = EmptyRow): Any = {
@@ -55,7 +68,7 @@ trait ExpressionEvalHelper {
     val actual = try evaluate(expression, inputRow) catch {
       case e: Exception => fail(s"Exception evaluating $expression", e)
     }
-    if (actual != expected) {
+    if (!checkResult(actual, expected)) {
       val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect evaluation (codegen off): $expression, " +
         s"actual: $actual, " +
@@ -83,7 +96,7 @@ trait ExpressionEvalHelper {
     }
 
     val actual = plan(inputRow).apply(0)
-    if (actual != expected) {
+    if (!checkResult(actual, expected)) {
       val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
     }
@@ -109,7 +122,7 @@ trait ExpressionEvalHelper {
     }
 
     val actual = plan(inputRow)
-    val expectedRow = new GenericRow(Array[Any](CatalystTypeConverters.convertToCatalyst(expected)))
+    val expectedRow = new GenericRow(Array[Any](expected))
     if (actual.hashCode() != expectedRow.hashCode()) {
       fail(
         s"""
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
index f44f55dfb92d1..d924ff7a102f6 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
@@ -18,12 +18,26 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.types.StringType
+import org.apache.spark.sql.types._
 
 
 class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
 
-  // TODO: Add tests for all data types.
+  test("null") {
+    checkEvaluation(Literal.create(null, BooleanType), null)
+    checkEvaluation(Literal.create(null, ByteType), null)
+    checkEvaluation(Literal.create(null, ShortType), null)
+    checkEvaluation(Literal.create(null, IntegerType), null)
+    checkEvaluation(Literal.create(null, LongType), null)
+    checkEvaluation(Literal.create(null, FloatType), null)
+    checkEvaluation(Literal.create(null, LongType), null)
+    checkEvaluation(Literal.create(null, StringType), null)
+    checkEvaluation(Literal.create(null, BinaryType), null)
+    checkEvaluation(Literal.create(null, DecimalType()), null)
+    checkEvaluation(Literal.create(null, ArrayType(ByteType, true)), null)
+    checkEvaluation(Literal.create(null, MapType(StringType, IntegerType)), null)
+    checkEvaluation(Literal.create(null, StructType(Seq.empty)), null)
+  }
 
   test("boolean literals") {
     checkEvaluation(Literal(true), true)
@@ -31,25 +45,52 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("int literals") {
-    checkEvaluation(Literal(1), 1)
-    checkEvaluation(Literal(0L), 0L)
+    List(0, 1, Int.MinValue, Int.MaxValue).foreach { d =>
+      checkEvaluation(Literal(d), d)
+      checkEvaluation(Literal(d.toLong), d.toLong)
+      checkEvaluation(Literal(d.toShort), d.toShort)
+      checkEvaluation(Literal(d.toByte), d.toByte)
+    }
+    checkEvaluation(Literal(Long.MinValue), Long.MinValue)
+    checkEvaluation(Literal(Long.MaxValue), Long.MaxValue)
   }
 
   test("double literals") {
-    List(0.0, -0.0, Double.NegativeInfinity, Double.PositiveInfinity).foreach {
-      d => {
-        checkEvaluation(Literal(d), d)
-        checkEvaluation(Literal(d.toFloat), d.toFloat)
-      }
+    List(0.0, -0.0, Double.NegativeInfinity, Double.PositiveInfinity).foreach { d =>
+      checkEvaluation(Literal(d), d)
+      checkEvaluation(Literal(d.toFloat), d.toFloat)
     }
+    checkEvaluation(Literal(Double.MinValue), Double.MinValue)
+    checkEvaluation(Literal(Double.MaxValue), Double.MaxValue)
+    checkEvaluation(Literal(Float.MinValue), Float.MinValue)
+    checkEvaluation(Literal(Float.MaxValue), Float.MaxValue)
+
   }
 
   test("string literals") {
+    checkEvaluation(Literal(""), "")
     checkEvaluation(Literal("test"), "test")
-    checkEvaluation(Literal.create(null, StringType), null)
+    checkEvaluation(Literal("\0"), "\0")
   }
 
   test("sum two literals") {
     checkEvaluation(Add(Literal(1), Literal(1)), 2)
   }
+
+  test("binary literals") {
+    checkEvaluation(Literal.create(new Array[Byte](0), BinaryType), new Array[Byte](0))
+    checkEvaluation(Literal.create(new Array[Byte](2), BinaryType), new Array[Byte](2))
+  }
+
+  test("decimal") {
+    List(0.0, 1.2, 1.1111, 5).foreach { d =>
+      checkEvaluation(Literal(Decimal(d)), Decimal(d))
+      checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt))
+      checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong))
+      checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)),
+        Decimal((d * 1000L).toLong, 10, 1))
+    }
+  }
+
+  // TODO(davies): add tests for ArrayType, MapType and StructType
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index d363e631540d8..5dbb1d562c1d9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -222,9 +222,6 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(StringLength(regEx), 5, create_row("abdef"))
     checkEvaluation(StringLength(regEx), 0, create_row(""))
     checkEvaluation(StringLength(regEx), null, create_row(null))
-    // TODO currently bug in codegen, let's temporally disable this
-    // checkEvaluation(StringLength(Literal.create(null, StringType)), null, create_row("abdef"))
+    checkEvaluation(StringLength(Literal.create(null, StringType)), null, create_row("abdef"))
   }
-
-
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 9871a70a40e69..9302b472925ed 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -17,10 +17,10 @@
 
 package org.apache.spark.unsafe.types;
 
+import javax.annotation.Nonnull;
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
 import java.util.Arrays;
-import javax.annotation.Nonnull;
 
 import org.apache.spark.unsafe.PlatformDependent;
 
@@ -202,10 +202,6 @@ public int compare(final UTF8String other) {
   public boolean equals(final Object other) {
     if (other instanceof UTF8String) {
       return Arrays.equals(bytes, ((UTF8String) other).getBytes());
-    } else if (other instanceof String) {
-      // Used only in unit tests.
-      String s = (String) other;
-      return bytes.length >= s.length() && length() == s.length() && toString().equals(s);
     } else {
       return false;
     }
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 80c179a1b5e75..796cdc9dbebdb 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -28,8 +28,6 @@ private void checkBasic(String str, int len) throws UnsupportedEncodingException
     Assert.assertEquals(UTF8String.fromString(str).length(), len);
     Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).length(), len);
 
-    Assert.assertEquals(UTF8String.fromString(str), str);
-    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), str);
     Assert.assertEquals(UTF8String.fromString(str).toString(), str);
     Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).toString(), str);
     Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), UTF8String.fromString(str));

From 2b1111dd0b8deb9ad8d43fec792e60e3d0c4de75 Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Tue, 23 Jun 2015 12:42:17 -0700
Subject: [PATCH 4/7] [SPARK-7888] Be able to disable intercept in linear
 regression in ml package

Author: Holden Karau <holden@pigscanfly.ca>

Closes #6927 from holdenk/SPARK-7888-Be-able-to-disable-intercept-in-Linear-Regression-in-ML-package and squashes the following commits:

0ad384c [Holden Karau] Add MiMa excludes
4016fac [Holden Karau] Switch to wild card import, remove extra blank lines
ae5baa8 [Holden Karau] CR feedback, move the fitIntercept down rather than changing ymean and etc above
f34971c [Holden Karau] Fix some more long lines
319bd3f [Holden Karau] Fix long lines
3bb9ee1 [Holden Karau] Update the regression suite tests
7015b9f [Holden Karau] Our code performs the same with R, except we need more than one data point but that seems reasonable
0b0c8c0 [Holden Karau] fix the issue with the sample R code
e2140ba [Holden Karau] Add a test, it fails!
5e84a0b [Holden Karau] Write out thoughts and use the correct trait
91ffc0a [Holden Karau] more murh
006246c [Holden Karau] murp?
---
 .../ml/regression/LinearRegression.scala      |  30 +++-
 .../ml/regression/LinearRegressionSuite.scala | 149 +++++++++++++++++-
 project/MimaExcludes.scala                    |   5 +
 3 files changed, 172 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 01306545fc7cd..1b1d7299fb496 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -26,7 +26,7 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol}
+import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
@@ -41,7 +41,8 @@ import org.apache.spark.util.StatCounter
  * Params for linear regression.
  */
 private[regression] trait LinearRegressionParams extends PredictorParams
-  with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
+    with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
+    with HasFitIntercept
 
 /**
  * :: Experimental ::
@@ -72,6 +73,14 @@ class LinearRegression(override val uid: String)
   def setRegParam(value: Double): this.type = set(regParam, value)
   setDefault(regParam -> 0.0)
 
+  /**
+   * Set if we should fit the intercept
+   * Default is true.
+   * @group setParam
+   */
+  def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
+  setDefault(fitIntercept -> true)
+
   /**
    * Set the ElasticNet mixing parameter.
    * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
@@ -123,6 +132,7 @@ class LinearRegression(override val uid: String)
     val numFeatures = summarizer.mean.size
     val yMean = statCounter.mean
     val yStd = math.sqrt(statCounter.variance)
+    // look at glmnet5.m L761 maaaybe that has info
 
     // If the yStd is zero, then the intercept is yMean with zero weights;
     // as a result, training is not needed.
@@ -142,7 +152,7 @@ class LinearRegression(override val uid: String)
     val effectiveL1RegParam = $(elasticNetParam) * effectiveRegParam
     val effectiveL2RegParam = (1.0 - $(elasticNetParam)) * effectiveRegParam
 
-    val costFun = new LeastSquaresCostFun(instances, yStd, yMean,
+    val costFun = new LeastSquaresCostFun(instances, yStd, yMean, $(fitIntercept),
       featuresStd, featuresMean, effectiveL2RegParam)
 
     val optimizer = if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) {
@@ -180,7 +190,7 @@ class LinearRegression(override val uid: String)
     // The intercept in R's GLMNET is computed using closed form after the coefficients are
     // converged. See the following discussion for detail.
     // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
-    val intercept = yMean - dot(weights, Vectors.dense(featuresMean))
+    val intercept = if ($(fitIntercept)) yMean - dot(weights, Vectors.dense(featuresMean)) else 0.0
     if (handlePersistence) instances.unpersist()
 
     // TODO: Converts to sparse format based on the storage, but may base on the scoring speed.
@@ -234,6 +244,7 @@ class LinearRegressionModel private[ml] (
  * See this discussion for detail.
  * http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
  *
+ * When training with intercept enabled,
  * The objective function in the scaled space is given by
  * {{{
  * L = 1/2n ||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2,
@@ -241,6 +252,10 @@ class LinearRegressionModel private[ml] (
  * where \bar{x_i} is the mean of x_i, \hat{x_i} is the standard deviation of x_i,
  * \bar{y} is the mean of label, and \hat{y} is the standard deviation of label.
  *
+ * If we fitting the intercept disabled (that is forced through 0.0),
+ * we can use the same equation except we set \bar{y} and \bar{x_i} to 0 instead
+ * of the respective means.
+ *
  * This can be rewritten as
  * {{{
  * L = 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y}
@@ -255,6 +270,7 @@ class LinearRegressionModel private[ml] (
  * \sum_i w_i^\prime x_i - y / \hat{y} + offset
  * }}}
  *
+ *
  * Note that the effective weights and offset don't depend on training dataset,
  * so they can be precomputed.
  *
@@ -301,6 +317,7 @@ private class LeastSquaresAggregator(
     weights: Vector,
     labelStd: Double,
     labelMean: Double,
+    fitIntercept: Boolean,
     featuresStd: Array[Double],
     featuresMean: Array[Double]) extends Serializable {
 
@@ -321,7 +338,7 @@ private class LeastSquaresAggregator(
       }
       i += 1
     }
-    (weightsArray, -sum + labelMean / labelStd, weightsArray.length)
+    (weightsArray, if (fitIntercept) labelMean / labelStd - sum else 0.0, weightsArray.length)
   }
 
   private val effectiveWeightsVector = Vectors.dense(effectiveWeightsArray)
@@ -404,6 +421,7 @@ private class LeastSquaresCostFun(
     data: RDD[(Double, Vector)],
     labelStd: Double,
     labelMean: Double,
+    fitIntercept: Boolean,
     featuresStd: Array[Double],
     featuresMean: Array[Double],
     effectiveL2regParam: Double) extends DiffFunction[BDV[Double]] {
@@ -412,7 +430,7 @@ private class LeastSquaresCostFun(
     val w = Vectors.fromBreeze(weights)
 
     val leastSquaresAggregator = data.treeAggregate(new LeastSquaresAggregator(w, labelStd,
-      labelMean, featuresStd, featuresMean))(
+      labelMean, fitIntercept, featuresStd, featuresMean))(
         seqOp = (c, v) => (c, v) match {
           case (aggregator, (label, features)) => aggregator.add(label, features)
         },
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 732e2c42be144..ad1e9da692ee2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -26,6 +26,7 @@ import org.apache.spark.sql.{DataFrame, Row}
 class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var dataset: DataFrame = _
+  @transient var datasetWithoutIntercept: DataFrame = _
 
   /**
    * In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
@@ -34,14 +35,24 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
    *
    * import org.apache.spark.mllib.util.LinearDataGenerator
    * val data =
-   *   sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2), 10000, 42), 2)
-   * data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).saveAsTextFile("path")
+   *   sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2),
+   *     Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)
+   * data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1)
+   *   .saveAsTextFile("path")
    */
   override def beforeAll(): Unit = {
     super.beforeAll()
     dataset = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
         6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))
+    /**
+     * datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating
+     * training model without intercept
+     */
+    datasetWithoutIntercept = sqlContext.createDataFrame(
+      sc.parallelize(LinearDataGenerator.generateLinearInput(
+        0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))
+
   }
 
   test("linear regression with intercept without regularization") {
@@ -78,6 +89,42 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("linear regression without intercept without regularization") {
+    val trainer = (new LinearRegression).setFitIntercept(false)
+    val model = trainer.fit(dataset)
+    val modelWithoutIntercept = trainer.fit(datasetWithoutIntercept)
+
+    /**
+     * weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0,
+     *   intercept = FALSE))
+     * > weights
+     *  3 x 1 sparse Matrix of class "dgCMatrix"
+     *                           s0
+     * (Intercept)         .
+     * as.numeric.data.V2. 6.995908
+     * as.numeric.data.V3. 5.275131
+     */
+    val weightsR = Array(6.995908, 5.275131)
+
+    assert(model.intercept ~== 0 relTol 1E-3)
+    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
+    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    /**
+     * Then again with the data with no intercept:
+     * > weightsWithoutIntercept
+     * 3 x 1 sparse Matrix of class "dgCMatrix"
+     *                             s0
+     * (Intercept)           .
+     * as.numeric.data3.V2. 4.70011
+     * as.numeric.data3.V3. 7.19943
+     */
+    val weightsWithoutInterceptR = Array(4.70011, 7.19943)
+
+    assert(modelWithoutIntercept.intercept ~== 0 relTol 1E-3)
+    assert(modelWithoutIntercept.weights(0) ~== weightsWithoutInterceptR(0) relTol 1E-3)
+    assert(modelWithoutIntercept.weights(1) ~== weightsWithoutInterceptR(1) relTol 1E-3)
+  }
+
   test("linear regression with intercept with L1 regularization") {
     val trainer = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
     val model = trainer.fit(dataset)
@@ -87,11 +134,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
      * > weights
      *  3 x 1 sparse Matrix of class "dgCMatrix"
      *                           s0
-     * (Intercept)         6.311546
-     * as.numeric.data.V2. 2.123522
-     * as.numeric.data.V3. 4.605651
+     * (Intercept)         6.24300
+     * as.numeric.data.V2. 4.024821
+     * as.numeric.data.V3. 6.679841
      */
-    val interceptR = 6.243000
+    val interceptR = 6.24300
     val weightsR = Array(4.024821, 6.679841)
 
     assert(model.intercept ~== interceptR relTol 1E-3)
@@ -106,6 +153,36 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("linear regression without intercept with L1 regularization") {
+    val trainer = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
+      .setFitIntercept(false)
+    val model = trainer.fit(dataset)
+
+    /**
+     * weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57,
+     *   intercept=FALSE))
+     * > weights
+     *  3 x 1 sparse Matrix of class "dgCMatrix"
+     *                           s0
+     * (Intercept)          .
+     * as.numeric.data.V2. 6.299752
+     * as.numeric.data.V3. 4.772913
+     */
+    val interceptR = 0.0
+    val weightsR = Array(6.299752, 4.772913)
+
+    assert(model.intercept ~== interceptR relTol 1E-3)
+    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
+    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+
+    model.transform(dataset).select("features", "prediction").collect().foreach {
+      case Row(features: DenseVector, prediction1: Double) =>
+        val prediction2 =
+          features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept
+        assert(prediction1 ~== prediction2 relTol 1E-5)
+    }
+  }
+
   test("linear regression with intercept with L2 regularization") {
     val trainer = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
     val model = trainer.fit(dataset)
@@ -134,6 +211,36 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("linear regression without intercept with L2 regularization") {
+    val trainer = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
+      .setFitIntercept(false)
+    val model = trainer.fit(dataset)
+
+    /**
+     * weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
+     *   intercept = FALSE))
+     * > weights
+     *  3 x 1 sparse Matrix of class "dgCMatrix"
+     *                           s0
+     * (Intercept)         .
+     * as.numeric.data.V2. 5.522875
+     * as.numeric.data.V3. 4.214502
+     */
+    val interceptR = 0.0
+    val weightsR = Array(5.522875, 4.214502)
+
+    assert(model.intercept ~== interceptR relTol 1E-3)
+    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
+    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+
+    model.transform(dataset).select("features", "prediction").collect().foreach {
+      case Row(features: DenseVector, prediction1: Double) =>
+        val prediction2 =
+          features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept
+        assert(prediction1 ~== prediction2 relTol 1E-5)
+    }
+  }
+
   test("linear regression with intercept with ElasticNet regularization") {
     val trainer = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
     val model = trainer.fit(dataset)
@@ -161,4 +268,34 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
         assert(prediction1 ~== prediction2 relTol 1E-5)
     }
   }
+
+  test("linear regression without intercept with ElasticNet regularization") {
+    val trainer = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
+      .setFitIntercept(false)
+    val model = trainer.fit(dataset)
+
+    /**
+     * weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6,
+     *   intercept=FALSE))
+     * > weights
+     * 3 x 1 sparse Matrix of class "dgCMatrix"
+     * s0
+     * (Intercept)         .
+     * as.numeric.dataM.V2. 5.673348
+     * as.numeric.dataM.V3. 4.322251
+     */
+    val interceptR = 0.0
+    val weightsR = Array(5.673348, 4.322251)
+
+    assert(model.intercept ~== interceptR relTol 1E-3)
+    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
+    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+
+    model.transform(dataset).select("features", "prediction").collect().foreach {
+      case Row(features: DenseVector, prediction1: Double) =>
+        val prediction2 =
+          features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept
+        assert(prediction1 ~== prediction2 relTol 1E-5)
+    }
+  }
 }
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 7a748fb5e38bd..f678c69a6dfa9 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -53,6 +53,11 @@ object MimaExcludes {
             // Removing a testing method from a private class
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.streaming.kafka.KafkaTestUtils.waitUntilLeaderOffset"),
+            // While private MiMa is still not happy about the changes,
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.ml.regression.LeastSquaresAggregator.this"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.ml.regression.LeastSquaresCostFun.this"),
             // SQL execution is considered private.
             excludePackage("org.apache.spark.sql.execution"),
             // NanoTime and CatalystTimestampConverter is only used inside catalyst,

From f2022fa0d375c804eca7803e172543b23ecbb9b7 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 23 Jun 2015 12:43:32 -0700
Subject: [PATCH 5/7] [SPARK-8265] [MLLIB] [PYSPARK] Add LinearDataGenerator to
 pyspark.mllib.utils

It is useful to generate linear data for easy testing of linear models and in general. Scala already has it. This is just a wrapper around the Scala code.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6715 from MechCoder/generate_linear_input and squashes the following commits:

6182884 [MechCoder] Minor changes
8bda047 [MechCoder] Minor style fixes
0f1053c [MechCoder] [SPARK-8265] Add LinearDataGenerator to pyspark.mllib.utils
---
 .../mllib/api/python/PythonMLLibAPI.scala     | 32 ++++++++++++++++-
 python/pyspark/mllib/tests.py                 | 22 ++++++++++--
 python/pyspark/mllib/util.py                  | 35 +++++++++++++++++++
 3 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index f9a271f47ee2c..c4bea7c2cad4f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -51,6 +51,7 @@ import org.apache.spark.mllib.tree.loss.Losses
 import org.apache.spark.mllib.tree.model.{DecisionTreeModel, GradientBoostedTreesModel, RandomForestModel}
 import org.apache.spark.mllib.tree.{DecisionTree, GradientBoostedTrees, RandomForest}
 import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.mllib.util.LinearDataGenerator
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.storage.StorageLevel
@@ -972,7 +973,7 @@ private[python] class PythonMLLibAPI extends Serializable {
   def estimateKernelDensity(
       sample: JavaRDD[Double],
       bandwidth: Double, points: java.util.ArrayList[Double]): Array[Double] = {
-    return new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
+    new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
       points.asScala.toArray)
   }
 
@@ -991,6 +992,35 @@ private[python] class PythonMLLibAPI extends Serializable {
       List[AnyRef](model.clusterCenters, Vectors.dense(model.clusterWeights)).asJava
   }
 
+  /**
+   * Wrapper around the generateLinearInput method of LinearDataGenerator.
+   */
+  def generateLinearInputWrapper(
+      intercept: Double,
+      weights: JList[Double],
+      xMean: JList[Double],
+      xVariance: JList[Double],
+      nPoints: Int,
+      seed: Int,
+      eps: Double): Array[LabeledPoint] = {
+    LinearDataGenerator.generateLinearInput(
+      intercept, weights.asScala.toArray, xMean.asScala.toArray,
+      xVariance.asScala.toArray, nPoints, seed, eps).toArray
+  }
+
+  /**
+   * Wrapper around the generateLinearRDD method of LinearDataGenerator.
+   */
+  def generateLinearRDDWrapper(
+      sc: JavaSparkContext,
+      nexamples: Int,
+      nfeatures: Int,
+      eps: Double,
+      nparts: Int,
+      intercept: Double): JavaRDD[LabeledPoint] = {
+    LinearDataGenerator.generateLinearRDD(
+      sc, nexamples, nfeatures, eps, nparts, intercept)
+  }
 }
 
 /**
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index c8d61b9855a69..509faa11df170 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -49,8 +49,8 @@
 from pyspark.mllib.stat import Statistics
 from pyspark.mllib.feature import Word2Vec
 from pyspark.mllib.feature import IDF
-from pyspark.mllib.feature import StandardScaler
-from pyspark.mllib.feature import ElementwiseProduct
+from pyspark.mllib.feature import StandardScaler, ElementwiseProduct
+from pyspark.mllib.util import LinearDataGenerator
 from pyspark.serializers import PickleSerializer
 from pyspark.streaming import StreamingContext
 from pyspark.sql import SQLContext
@@ -1019,6 +1019,24 @@ def collect(rdd):
         self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
 
 
+class LinearDataGeneratorTests(MLlibTestCase):
+    def test_dim(self):
+        linear_data = LinearDataGenerator.generateLinearInput(
+            intercept=0.0, weights=[0.0, 0.0, 0.0],
+            xMean=[0.0, 0.0, 0.0], xVariance=[0.33, 0.33, 0.33],
+            nPoints=4, seed=0, eps=0.1)
+        self.assertEqual(len(linear_data), 4)
+        for point in linear_data:
+            self.assertEqual(len(point.features), 3)
+
+        linear_data = LinearDataGenerator.generateLinearRDD(
+            sc=sc, nexamples=6, nfeatures=2, eps=0.1,
+            nParts=2, intercept=0.0).collect()
+        self.assertEqual(len(linear_data), 6)
+        for point in linear_data:
+            self.assertEqual(len(point.features), 2)
+
+
 if __name__ == "__main__":
     if not _have_scipy:
         print("NOTE: Skipping SciPy tests as it does not seem to be installed")
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 16a90db146ef0..348238319e407 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -257,6 +257,41 @@ def load(cls, sc, path):
         return cls(java_model)
 
 
+class LinearDataGenerator(object):
+    """Utils for generating linear data"""
+
+    @staticmethod
+    def generateLinearInput(intercept, weights, xMean, xVariance,
+                            nPoints, seed, eps):
+        """
+        :param: intercept bias factor, the term c in X'w + c
+        :param: weights   feature vector, the term w in X'w + c
+        :param: xMean     Point around which the data X is centered.
+        :param: xVariance Variance of the given data
+        :param: nPoints   Number of points to be generated
+        :param: seed      Random Seed
+        :param: eps       Used to scale the noise. If eps is set high,
+                          the amount of gaussian noise added is more.
+        Returns a list of LabeledPoints of length nPoints
+        """
+        weights = [float(weight) for weight in weights]
+        xMean = [float(mean) for mean in xMean]
+        xVariance = [float(var) for var in xVariance]
+        return list(callMLlibFunc(
+            "generateLinearInputWrapper", float(intercept), weights, xMean,
+            xVariance, int(nPoints), int(seed), float(eps)))
+
+    @staticmethod
+    def generateLinearRDD(sc, nexamples, nfeatures, eps,
+                          nParts=2, intercept=0.0):
+        """
+        Generate a RDD of LabeledPoints.
+        """
+        return callMLlibFunc(
+            "generateLinearRDDWrapper", sc, int(nexamples), int(nfeatures),
+            float(eps), int(nParts), float(intercept))
+
+
 def _test():
     import doctest
     from pyspark.context import SparkContext

From f2fb0285ab6d4225c5350f109dea6c1c017bb491 Mon Sep 17 00:00:00 2001
From: Alok  Singh <singhal@Aloks-MacBook-Pro.local>
Date: Tue, 23 Jun 2015 12:47:55 -0700
Subject: [PATCH 6/7] [SPARK-8111] [SPARKR] SparkR shell should display Spark
 logo and version banner on startup.

spark version is taken from the environment variable SPARK_VERSION

Author: Alok  Singh <singhal@Aloks-MacBook-Pro.local>
Author: Alok  Singh <singhal@aloks-mbp.usca.ibm.com>

Closes #6944 from aloknsingh/aloknsingh_spark_jiras and squashes the following commits:

ed607bd [Alok  Singh] [SPARK-8111][SparkR] As per suggestion, 1) using the version from sparkContext rather than the Sys.env. 2) change "Welcome to SparkR!" to "Welcome to" followed by Spark logo and version
acd5b85 [Alok  Singh] fix the jira SPARK-8111 to add the spark version and logo. Currently spark version is taken from the environment variable SPARK_VERSION
---
 R/pkg/inst/profile/shell.R | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/R/pkg/inst/profile/shell.R b/R/pkg/inst/profile/shell.R
index 773b6ecf582d9..7189f1a260934 100644
--- a/R/pkg/inst/profile/shell.R
+++ b/R/pkg/inst/profile/shell.R
@@ -27,7 +27,21 @@
   sc <- SparkR::sparkR.init()
   assign("sc", sc, envir=.GlobalEnv)
   sqlContext <- SparkR::sparkRSQL.init(sc)
+  sparkVer <- SparkR:::callJMethod(sc, "version")
   assign("sqlContext", sqlContext, envir=.GlobalEnv)
-  cat("\n Welcome to SparkR!")
+  cat("\n Welcome to")
+  cat("\n")
+  cat("    ____              __", "\n")
+  cat("   / __/__  ___ _____/ /__", "\n")
+  cat("  _\\ \\/ _ \\/ _ `/ __/  '_/", "\n")
+  cat(" /___/ .__/\\_,_/_/ /_/\\_\\")
+  if (nchar(sparkVer) == 0) {
+    cat("\n")
+  } else {
+    cat("   version ", sparkVer, "\n") 
+  }
+  cat("    /_/", "\n")
+  cat("\n")
+
   cat("\n Spark context is available as sc, SQL context is available as sqlContext\n")
 }

From a8031183aff2e23de9204ddfc7e7f5edbf052a7e Mon Sep 17 00:00:00 2001
From: Oleksiy Dyagilev <oleksiy_dyagilev@epam.com>
Date: Tue, 23 Jun 2015 13:12:19 -0700
Subject: [PATCH 7/7] [SPARK-8525] [MLLIB] fix LabeledPoint parser when there
 is a whitespace between label and features vector

fix LabeledPoint parser when there is a whitespace between label and features vector, e.g.
(y, [x1, x2, x3])

Author: Oleksiy Dyagilev <oleksiy_dyagilev@epam.com>

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position
---
 .../scala/org/apache/spark/mllib/util/NumericParser.scala  | 2 ++
 .../apache/spark/mllib/regression/LabeledPointSuite.scala  | 5 +++++
 .../org/apache/spark/mllib/util/NumericParserSuite.scala   | 7 +++++++
 3 files changed, 14 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3578e21..a841c5caf0142 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
         }
       } else if (token == ")") {
         parsing = false
+      } else if (token.trim.isEmpty){
+          // ignore whitespaces between delim chars, e.g. ", ["
       } else {
         // expecting a number
         items.append(parseDouble(token))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a06de4da..f8d0af8820e64 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
     }
   }
 
+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba9be108..fa4f74d71b7e7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
       }
     }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+    val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+    assert(parsed(0).asInstanceOf[Double] === 0.0)
+    assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }