diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml
index 047423f75aec1..b5a6641e2e7e2 100644
--- a/docs/_data/menu-ml.yaml
+++ b/docs/_data/menu-ml.yaml
@@ -1,3 +1,5 @@
+- text: Basic statistics
+  url: ml-statistics.html
 - text: Pipelines
   url: ml-pipeline.html
 - text: Extracting, transforming and selecting features
diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md
new file mode 100644
index 0000000000000..abfb3cab1e566
--- /dev/null
+++ b/docs/ml-statistics.md
@@ -0,0 +1,92 @@
+---
+layout: global
+title: Basic Statistics
+displayTitle: Basic Statistics
+---
+
+
+`\[
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}}
+\newcommand{\x}{\mathbf{x}}
+\newcommand{\y}{\mathbf{y}}
+\newcommand{\wv}{\mathbf{w}}
+\newcommand{\av}{\mathbf{\alpha}}
+\newcommand{\bv}{\mathbf{b}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\id}{\mathbf{I}}
+\newcommand{\ind}{\mathbf{1}}
+\newcommand{\0}{\mathbf{0}}
+\newcommand{\unit}{\mathbf{e}}
+\newcommand{\one}{\mathbf{1}}
+\newcommand{\zero}{\mathbf{0}}
+\]`
+
+**Table of Contents**
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+## Correlation
+
+Calculating the correlation between two series of data is a common operation in statistics. In `spark.ml`
+we provide the flexibility to calculate pairwise correlations among many series. The supported
+correlation methods are currently Pearson's and Spearman's correlation.
+
+
+
+[`Correlation`](api/scala/index.html#org.apache.spark.ml.stat.Correlation$)
+computes the correlation matrix for the input Dataset of Vectors using the specified method.
+The output will be a DataFrame that contains the correlation matrix of the column of vectors.
+
+{% include_example scala/org/apache/spark/examples/ml/CorrelationExample.scala %}
+
+
+
+[`Correlation`](api/java/org/apache/spark/ml/stat/Correlation.html)
+computes the correlation matrix for the input Dataset of Vectors using the specified method.
+The output will be a DataFrame that contains the correlation matrix of the column of vectors.
+
+{% include_example java/org/apache/spark/examples/ml/JavaCorrelationExample.java %}
+
+
+
+[`Correlation`](api/python/pyspark.ml.html#pyspark.ml.stat.Correlation)
+computes the correlation matrix for the input Dataset of Vectors using the specified method.
+The output will be a DataFrame that contains the correlation matrix of the column of vectors.
+
+{% include_example python/ml/correlation_example.py %}
+
+
+
+
+## Hypothesis testing
+
+Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
+significant, that is, whether or not it occurred by chance. `spark.ml` currently supports Pearson's
+Chi-squared ($\chi^2$) tests for independence.
+
+`ChiSquareTest` conducts Pearson's independence test for every feature against the label.
+For each feature, the (feature, label) pairs are converted into a contingency matrix for which
+the Chi-squared statistic is computed. All label and feature values must be categorical.
+
+
+
+Refer to the [`ChiSquareTest` Scala docs](api/scala/index.html#org.apache.spark.ml.stat.ChiSquareTest$) for details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala %}
+
+
+
+Refer to the [`ChiSquareTest` Java docs](api/java/org/apache/spark/ml/stat/ChiSquareTest.html) for details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaChiSquareTestExample.java %}
+
+
+
+Refer to the [`ChiSquareTest` Python docs](api/python/pyspark.ml.html#pyspark.ml.stat.ChiSquareTest) for details on the API.
+
+{% include_example python/ml/chi_square_test_example.py %}
+
+
+
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSquareTestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSquareTestExample.java
new file mode 100644
index 0000000000000..4b39350fab9b5
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSquareTestExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.SparkSession;
+
+// $example on$
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.stat.ChiSquareTest;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+/**
+ * An example for Chi-square hypothesis testing: every feature is tested against the label
+ * and the resulting p-values, degrees of freedom and statistics are printed. Run with
+ * <pre>
+ * bin/run-example ml.JavaChiSquareTestExample
+ * </pre>
+ */
+public class JavaChiSquareTestExample {
+
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaChiSquareTestExample")
+      .getOrCreate();
+
+    // $example on$
+    List<Row> data = Arrays.asList(
+      RowFactory.create(0.0, Vectors.dense(0.5, 10.0)),
+      RowFactory.create(0.0, Vectors.dense(1.5, 20.0)),
+      RowFactory.create(1.0, Vectors.dense(1.5, 30.0)),
+      RowFactory.create(0.0, Vectors.dense(3.5, 30.0)),
+      RowFactory.create(0.0, Vectors.dense(3.5, 40.0)),
+      RowFactory.create(1.0, Vectors.dense(3.5, 40.0))
+    );
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("features", new VectorUDT(), false, Metadata.empty()),
+    });
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+    // The single result row holds, in order: pValues, degreesOfFreedom, statistics.
+    Row r = ChiSquareTest.test(df, "features", "label").head();
+    System.out.println("pValues: " + r.get(0).toString());
+    System.out.println("degreesOfFreedom: " + r.getList(1).toString());
+    System.out.println("statistics: " + r.get(2).toString());
+    // $example off$
+
+    spark.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationExample.java
new file mode 100644
index 0000000000000..2a6d62ab3fb73
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationExample.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.SparkSession;
+
+// $example on$
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.stat.Correlation;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+/**
+ * An example for computing correlation matrix: prints both the default (Pearson) and the
+ * Spearman correlation matrix of a small column of vectors. Run with
+ * <pre>
+ * bin/run-example ml.JavaCorrelationExample
+ * </pre>
+ */
+public class JavaCorrelationExample {
+
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession.builder()
+      .appName("JavaCorrelationExample")
+      .getOrCreate();
+
+    // $example on$
+    // One vector of size 4 per row, mixing sparse and dense representations.
+    List<Row> data = Arrays.asList(
+      RowFactory.create(Vectors.sparse(4, new int[]{0, 3}, new double[]{1.0, -2.0})),
+      RowFactory.create(Vectors.dense(4.0, 5.0, 0.0, 3.0)),
+      RowFactory.create(Vectors.dense(6.0, 7.0, 0.0, 8.0)),
+      RowFactory.create(Vectors.sparse(4, new int[]{0, 3}, new double[]{9.0, 1.0}))
+    );
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("features", new VectorUDT(), false, Metadata.empty()),
+    });
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+    Row r1 = Correlation.corr(df, "features").head();
+    System.out.println("Pearson correlation matrix:\n" + r1.get(0).toString());
+
+    Row r2 = Correlation.corr(df, "features", "spearman").head();
+    System.out.println("Spearman correlation matrix:\n" + r2.get(0).toString());
+    // $example off$
+
+    spark.stop();
+  }
+}
diff --git a/examples/src/main/python/ml/chi_square_test_example.py b/examples/src/main/python/ml/chi_square_test_example.py
new file mode 100644
index 0000000000000..8f25318ded00a
--- /dev/null
+++ b/examples/src/main/python/ml/chi_square_test_example.py
@@ -0,0 +1,52 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark.sql import SparkSession
+# $example on$
+from pyspark.ml.linalg import Vectors
+from pyspark.ml.stat import ChiSquareTest
+# $example off$
+
+"""
+An example for Chi-square hypothesis testing.
+Run with:
+ bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
+"""
+if __name__ == "__main__":
+ spark = SparkSession \
+ .builder \
+ .appName("ChiSquareTestExample") \
+ .getOrCreate()
+
+ # $example on$
+ data = [(0.0, Vectors.dense(0.5, 10.0)),
+ (0.0, Vectors.dense(1.5, 20.0)),
+ (1.0, Vectors.dense(1.5, 30.0)),
+ (0.0, Vectors.dense(3.5, 30.0)),
+ (0.0, Vectors.dense(3.5, 40.0)),
+ (1.0, Vectors.dense(3.5, 40.0))]
+ df = spark.createDataFrame(data, ["label", "features"])
+
+ r = ChiSquareTest.test(df, "features", "label").head()
+ print("pValues: " + str(r.pValues))
+ print("degreesOfFreedom: " + str(r.degreesOfFreedom))
+ print("statistics: " + str(r.statistics))
+ # $example off$
+
+ spark.stop()
diff --git a/examples/src/main/python/ml/correlation_example.py b/examples/src/main/python/ml/correlation_example.py
new file mode 100644
index 0000000000000..0a9d30da5a42e
--- /dev/null
+++ b/examples/src/main/python/ml/correlation_example.py
@@ -0,0 +1,51 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.linalg import Vectors
+from pyspark.ml.stat import Correlation
+# $example off$
+from pyspark.sql import SparkSession
+
+"""
+An example for computing correlation matrix.
+Run with:
+ bin/spark-submit examples/src/main/python/ml/correlation_example.py
+"""
+if __name__ == "__main__":
+ spark = SparkSession \
+ .builder \
+ .appName("CorrelationExample") \
+ .getOrCreate()
+
+ # $example on$
+ data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
+ (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
+ (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
+ (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
+ df = spark.createDataFrame(data, ["features"])
+
+ r1 = Correlation.corr(df, "features").head()
+ print("Pearson correlation matrix:\n" + str(r1[0]))
+
+ r2 = Correlation.corr(df, "features", "spearman").head()
+ print("Spearman correlation matrix:\n" + str(r2[0]))
+ # $example off$
+
+ spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala
new file mode 100644
index 0000000000000..dcee1e427ce58
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.linalg.{Vector, Vectors}
+import org.apache.spark.ml.stat.ChiSquareTest
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * Demonstrates Chi-square hypothesis testing of features against a label on a tiny dataset.
+ * To run it:
+ * {{{
+ * bin/run-example ml.ChiSquareTestExample
+ * }}}
+ */
+object ChiSquareTestExample {
+
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession.builder.appName("ChiSquareTestExample").getOrCreate()
+    // Brings the implicit conversions used by `toDF` below into scope.
+    import spark.implicits._
+
+    // $example on$
+    // Each element is a (label, features) pair.
+    val labeledPoints = Seq(
+      (0.0, Vectors.dense(0.5, 10.0)),
+      (0.0, Vectors.dense(1.5, 20.0)),
+      (1.0, Vectors.dense(1.5, 30.0)),
+      (0.0, Vectors.dense(3.5, 30.0)),
+      (0.0, Vectors.dense(3.5, 40.0)),
+      (1.0, Vectors.dense(3.5, 40.0))
+    )
+
+    val frame = labeledPoints.toDF("label", "features")
+    val result = ChiSquareTest.test(frame, "features", "label").head
+    println(s"pValues = ${result.getAs[Vector](0)}")
+    println(s"degreesOfFreedom = ${result.getSeq[Int](1).mkString("[", ",", "]")}")
+    println(s"statistics = ${result.getAs[Vector](2)}")
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala
new file mode 100644
index 0000000000000..3f57dc342eb00
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.linalg.{Matrix, Vectors}
+import org.apache.spark.ml.stat.Correlation
+import org.apache.spark.sql.Row
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * Shows how to compute Pearson and Spearman correlation matrices over a vector column.
+ * To run it:
+ * {{{
+ * bin/run-example ml.CorrelationExample
+ * }}}
+ */
+object CorrelationExample {
+
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession.builder.appName("CorrelationExample").getOrCreate()
+    // Brings the implicit conversions used by `toDF` below into scope.
+    import spark.implicits._
+
+    // $example on$
+    // A mix of sparse and dense vectors, all of size 4.
+    val vectors = Seq(
+      Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
+      Vectors.dense(4.0, 5.0, 0.0, 3.0),
+      Vectors.dense(6.0, 7.0, 0.0, 8.0),
+      Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
+    )
+
+    // Wrap each vector in a Tuple1 so it becomes a single-column row.
+    val frame = vectors.map(Tuple1.apply).toDF("features")
+    val Row(pearson: Matrix) = Correlation.corr(frame, "features").head
+    println(s"Pearson correlation matrix:\n$pearson")
+
+    val Row(spearman: Matrix) = Correlation.corr(frame, "features", "spearman").head
+    println(s"Spearman correlation matrix:\n$spearman")
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println