### 4.1 基本统计

#### 一. 相关性

In [4]:
val myspark = spark
import myspark.implicits._
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

// Four sample 4-dimensional feature vectors: two sparse (non-zero entries at
// indices 0 and 3) and two dense.
val data = Seq(
  Vectors.sparse(4, Array(0, 3), Array(1.0, -2.0)),
  Vectors.dense(4.0, 5.0, 0.0, 3.0),
  Vectors.dense(6.0, 7.0, 0.0, 8.0),
  Vectors.sparse(4, Array(0, 3), Array(9.0, 1.0))
)
// Wrap every vector in a Tuple1 and expose the result as a DataFrame with a
// single column named "features".
val df = data.map(vec => Tuple1(vec)).toDF("features")
df.show()

+--------------------+
|            features|
+--------------------+
|(4,[0,3],[1.0,-2.0])|
|   [4.0,5.0,0.0,3.0]|
|   [6.0,7.0,0.0,8.0]|
| (4,[0,3],[9.0,1.0])|
+--------------------+



In [3]:
// Correlation.corr yields a DataFrame; the matrix is read from the first field
// of its first row. Pearson is the method used when none is named.
val coeff1: Matrix = Correlation.corr(df, "features", "pearson").head.getAs[Matrix](0)
val coeff2: Matrix = Correlation.corr(df, "features", "spearman").head.getAs[Matrix](0)

println(s"Pearson correlation matrix:\n $coeff1")
println(s"Spearman correlation matrix:\n $coeff2")

Pearson correlation matrix:
 1.0                   0.055641488407465814  NaN  0.4004714203168137  
0.055641488407465814  1.0                   NaN  0.9135958615342522  
NaN                   NaN                   1.0  NaN                 
0.4004714203168137    0.9135958615342522    NaN  1.0                 
Spearman correlation matrix:
 1.0                  0.10540925533894532  NaN  0.40000000000000174  
0.10540925533894532  1.0                  NaN  0.9486832980505141   
NaN                  NaN                  1.0  NaN                  
0.40000000000000174  0.9486832980505141   NaN  1.0                  


#### 二. 卡方检验

In [16]:
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.ChiSquareTest

// (label, features) rows used as input for the chi-square test.
val data = Seq(
  0.0 -> Vectors.dense(0.5, 10.0),
  0.0 -> Vectors.dense(1.5, 20.0),
  1.0 -> Vectors.dense(1.5, 30.0),
  0.0 -> Vectors.dense(3.5, 30.0),
  0.0 -> Vectors.dense(3.5, 40.0),
  1.0 -> Vectors.dense(3.5, -40.0)
)
val df = data.toDF("label", "features")

// Test the "features" column against "label"; only the first result row is read.
val chi = ChiSquareTest.test(df, "features", "label").head

// Fields are read by position: 0 -> p-values, 1 -> degrees of freedom, 2 -> statistics.
println(s"pValues = ${chi.getAs[Vector](0)}")
println(s"degreesOfFreedom ${chi.getSeq[Int](1).mkString("[", ",", "]")}")
println(s"statistics ${chi.getAs[Vector](2)}")

pValues = [0.6872892787909721,0.44089552967916945]
degreesOfFreedom [2,4]
statistics [0.75,3.7500000000000004]


### 4.2 Pipeline

#### 一. Example: Estimator, Transformer, and Param

In [None]:
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.Row

// Training data as (label, features) rows.
val data = Seq(
  (1.0, Vectors.dense(0.0, 1.1, 0.1)),
  (0.0, Vectors.dense(2.0, 1.0, -1.0)),
  (0.0, Vectors.dense(2.0, 1.3, 1.0)),
  (1.0, Vectors.dense(0.0, 1.2, -0.5))
).toDF("label", "features")

// Create the estimator and print the description of its parameters.
val clf = new LogisticRegression()
println(s"LogisticRegression parameters:\n ${clf.explainParams()}\n")

// Configure the estimator through its setter methods.
clf.setMaxIter(10).
    setRegParam(0.01)

val model1 = clf.fit(data)
// This prints the parameter (name: value) pairs, where names are unique IDs for this LogisticRegression instance.
println(s"Model 1 was fit using parameters: ${model1.parent.extractParamMap()}")

// Alternatively, specify parameters with a ParamMap.
val paramMap = ParamMap(clf.maxIter -> 20).
                    put(clf.maxIter -> 30).  // Specify 1 Param. This overwrites the original maxIter.
                    put(clf.regParam -> 0.1, clf.threshold -> 0.55)  // Specify multiple Params.

// ParamMaps can be combined.
val paramMap2 = ParamMap(clf.probabilityCol -> "myProbability")  // Rename the probability output column.
val paramMapCombine = paramMap ++ paramMap2

// Parameters passed to clf.fit override the ones previously set on clf.
val model2 = clf.fit(data, paramMapCombine)
println(s"Model 2 was fit using parameters: ${model2.parent.extractParamMap()}")


// Prepare test data.
val test_data = spark.createDataFrame(Seq(
  (1.0, Vectors.dense(-1.0, 1.5, 1.3)),
  (0.0, Vectors.dense(3.0, 2.0, -0.1)),
  (1.0, Vectors.dense(0.0, 2.2, -1.5))
)).toDF("label", "features")
// Make predictions on test data using the Transformer.transform() method.
// LogisticRegression.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column since we renamed the clf.probabilityCol parameter previously.
model2.transform(test_data).
    select("features", "label", "myProbability", "prediction").
    collect().
    foreach {
        // A pattern-matching function literal must be enclosed in braces.
        case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
            println(s"($features, $label) -> prob=$prob, prediction=$prediction")
    }

#### 二. Example: Pipeline

In [1]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

// Training documents as (id, text, label) rows.
val data = Seq(
  (0L, "a b c d e spark", 1.0),
  (1L, "b d", 0.0),
  (2L, "spark f g h", 1.0),
  (3L, "hadoop mapreduce", 0.0)
).toDF("id", "text", "label")

// Stage 1: reads the "text" column and writes its result to "word".
val tokenizer = new Tokenizer().
    setOutputCol("word").
    setInputCol("text")
// Stage 2: reads the tokenizer's output column and writes a "features" column
// configured with 1000 features.
val hashingTF = new HashingTF().
    setInputCol(tokenizer.getOutputCol).
    setOutputCol("features").
    setNumFeatures(1000)
// Stage 3: logistic regression with 10 iterations at most and regParam 0.001.
val lr = new LogisticRegression().
    setRegParam(0.001).
    setMaxIter(10)

// Chain the three stages in order and fit the whole pipeline on the training data.
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
val model = pipeline.fit(data)

data = [id: bigint, text: string ... 1 more field]
tokenizer = tok_c0b3bee4beb2
hashingTF = hashingTF_58319b374a93
lr = logreg_c88c60484ad2
pipeline = pipeline_626438889f4a
model = pipeline_626438889f4a


pipeline_626438889f4a