-
Notifications
You must be signed in to change notification settings - Fork 1
/
PipeLine_Example.scala
64 lines (52 loc) · 1.96 KB
/
PipeLine_Example.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package mypc.spark.codes.ml
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
/**
 * Created by maniram on 26/1/18.
 *
 * Spark ML pipeline example: builds a Tokenizer -> HashingTF -> LogisticRegression
 * pipeline, fits it on a tiny in-memory training set, saves both the unfitted
 * pipeline and the fitted model, reloads the fitted model from disk and prints
 * its predictions for a handful of unlabeled documents.
 */
object PipeLine_Example {
  // Executed when the object is initialised: only ERROR and above is logged
  // for loggers under the "org" namespace.
  Logger.getLogger("org").setLevel(Level.ERROR)

  /** Directory used for saved models when no directory is passed on the command line. */
  private val DefaultModelDir = "/home/maniram/data"

  /**
   * Program entry point.
   *
   * @param args optional; `args(0)` is the base directory under which the
   *             pipeline and the fitted model are saved. When absent, the
   *             original hard-coded directory is used.
   */
  def main(args: Array[String]): Unit = {
    val modelDir = args.headOption.getOrElse(DefaultModelDir)
    val unfitPipelinePath = s"$modelDir/PipeLine_Unfit_Model"
    val fittedModelPath = s"$modelDir/PipeLine_fitted_Model"

    val spark = SparkSession.builder()
      .appName("PipeLine_Example")
      .master("local[2]")
      .getOrCreate()

    // The session is stopped in `finally` so it is released even when
    // fitting, saving or loading throws.
    try {
      // Labeled training documents: (id, text, label).
      val training = spark.createDataFrame(Seq(
        (0L, "a b c d e spark", 1.0),
        (1L, "b d", 0.0),
        (2L, "spark f g h", 1.0),
        (3L, "hadoop mapreduce", 0.0)
      )).toDF("id", "text", "label")

      // Stage 1: "text" -> "words".
      val tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("words")

      // Stage 2: "words" -> "features", hashed into 700 features.
      val hashingTF = new HashingTF()
        .setInputCol(tokenizer.getOutputCol)
        .setNumFeatures(700)
        .setOutputCol("features")

      // Stage 3: logistic regression estimator.
      val logit = new LogisticRegression()
        .setMaxIter(800)
        .setRegParam(0.001)

      val pipeLine = new Pipeline()
        .setStages(Array(tokenizer, hashingTF, logit))

      val model = pipeLine.fit(dataset = training)

      // Persist both the unfitted pipeline and the fitted model, replacing
      // whatever a previous run left at the same locations.
      pipeLine.write.overwrite().save(unfitPipelinePath)
      model.write.overwrite().save(fittedModelPath)

      // Predictions below come from the model as read back from disk.
      val model2 = PipelineModel.load(fittedModelPath)

      // Unlabeled test documents: (id, text).
      val test = spark.createDataFrame(Seq(
        (4L, "spark i j k"),
        (5L, "l m n"),
        (6L, "spark hadoop spark"),
        (7L, "apache hadoop")
      )).toDF("id", "text")

      model2.transform(test)
        .select("id", "text", "probability", "prediction")
        .collect()
        .foreach {
          // A row that does not have this exact shape raises a MatchError.
          case Row(id: Long, text: String, prob: Vector, pred: Double) =>
            println(s"($id , $text ) => prob = $prob , Prediction = $pred")
        }
    } finally {
      spark.stop()
    }
  }
}