## 7 个Spark编程练习题

In [14]:
import org.apache.spark.sql.SparkSession

// Obtain (or reuse) the application's SparkSession, with Hive support enabled.
val spark =
  SparkSession.builder()
    .appName("Spark SQL basic example")
    .enableHiveSupport()
    .getOrCreate()

// Brings the implicit conversions into scope that allow RDDs and local
// collections to be turned into DataFrames (e.g. `.toDF`).
import spark.implicits._

spark = org.apache.spark.sql.SparkSession@69155c2c


org.apache.spark.sql.SparkSession@69155c2c

### 一，求平均值

In [232]:
// Input for the "compute the average" exercise: nine random integers in [0, 100).
import scala.util.Random
val data = List.fill(9)(Random.nextInt(100))

data = List(51, 52, 25, 16, 78, 81, 15, 38, 29)


List(51, 52, 25, 16, 78, 81, 15, 38, 29)

RDD编程实现

In [233]:
// RDD implementation of the average.
val rdd = sc.parallelize(data, 5)
// Sum and count are gathered in a single pass with `aggregate`, instead of
// running one job for `reduce` and a second one for `count`. Unlike `reduce`,
// `aggregate` also works on an empty RDD, in which case the mean is NaN.
val mean = {
  val (total, count) = rdd.aggregate((0.0, 0L))(
    (acc, value) => (acc._1 + value, acc._2 + 1L),
    (left, right) => (left._1 + right._1, left._2 + right._2))
  if (count == 0L) Double.NaN else total / count
}
println(mean)

42.77777777777778


rdd = ParallelCollectionRDD[957] at parallelize at <console>:65
mean = 42.77777777777778


42.77777777777778

In [234]:
// Spark SQL implementation of the average: a global `avg` aggregate over the
// single "value" column.
val df = data.toDF("value")
df.agg(Map("value" -> "avg")).show()

+-----------------+
|       avg(value)|
+-----------------+
|42.77777777777778|
+-----------------+



df = [value: int]


[value: int]

### 二， WordCount统计词频

In [36]:
// Path of the text input whose word frequencies are counted below.
val file: String = "wordcount"

file = wordcount


wordcount

In [26]:
// RDD implementation of word count.
val rdd = sc.textFile(file)
// Split on any run of whitespace and drop empty tokens: splitting on a single
// space turns blank lines and repeated spaces into "" entries, which would
// then be counted as a word.
rdd.flatMap(_.trim.split("\\s+").filter(_.nonEmpty))
  .map((_, 1))
  .reduceByKey(_ + _)
  .collect

rdd = wordcount MapPartitionsRDD[66] at textFile at <console>:42


Array((BeiJing,2), (hello,5), (XiCheng,1), (world,1), (China,2), (TianAnMen,1))

In [64]:
// Spark SQL implementation of word count.
// Read the input as plain text (one row per line, single string column
// "value"). The CSV reader is not suitable here: a line containing a comma is
// split into several columns, and a blank line yields a null cell.
val df = spark.read.text(file)
// Tokenise on runs of whitespace, discard empty tokens, then count per word.
df.as[String]
  .flatMap(_.trim.split("\\s+").filter(_.nonEmpty))
  .groupBy("value")
  .count
  .show

+---------+-----+
|    value|count|
+---------+-----+
|  BeiJing|    2|
|    hello|    5|
|    China|    2|
|    world|    1|
|  XiCheng|    1|
|TianAnMen|    1|
+---------+-----+



df = [value: string]


[value: string]

### 三，求TopN 

In [42]:
// Student records as (name, age, score).
// Task: find the three students with the highest score.
val students: List[(String, Int, Int)] = List(
  ("LiLei", 18, 87),
  ("HanMeiMei", 16, 77),
  ("DaChui", 16, 66),
  ("Jim", 18, 80),
  ("RuHua", 20, 50)
)

students = List((LiLei,18,87), (HanMeiMei,16,77), (DaChui,16,66), (Jim,18,80), (RuHua,20,50))


List((LiLei,18,87), (HanMeiMei,16,77), (DaChui,16,66), (Jim,18,80), (RuHua,20,50))

In [43]:
// RDD implementation of Top-N.
val rdd = sc.parallelize(students)
// `top` keeps only the 3 largest elements per partition and merges them on the
// driver, so no full distributed sort is needed. The result is returned in
// descending score order.
rdd.top(3)(Ordering.by[(String, Int, Int), Int](_._3))

rdd = ParallelCollectionRDD[174] at parallelize at <console>:42


Array((LiLei,18,87), (Jim,18,80), (HanMeiMei,16,77))

In [46]:
// Spark SQL implementation of Top-N: order by score descending and display
// the first three rows.
val df = students.toDF("name", "age", "score")
df.sort($"score".desc).show(3)

+---------+---+-----+
|     name|age|score|
+---------+---+-----+
|    LiLei| 18|   87|
|      Jim| 18|   80|
|HanMeiMei| 16|   77|
+---------+---+-----+
only showing top 3 rows



df = [name: string, age: int ... 1 more field]


[name: string, age: int ... 1 more field]

### 四，求最大值最小值

In [65]:
// Input for the "find the maximum and minimum" exercise.
val data: List[Int] = List(1, 7, 8, 5, 3, 18, 34, 23, 67, 53, 9, 0, 12, 8)

data = List(1, 7, 8, 5, 3, 18, 34, 23, 67, 53, 9, 0, 12, 8)


List(1, 7, 8, 5, 3, 18, 34, 23, 67, 53, 9, 0, 12, 8)

In [66]:
// RDD implementation, approach 1: two independent reductions, one keeping the
// larger and one keeping the smaller element of each pair.
val rdd = sc.parallelize(data, 3)

val max_value = rdd.reduce(_ max _)
val min_value = rdd.reduce(_ min _)

println("max_value:" + max_value)
println("min_value:" + min_value)

max_value:67
min_value:0


rdd = ParallelCollectionRDD[354] at parallelize at <console>:43
max_value = 67
min_value = 0


0

In [70]:
// RDD implementation, approach 2: compute a (min, max) pair per partition in
// one pass, then merge the per-partition pairs.
val rdd = sc.parallelize(data, 3)
val temp = rdd.mapPartitions { values =>
  // Start from (Int max, Int min) so that any element replaces both bounds.
  val extremes = values.foldLeft((Integer.MAX_VALUE, Integer.MIN_VALUE)) {
    case ((lo, hi), x) => (if (x < lo) x else lo, if (x > hi) x else hi)
  }
  Iterator(extremes)
}
val result = temp.reduce { case ((lo1, hi1), (lo2, hi2)) =>
  (math.min(lo1, lo2), math.max(hi1, hi2))
}


rdd = ParallelCollectionRDD[357] at parallelize at <console>:43
temp = MapPartitionsRDD[358] at mapPartitions at <console>:44
result = (0,67)


(0,67)

In [72]:
// Spark SQL implementation: a single global aggregation producing both the
// maximum and the minimum of the "value" column.
import org.apache.spark.sql.functions._
val df = data.toDF("value")
df.agg(max($"value").as("max_value"), min($"value").as("min_value")).show()

+---------+---------+
|max_value|min_value|
+---------+---------+
|       67|        0|
+---------+---------+



df = [value: int]


[value: int]

### 五，排序并返回序号

In [95]:
// Task: sort the values and return each one together with its rank index.
val data: List[Int] = List(1, 7, 8, 5, 3, 18, 34, 9, 0, 12, 8)

data = List(1, 7, 8, 5, 3, 18, 34, 9, 0, 12, 8)


List(1, 7, 8, 5, 3, 18, 34, 9, 0, 12, 8)

In [125]:
// RDD implementation, approach 1: zip a 0-based index RDD with the sorted values.
val rdd = sc.parallelize(data, 3)
val len = rdd.count
// Sort straight into a single partition. Sorting first and calling
// repartition(1) afterwards re-shuffles the sorted data, and a shuffle does
// not guarantee the order in which elements arrive.
val sortedrdd = rdd.sortBy(value => value, ascending = true, numPartitions = 1)
// `zip` needs both RDDs to have the same number of partitions and the same
// number of elements per partition, hence the single-partition index RDD.
val index = sc.parallelize(0 until len.toInt, 1)
index.zip(sortedrdd).collect

rdd = ParallelCollectionRDD[587] at parallelize at <console>:60
len = 11
sortedrdd = MapPartitionsRDD[596] at repartition at <console>:62
index = ParallelCollectionRDD[597] at parallelize at <console>:63


Array((0,0), (1,1), (2,3), (3,5), (4,7), (5,8), (6,8), (7,9), (8,12), (9,18), (10,34))

In [128]:
// RDD implementation, approach 2: number the sorted values inside the partition.
val rdd = sc.parallelize(data, 3)
// Sort straight into one partition so the element order is guaranteed
// (a repartition after sorting would shuffle the data again).
val sortedrdd = rdd.sortBy(value => value, ascending = true, numPartitions = 1)
// Number the elements with the partition iterator's own zipWithIndex rather
// than by incrementing a driver-side `var` from inside the closure: closures
// are serialized and shipped to tasks, so such a counter is updated on a copy
// and the driver variable is never changed.
sortedrdd.mapPartitions(_.zipWithIndex.map { case (value, position) => (position, value) }).collect

rdd = ParallelCollectionRDD[621] at parallelize at <console>:58
sortedrdd = MapPartitionsRDD[630] at repartition at <console>:59
idx = -1


Array((0,0), (1,1), (2,3), (3,5), (4,7), (5,8), (6,8), (7,9), (8,12), (9,18), (10,34))

In [134]:
// RDD implementation, approach 3: sort, then let zipWithIndex assign positions.
// The index is shifted by one, so the ranks produced here start at 1.
val rdd = sc.parallelize(data, 3)
val result = rdd
    .sortBy(value => value)
    .zipWithIndex()
    .map { case (value, position) => (position + 1, value) }
    .collect

rdd = ParallelCollectionRDD[670] at parallelize at <console>:62
result = Array((1,0), (2,1), (3,3), (4,5), (5,7), (6,8), (7,8), (8,9), (9,12), (10,18), (11,34))


Array((1,0), (2,1), (3,3), (4,5), (5,7), (6,8), (7,8), (8,9), (9,12), (10,18), (11,34))

In [132]:
// Spark SQL implementation, approach 4: row_number() over a window ordered by
// value; subtracting 1 makes the index 0-based.
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number

val df = data.toDF("value").sort("value")
val w = Window.orderBy($"value")

val result = df.select($"value", (row_number().over(w) - 1).as("index"))
result.show()

+-----+-----+
|value|index|
+-----+-----+
|    0|    0|
|    1|    1|
|    3|    2|
|    5|    3|
|    7|    4|
|    8|    5|
|    8|    6|
|    9|    7|
|   12|    8|
|   18|    9|
|   34|   10|
+-----+-----+



df = [value: int]
w = org.apache.spark.sql.expressions.WindowSpec@5aecc773
result = [value: int, index: int]


[value: int, index: int]

In [183]:
// Spark SQL implementation, approach 5: sort as a DataFrame, drop down to the
// underlying RDD to attach positions with zipWithIndex, then convert back.

val df = data.toDF("value").sort("value")
val rdd = df.rdd
  .map(_.getInt(0))
  .zipWithIndex()
  .map { case (value, position) => (position, value) }
rdd.toDF("idx", "value").show()

+---+-----+
|idx|value|
+---+-----+
|  0|    0|
|  1|    1|
|  2|    3|
|  3|    5|
|  4|    7|
|  5|    8|
|  6|    8|
|  7|    9|
|  8|   12|
|  9|   18|
| 10|   34|
+---+-----+



df = [value: int]
rdd = MapPartitionsRDD[856] at map at <console>:63


MapPartitionsRDD[856] at map at <console>:63

In [212]:
// Spark SQL implementation, approach 6: number the rows with a stateful UDF.
// NOTE(review): the UDF mutates `idx`, a variable defined outside of it, so the
// indices depend on the order and the place in which rows are evaluated.
// Presumably this only yields contiguous 0-based indices when every row is
// processed in sorted order inside a single JVM — confirm before relying on it
// on a cluster or with more than one partition.

val df = data.toDF("value").sort("value")
// Counter shared by every invocation of index(); starts at -1 so the first
// call returns 0.
var idx = -1
// Increments the shared counter and returns its new value.
def index():Int ={
    idx = idx+1
    idx
}
// The UDF ignores its argument; it is passed a column only so that it is
// invoked once per row.
spark.udf.register("index",(x:Any) => index())
df.selectExpr("index(value) as idx","value").show()

+---+-----+
|idx|value|
+---+-----+
|  0|    0|
|  1|    1|
|  2|    3|
|  3|    5|
|  4|    7|
|  5|    8|
|  6|    8|
|  7|    9|
|  8|   12|
|  9|   18|
| 10|   34|
+---+-----+



df = [value: int]
idx = 10


index: ()Int


10

### 六，二次排序

In [224]:
// Secondary sort: order students by score, highest first;
// when scores are equal, order by age, oldest first.
// Record layout: (name, age, score)
val students: List[(String, Int, Int)] = List(
  ("LiLei", 16, 87),
  ("HanMeiMei", 17, 87),
  ("DaChui", 16, 77),
  ("RuHua", 18, 50)
)



students = List((LiLei,16,87), (HanMeiMei,17,87), (DaChui,16,77), (RuHua,18,50))


List((LiLei,16,87), (HanMeiMei,17,87), (DaChui,16,77), (RuHua,18,50))

In [230]:
// RDD implementation of the secondary sort.

/** A student record whose natural ordering is by score, then by age.
  *
  * @param name  student name (not part of the ordering)
  * @param age   tie-breaker when two scores are equal
  * @param score primary sort key
  */
case class Student(name: String, age: Int, score: Int)
    extends Ordered[Student] with Serializable {

  /** Compares by score first and by age second.
    *
    * Uses Integer.compare instead of subtraction: a difference such as
    * Int.MaxValue - (-1) overflows and flips the sign of the result.
    *
    * @return a negative, zero or positive value when this student sorts
    *         before, equal to, or after `other`
    */
  override def compare(other: Student): Int = {
    val byScore = Integer.compare(this.score, other.score)
    if (byScore != 0) byScore else Integer.compare(this.age, other.age)
  }
}

// Wrap each tuple in a Student so its natural ordering drives the sort, then
// print the records from highest to lowest.
val rdd = sc.parallelize(students).map { case (name, age, score) => Student(name, age, score) }
rdd.sortBy(student => student, ascending = false).collect().foreach(println)

Student(HanMeiMei,17,87)
Student(LiLei,16,87)
Student(DaChui,16,77)
Student(RuHua,18,50)


defined class Student
rdd = MapPartitionsRDD[947] at map at <console>:35


MapPartitionsRDD[947] at map at <console>:35

In [231]:
// Spark SQL implementation of the secondary sort: score descending, then age
// descending.

val df = students.toDF("name", "age", "score")
df.orderBy(df("score").desc, df("age").desc).show()

+---------+---+-----+
|     name|age|score|
+---------+---+-----+
|HanMeiMei| 17|   87|
|    LiLei| 16|   87|
|   DaChui| 16|   77|
|    RuHua| 18|   50|
+---------+---+-----+



df = [name: string, age: int ... 1 more field]


[name: string, age: int ... 1 more field]

### 七，连接操作

In [238]:
// Given a class roster and a score table, find the classes whose average
// score is above 75.

// The roster holds (cls, name); the score table holds (name, score).
val classes: List[(String, String)] = List(
  ("class1", "LiLei"),
  ("class1", "HanMeiMei"),
  ("class2", "DaChui"),
  ("class2", "RuHua")
)
val scores: List[(String, Int)] = List(
  ("LiLei", 76),
  ("HanMeiMei", 80),
  ("DaChui", 70),
  ("RuHua", 60)
)


classes = List((class1,LiLei), (class1,HanMeiMei), (class2,DaChui), (class2,RuHua))
scores = List((LiLei,76), (HanMeiMei,80), (DaChui,70), (RuHua,60))


List((LiLei,76), (HanMeiMei,80), (DaChui,70), (RuHua,60))

In [273]:
// RDD implementation.
// Key the roster by student name so it can be joined with the scores.
val classesRDD = sc.parallelize(classes).map { case (cls, name) => (name, cls) }
val scoresRDD = sc.parallelize(scores)

// After the join each record is (name, (score, cls)); keep only (cls, score).
val joinedRDD = scoresRDD.join(classesRDD).map { case (_, (score, cls)) => (cls, score) }

/** Arithmetic mean of a list of integers, computed in floating point.
  *
  * @param l the values to average
  * @return the sum of the values divided by their count; NaN for an empty list
  */
def mean(l:List[Int]):Double =
  l.map(_.toDouble).sum / l.length

// Average score per class, keeping only classes whose average is above 75.
// aggregateByKey builds a running (sum, count) per class and combines it
// map-side, instead of collecting every score of a class into memory the way
// groupByKey does.
joinedRDD.aggregateByKey((0.0, 0))(
           (acc, score) => (acc._1 + score, acc._2 + 1),
           (left, right) => (left._1 + right._1, left._2 + right._2))
         .mapValues { case (total, count) => total / count }
         .filter { case (_, average) => average > 75 }
         .collect

classesRDD = MapPartitionsRDD[1026] at map at <console>:68
scoresRDD = ParallelCollectionRDD[1027] at parallelize at <console>:69
joinedRDD = MapPartitionsRDD[1031] at map at <console>:71


mean: (l: List[Int])Double


Array((class1,78.0))

In [283]:
// Spark SQL implementation: join roster and scores on the student name,
// average the score per class, and keep the classes averaging above 75.

val dfcls = classes.toDF("cls", "name")
val dfscore = scores.toDF("name", "score")

dfcls.join(dfscore, Seq("name"))
     .groupBy("cls")
     .agg(Map("score" -> "avg"))
     .filter("avg(score)>75.0")
     .show()

+------+----------+
|   cls|avg(score)|
+------+----------+
|class1|      78.0|
+------+----------+



dfcls = [cls: string, name: string]
dfscore = [name: string, score: int]


[name: string, score: int]