In [1]:
from pyspark.sql import SparkSession 

# 啟用 SparkSession 與 SparkContext

* 指定主機位置: local
* 指定應用程式名稱: pyspark-command-1 (可在 4040 port 的 Spark UI 查看)

In [2]:
# Build (or reuse) a SparkSession running on a local master with the
# application name "pyspark-command-1".
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-command-1")
    .getOrCreate()
)
# Last expression of the cell: display the session's SparkContext.
spark.sparkContext

# map、collect、count、sum
* 建立 RDD
* 透過 map 處理資料
* 利用 collect 顯示 RDD 內容
* 利用 count 顯示筆數 (於後面 flatMap 小節示範)
* 利用 sum 加總

In [3]:
# Keep a short handle to the SparkContext; the RDD examples below create RDDs through it.
sc = spark.sparkContext

In [4]:
# Distribute a small list of integers as an RDD.
myRDD1 = sc.parallelize([3, 2, 1, 4, 5, 8, 9])
# map applies the function to every element; here each number is squared.
myRDD2 = myRDD1.map(lambda n: n ** 2)

In [5]:
# collect() brings every element of the RDD back to the driver as a Python list.
myRDD1.collect()

[3, 2, 1, 4, 5, 8, 9]

In [6]:
# Show the squared values produced by the map step.
myRDD2.collect()

[9, 4, 1, 16, 25, 64, 81]

In [7]:
# sum() adds up all elements of the RDD (here: the squares).
myRDD2.sum()

200

# flatMap 
* 將計算結果拉平

In [8]:
# flatMap: each input number yields three values (itself, 100x, its square),
# and the per-element tuples are flattened into a single RDD.
myRDD3 = myRDD1.flatMap(lambda n: (n, 100 * n, n ** 2))

In [9]:
# Show the flattened result: three consecutive values per original element.
myRDD3.collect()

[3,
 300,
 9,
 2,
 200,
 4,
 1,
 100,
 1,
 4,
 400,
 16,
 5,
 500,
 25,
 8,
 800,
 64,
 9,
 900,
 81]

In [10]:
# count() returns the number of elements (7 inputs x 3 values each).
myRDD3.count()

21

# filter

In [11]:
# filter keeps only the elements for which the predicate is true: the odd numbers.
myRDD4 = myRDD1.filter(lambda n: n % 2 == 1)

In [12]:
# Show the odd numbers that passed the filter.
myRDD4.collect()

[3, 1, 5, 9]

# groupByKey
* 依照 key 的內容彙總

In [13]:
# (key, value) pairs used by the by-key examples; the second argument asks
# parallelize for 3 partitions.
myRDD5 = sc.parallelize(
    [
        ("USA", 1), ("USA", 2), ("India", 1), ("UK", 1),
        ("India", 4), ("India", 9), ("USA", 8), ("USA", 3),
        ("India", 4), ("UK", 6), ("UK", 9), ("UK", 5),
        ("Taiwan", 4),
    ],
    3,
)

In [14]:
# groupByKey gathers all values that share a key into one iterable per key
# (an iterable, not a list — the print loop in the next cell materialises it).
myRDD6 = myRDD5.groupByKey()

In [15]:
# Print every key followed by its grouped values materialised as a list.
for key, grouped in myRDD6.collect():
    print(key, list(grouped))

USA [1, 2, 8, 3]
UK [1, 6, 9, 5]
Taiwan [4]
India [1, 4, 9, 4]


# reduceByKey 與 sortBy

In [16]:
# reduceByKey merges the values of each key with the given function; here they are summed.
# NOTE(review): this rebinds myRDD6, which previously held the groupByKey result;
# re-running the earlier print loop after this cell would fail because the
# values are now plain ints rather than iterables.
myRDD6 = myRDD5.reduceByKey(lambda left, right: left + right)

In [17]:
# Show the per-key totals.
myRDD6.collect()

[('USA', 14), ('UK', 21), ('Taiwan', 4), ('India', 18)]

In [18]:
# Sort the (key, total) pairs by their second element, then collect them.
# Because of the trailing collect(), myRDD7 is a plain Python list, not an RDD.
myRDD7 = (
    myRDD6
    .sortBy(lambda pair: pair[1])
    .collect()
)

In [19]:
# Display the sorted list of (key, total) pairs.
myRDD7

[('Taiwan', 4), ('USA', 14), ('India', 18), ('UK', 21)]

# first 與 take

In [20]:
# first() returns the first element itself (a bare value, not a list).
myRDD1.first()

3

In [21]:
# take(1) returns a list holding the first element, unlike first().
myRDD1.take(1)

[3]

# reduce

In [22]:
# myRDD7 is the list returned by collect(), so distribute it again as an RDD.
myRDD8 = sc.parallelize(myRDD7)

In [23]:
# Confirm the re-parallelized RDD holds the same sorted pairs.
myRDD8.collect()

[('Taiwan', 4), ('USA', 14), ('India', 18), ('UK', 21)]

In [24]:
# Pull the second element out of each pair, then reduce by pairwise addition.
# The result is a single number, not an RDD.
myRDD9 = (
    myRDD8
    .map(lambda pair: pair[1])
    .reduce(lambda acc, val: acc + val)
)

In [25]:
# Display the grand total of all per-key sums.
myRDD9

57

In [26]:
# Rewrite the first element of each pair with str.replace ("UK" -> "UUU") and
# pass the second element through unchanged.
# NOTE(review): str.replace substitutes every occurrence of the substring, so a
# key that merely contains "UK" would also be altered — confirm this is intended.
myRDD10 = myRDD8.map(lambda pair: (pair[0].replace("UK", "UUU"), pair[1]))

In [27]:
# Materialise the renamed pairs on the driver; myRDD11 is a Python list.
myRDD11 = myRDD10.collect()

In [28]:
# Display the collected list with "UK" replaced by "UUU".
myRDD11

[('Taiwan', 4), ('USA', 14), ('India', 18), ('UUU', 21)]

In [29]:
# NOTE(review): repeats the earlier myRDD1.first() cell; presumably a check that
# myRDD1 is unchanged by the transformations above — confirm or remove.
myRDD1.first()

3

In [30]:
# Shut down through the SparkSession rather than the bare SparkContext:
# SparkSession.stop() stops the underlying SparkContext and also clears the
# registered default/active session, so a later getOrCreate() starts clean.
spark.stop()