In [6]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [9]:
# 200 evenly spaced sample points from 0 to 10 (both endpoints included)
x = np.linspace(start=0, stop=10, num=200)

In [14]:
# Oscillation sin(5x) damped by the factor 1 / (1 + x^2)
y = np.sin(5 * x) / (1 + x * x)

In [16]:
from pyspark import SparkContext

# Only one SparkContext may be active per process: calling SparkContext() again
# (e.g. when this cell is re-run) raises an error, so reuse the active context
# if there is one and create it otherwise.
sc = SparkContext.getOrCreate()
# Step 1: build intRDD with SparkContext.parallelize() (labelled T1 in this notebook)
intRDD = sc.parallelize([3, 1, 2, 5, 5])  # evaluated lazily: nothing runs until an action is called

In [17]:
intRDD.collect() #collect() is an action: it triggers T1 and returns the input list as the output

[3, 1, 2, 5, 5]

In [18]:
# Step 2: build stringRDD from a list of fruit names
stringRDD = sc.parallelize(
    ["Apple", "Orange", "Banana", "Grape", "Apple"]
)  # T2
stringRDD.collect()  # A2

['Apple', 'Orange', 'Banana', 'Grape', 'Apple']

In [19]:
#step3. map operation
#map applies the function passed in to every element and produces another RDD
#Example using addOne() --> add one to every element

In [20]:
# Step 3.1: define the addOne() function
def addOne(y):
    """Return ``y`` increased by one."""
    incremented = y + 1
    return incremented

In [21]:
#step3.2 pass the function name addOne as the argument to map()
intRDD.map(addOne).collect()   #intRDD (T1), map (T2), collect() --> action

[4, 2, 3, 6, 6]

In [22]:
#step4. map operation using a lambda function

In [24]:
# The addOne() function from step 3.1, written inline as a lambda passed to map()
intRDD.map(lambda value: value + 1).collect()

[4, 2, 3, 6, 6]

In [25]:
#step5. map operation on strings

In [26]:
# capitalize() upper-cases the first character and lower-cases the rest; the
# sample strings are already in that form, so the result equals the input.
stringRDD.map(lambda word: word.capitalize()).collect()

['Apple', 'Orange', 'Banana', 'Grape', 'Apple']

In [27]:
#step6. filter operation on numbers
#filter tests every element of the RDD and produces the next RDD from those that pass

In [28]:
# Keep only the elements strictly between 2 and 5
intRDD.filter(lambda n: n > 2 and n < 5).collect()

[3]

In [29]:
# Keep only the elements equal to 3
intRDD.filter(lambda n: n == 3).collect()

[3]

In [30]:
#step7. filter operation on strings
#filter tests every string element of stringRDD and produces the next RDD from those that pass

In [31]:
# Keep only the strings longer than 5 characters
stringRDD.filter(lambda word: len(word) > 5).collect()

['Orange', 'Banana']

In [32]:
#step8. distinct operation

In [33]:
# Show stringRDD unchanged (it contains "Apple" twice) for comparison with distinct()
stringRDD.collect()

['Apple', 'Orange', 'Banana', 'Grape', 'Apple']

In [34]:
# distinct() removes duplicate elements; in the recorded output the order of the
# result differs from the input order.
stringRDD.distinct().collect()

['Orange', 'Grape', 'Apple', 'Banana']

In [35]:
#step9. randomSplit operation
#pyspark.RDD.randomSplit(weights, seed=None),
#Randomly splits this RDD with the provided weights.
#Returns: split RDDs in a list

In [36]:
# Show intRDD before it is split
intRDD.collect()

[3, 1, 2, 5, 5]

In [37]:
# Randomly split intRDD into two RDDs using weights 4:6.
# The fixed seed makes the split repeatable when the notebook is re-run; without
# it every execution can produce a different split.
splRDD = intRDD.randomSplit([0.4, 0.6], seed=42)

In [38]:
splRDD[0].collect() #the first RDD

[3]

In [39]:
splRDD[1].collect() #the second RDD

[1, 2, 5, 5]

In [40]:
#step10. group operation --> groupBy()
#Splits the data into several lists according to the rule given by the lambda function

In [41]:
# Example 1: split intRDD into odd and even numbers.
# collect() is called directly, so gRDD is a plain Python list of (key, values)
# pairs rather than an RDD.
# groupBy() does not guarantee the order of the groups, so the pairs are sorted
# by key: index 0 is always "even" and index 1 is always "odd".
gRDD = sorted(
    intRDD.groupBy(lambda x: "odd" if x % 2 == 1 else "even").collect(),
    key=lambda group: group[0],
)

In [42]:
type(gRDD) #gRDD --> list of tuples

list

In [43]:
# Types of the first group: the pair itself, its key, and its values
tuple(type(obj) for obj in (gRDD[0], gRDD[0][0], gRDD[0][1]))

(tuple, str, pyspark.resultiterable.ResultIterable)

In [44]:
print(gRDD[0][0],list(gRDD[0][1])) #list() converts the Iterable into a list object

even [2]


In [45]:
print(gRDD[1][0],sorted(gRDD[1][1])) #sorted() returns the group's values as a list in ascending order

odd [1, 3, 5, 5]
