In [1]:
from pyspark.sql import SparkSession 

In [2]:
# Obtain a session that runs against the "local" master under the app name
# "pyspark-command-1", then display its SparkContext as the cell output.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-command-1")
    .getOrCreate()
)
spark.sparkContext

In [4]:
# Short alias for the session's SparkContext; the RDD examples in this notebook
# build their input data through sc.parallelize(...).
sc = spark.sparkContext

# map

In [21]:
x=sc.parallelize([1,2,3])

In [22]:
# Pair every element with its square. The lambda parameter is named n so that
# it does not shadow the RDD variable x.
y = x.map(lambda n: (n, n ** 2))

In [23]:
print(x.collect())

[1, 2, 3]


In [24]:
print(y.collect())

[(1, 1), (2, 4), (3, 9)]


# flatMap

In [25]:
x = sc.parallelize([1, 2, 3])
# Each element produces a 3-tuple; flatMap flattens those tuples so the result
# is a single flat sequence of values rather than a sequence of tuples.
y = x.flatMap(lambda n: (n, 100 * n, n ** 2))

In [26]:
print(x.collect())

[1, 2, 3]


In [27]:
print(y.collect())

[1, 100, 1, 2, 200, 4, 3, 300, 9]


# mapPartitions

In [28]:
x=sc.parallelize([1,2,3],2)
def f(iterator):
    """Reduce one partition to a single value.

    Args:
        iterator: iterable over the elements of one partition.

    Yields:
        Exactly one item: the sum of all elements in ``iterator``
        (0 when the iterator is empty).
    """
    partition_total = sum(iterator)
    yield partition_total

In [29]:
# Apply f to each partition of x; f yields one value (the sum) per partition.
y=x.mapPartitions(f)


In [33]:
# glom() shows the values held inside each partition
print(x.glom().collect())

[[1], [2, 3]]


In [34]:
# The elements of each partition have been added together by sum()
print(y.glom().collect())

[[1], [5]]


# mapPartitionsWithIndex

In [40]:
x=sc.parallelize([1,2,3,4,5],3)

In [41]:
def f(partitionIndex, iterator):
    """Reduce one partition to an (index, sum) pair.

    Args:
        partitionIndex: index of the partition being processed.
        iterator: iterable over the elements of that partition.

    Yields:
        Exactly one tuple ``(partitionIndex, sum of the partition's elements)``.
    """
    partition_total = sum(iterator)
    yield partitionIndex, partition_total

In [42]:
# Apply f to each partition of x together with that partition's index;
# f yields one (index, sum) tuple per partition.
y=x.mapPartitionsWithIndex(f)

In [43]:
print(x.glom().collect())

[[1], [2, 3], [4, 5]]


In [45]:
# Each result carries its partition index (numbering starts at 0)
print(y.glom().collect())

[[(0, 1)], [(1, 5)], [(2, 9)]]


# getNumPartitions

In [46]:
x=sc.parallelize([1,2,3],2)

In [47]:
# y is the number of partitions of x (a plain number, printed as 2 further down),
# matching the 2 requested in sc.parallelize above.
y=x.getNumPartitions()

In [48]:
print(x.glom().collect())

[[1], [2, 3]]


In [49]:
print(y)

2


# filter

In [50]:
x=sc.parallelize([1,2,3])

In [54]:
# Keep only the odd numbers
y = x.filter(lambda n: n % 2 == 1)

In [52]:
print(x.collect())

[1, 2, 3]


In [53]:
print(y.collect())

[1, 3]


# distinct

In [55]:
x=sc.parallelize(['A','A','B'])

In [56]:
# Drop duplicate elements: the repeated 'A' in x appears only once in y.
y=x.distinct()

In [57]:
print(x.collect())

['A', 'A', 'B']


In [58]:
print(y.collect())

['A', 'B']


# sample

In [75]:
# Whether the range is wrapped in square brackets makes a difference:
#   sc.parallelize([range(7)]) is given a one-element list whose only item is
#   the range object itself, whereas passing the range directly (as done here)
#   distributes its individual integers 0..6.
x = sc.parallelize(range(7))
x.collect()

[0, 1, 2, 3, 4, 5, 6]

In [76]:
# Build five samples of x without replacement. fraction=0.5 does not fix the
# sample size: the recorded samples below range from 0 to 4 elements.
# No seed is passed to sample().
ylist = [x.sample(withReplacement=False, fraction=0.5) for _ in range(5)]

In [77]:
print('x=' + str(x.collect()))

x=[0, 1, 2, 3, 4, 5, 6]


In [78]:
# Print each sampled RDD's contents next to its position in ylist.
for cnt, y in enumerate(ylist):
    print('sample ' + str(cnt) + ' y= ' + str(y.collect()))

sample 0 y= [0, 3, 6]
sample 1 y= [3, 6]
sample 2 y= []
sample 3 y= [3, 4, 5, 6]
sample 4 y= [4, 6]


# takeSample

In [79]:
# Source RDD for the takeSample examples: the integers 0..6.
x = sc.parallelize(range(7))
x.collect()

[0, 1, 2, 3, 4, 5, 6]

In [80]:
# Take five samples of exactly num=3 elements each, without replacement.
# Each entry of ylist is printed directly further down (no collect() call),
# and shows up there as a 3-element list.
ylist = [x.takeSample(withReplacement=False, num=3) for _ in range(5)]

In [81]:
print('x= ' + str(x.collect()))

x= [0, 1, 2, 3, 4, 5, 6]


In [82]:
# Print each sample next to its position in ylist.
for cnt, y in enumerate(ylist):
    print('sample: ' + str(cnt) + ' y=' + str(y))

sample: 0 y=[0, 5, 1]
sample: 1 y=[0, 1, 2]
sample: 2 y=[3, 0, 6]
sample: 3 y=[2, 4, 5]
sample: 4 y=[2, 1, 0]


# union

In [83]:
x=sc.parallelize(['A','A','B'])
y=sc.parallelize(['D','C','A'])

In [84]:
# Concatenate x and y; duplicates are kept ('A' appears three times in z).
z=x.union(y)

In [85]:
print(x.collect())

['A', 'A', 'B']


In [86]:
print(y.collect())

['D', 'C', 'A']


In [87]:
print(z.collect())

['A', 'A', 'B', 'D', 'C', 'A']


# intersection

In [88]:
x=sc.parallelize(['A','A','B'])
y=sc.parallelize(['A','C','D'])

In [89]:
# Keep only the elements present in both x and y; the result lists 'A' once
# even though x contains it twice.
z=x.intersection(y)

In [90]:
print(z.collect())

['A']


# sortByKey

In [91]:
x=sc.parallelize([('B',1),('A',2),('C',3)])

In [92]:
# Sort the (key, value) pairs by key; with no arguments the result comes back
# in ascending key order ('A', 'B', 'C').
y=x.sortByKey()

In [93]:
print(x.collect())

[('B', 1), ('A', 2), ('C', 3)]


In [94]:
print(y.collect())

[('A', 2), ('B', 1), ('C', 3)]


In [5]:
sc.stop()

In [6]:
# Shut down the SparkSession.
# NOTE(review): SparkSession.stop() is documented to stop the underlying
# SparkContext as well, so the separate sc.stop() cell before this one is
# likely redundant — confirm against the PySpark version in use.
spark.stop()