In [1]:
from pyspark.sql import SparkSession 

In [2]:
# Start (or reuse) a local Spark session for this notebook, then display its context.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-command-2")
    .getOrCreate()
)
spark.sparkContext

In [3]:
# Short alias for the SparkContext; the examples below create their RDDs through it.
sc=spark.sparkContext

# countByKey

In [4]:
x=sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])

In [5]:
# Tally how many pairs carry each key; the result shown below is a defaultdict, not an RDD.
y=x.countByKey()

In [7]:
y

defaultdict(int, {'B': 2, 'A': 3})

# join

In [8]:
x=sc.parallelize([('C',4),('B',3),('A',2),('A',1)])

In [9]:
y=sc.parallelize([('A',8),('B',7),('A',6),('D',5)])

In [10]:
# Inner join on key: only keys present in both x and y ('A', 'B') survive,
# with one output pair per combination of matching values.
z=x.join(y)

In [12]:
z.collect()

[('B', (3, 7)), ('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6))]

# leftOuterJoin

In [13]:
x=sc.parallelize([('C',4),('B',3),('A',2),('A',1)])

In [14]:
y=sc.parallelize([('A',8),('B',7),('A',6),('D',5)])

In [15]:
# Keep every key of x; a key with no match in y ('C') gets None on the right side.
z=x.leftOuterJoin(y)

In [16]:
# 'D' is missing from the result: it exists only in y, and the left outer join keeps x's keys.
z.collect()

[('C', (4, None)),
 ('B', (3, 7)),
 ('A', (2, 8)),
 ('A', (2, 6)),
 ('A', (1, 8)),
 ('A', (1, 6))]

# rightOuterJoin

In [17]:
x=sc.parallelize([('C',4),('B',3),('A',2),('A',1)])

In [18]:
y=sc.parallelize([('A',8),('B',7),('A',6),('D',5)])

In [19]:
# Keep every key of y; a key with no match in x ('D') gets None on the left side.
z=x.rightOuterJoin(y)

In [20]:
z.collect()

[('B', (3, 7)),
 ('A', (2, 8)),
 ('A', (2, 6)),
 ('A', (1, 8)),
 ('A', (1, 6)),
 ('D', (None, 5))]

# partitionBy

In [22]:
x=sc.parallelize([(0,1),(1,2),(2,3)],2)

In [23]:
# The partition function returns the key unchanged; with keys 0, 1, 2 and three
# partitions, each pair ends up in its own partition.
y = x.partitionBy(numPartitions=3, partitionFunc=lambda key: key)

In [24]:
x.glom().collect()

[[(0, 1)], [(1, 2), (2, 3)]]

In [25]:
y.glom().collect()

[[(0, 1)], [(1, 2)], [(2, 3)]]

# combineByKey

In [38]:
x=sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])

In [39]:
def createCombiner(el):
    """Start a key's accumulator as a one-item list holding (value, value squared)."""
    return [(el, el ** 2)]

In [40]:
def mergeVal(aggregated, el):
    """Return a new list: the accumulator followed by (value, value squared)."""
    squared_pair = (el, el ** 2)
    return aggregated + [squared_pair]

In [41]:
def mergeComb(agg1, agg2):
    """Concatenate two partial accumulators into a new list (agg1 first, then agg2)."""
    return agg1 + agg2

In [42]:
# Build, for each key, one list of (value, value**2) pairs from the three callbacks above.
y = x.combineByKey(createCombiner,mergeVal,mergeComb)

In [43]:
x.collect()

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]

In [44]:
y.collect()

[('B', [(1, 1), (2, 4)]), ('A', [(3, 9), (4, 16), (5, 25)])]

# aggregateByKey

In [45]:
x=sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])

In [46]:
# Empty list: the starting accumulator handed to aggregateByKey below.
zeroValue=[]

In [47]:
def mergeVal(aggregated, el):
    """Extend the accumulator with (value, value squared), returning a new list."""
    return aggregated + [(el, el ** 2)]

In [48]:
def mergeComb(agg1, agg2):
    """Join two partial lists into one new list, keeping agg1's items first."""
    combined = agg1 + agg2
    return combined

In [49]:
# Fold each key's values into zeroValue with mergeVal; mergeComb joins partial lists.
y=x.aggregateByKey(zeroValue,mergeVal,mergeComb)

In [50]:
x.collect()

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]

In [51]:
y.collect()

[('B', [(1, 1), (2, 4)]), ('A', [(3, 9), (4, 16), (5, 25)])]

# foldByKey

In [52]:
x=sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])

In [53]:
# 1 is the neutral element for the multiplication used in the fold below.
zeroValue=1

In [54]:
# Multiply each key's values together, starting from zeroValue.
y = x.foldByKey(zeroValue, lambda product, value: product * value)

In [55]:
x.collect()

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]

In [56]:
y.collect()

[('B', 2), ('A', 60)]

# groupByKey

In [57]:
x=sc.parallelize([('B',5),('B',4),('A',3),('A',2),('A',1)])

In [58]:
# Gather all values of each key; each group comes back as a ResultIterable
# (see the raw collect() output below), not as a list.
y=x.groupByKey()


In [59]:
x.collect()

[('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)]

In [60]:
y.collect()

[('B', <pyspark.resultiterable.ResultIterable at 0x7f1ec4083630>),
 ('A', <pyspark.resultiterable.ResultIterable at 0x7f1ec4083470>)]

In [61]:
# Turn each ResultIterable into a list so the grouped values are readable.
print([(key, list(values)) for key, values in y.collect()])

[('B', [5, 4]), ('A', [3, 2, 1])]


# flatMapValues

In [62]:
x=sc.parallelize([('A',(1,2,3)), ('B',(4,5))])

In [64]:
# Square every element of each value tuple and emit one (key, square) pair per element.
y = x.flatMapValues(lambda values: [v ** 2 for v in values])

In [65]:
x.collect()

[('A', (1, 2, 3)), ('B', (4, 5))]

In [66]:
y.collect()

[('A', 1), ('A', 4), ('A', 9), ('B', 16), ('B', 25)]

# mapValues

In [67]:
x=sc.parallelize([('A',(1,2,3)), ('B', (4,5))])

In [68]:
# Replace each value tuple with the list of its squares; every key keeps a single pair.
y = x.mapValues(lambda values: [v ** 2 for v in values])

In [69]:
x.collect()

[('A', (1, 2, 3)), ('B', (4, 5))]

In [71]:
y.collect()

[('A', [1, 4, 9]), ('B', [16, 25])]

# groupWith

In [72]:
x=sc.parallelize([('C',4),('B',(3,3)), ('A',2), ('A',(1,1))])

In [73]:
y=sc.parallelize([('B',(7,7)),('A',6),('D',(5,5))])

In [74]:
z=sc.parallelize([('D',9), ('B',(8,8))])

In [75]:
# Group x, y and z by key in one pass: each key maps to one iterable per input RDD,
# in the order (x, y, z); an input that lacks the key contributes an empty group.
a=x.groupWith(y,z)

In [76]:
x.collect()

[('C', 4), ('B', (3, 3)), ('A', 2), ('A', (1, 1))]

In [77]:
y.collect()

[('B', (7, 7)), ('A', 6), ('D', (5, 5))]

In [78]:
z.collect()

[('D', 9), ('B', (8, 8))]

In [79]:
# Show, per key, the values grouped from x, y and z (in that order).
print("Result:")
for key, groups in a.collect():
    print(key, [list(group) for group in groups])

Result:
C [[4], [], []]
B [[(3, 3)], [(7, 7)], [(8, 8)]]
D [[], [(5, 5)], [9]]
A [[2, (1, 1)], [6], []]


# cogroup

In [80]:
x=sc.parallelize([('C',4),('B',(3,3),('A',2),('A',(1,1)))])

In [81]:
y=sc.parallelize([('A',8),('B',7),('A',6),('D',(5,5))])

In [82]:
z=x.cogroup(y)

In [83]:
x.collect()

[('C', 4), ('B', (3, 3), ('A', 2), ('A', (1, 1)))]

In [84]:
z.collect()

[('C',
  (<pyspark.resultiterable.ResultIterable at 0x7f1ec408acc0>,
   <pyspark.resultiterable.ResultIterable at 0x7f1ec408aac8>)),
 ('B',
  (<pyspark.resultiterable.ResultIterable at 0x7f1ec408a898>,
   <pyspark.resultiterable.ResultIterable at 0x7f1ec408a048>)),
 ('A',
  (<pyspark.resultiterable.ResultIterable at 0x7f1ec408af60>,
   <pyspark.resultiterable.ResultIterable at 0x7f1ec408ab70>)),
 ('D',
  (<pyspark.resultiterable.ResultIterable at 0x7f1ec408a828>,
   <pyspark.resultiterable.ResultIterable at 0x7f1ec408a3c8>))]

In [86]:
for key, val in list(z.collect()):
    print(key, [list(i) for i in val])

C [[4], []]
B [[(3, 3)], [7]]
A [[], [8, 6]]
D [[], [(5, 5)]]


# sampleByKey

In [88]:
x=sc.parallelize([('A',1),('B',2),('C',3),('B',4),('A',5)])

In [89]:
# Sample each key at its own rate; with fraction 1 every 'B' pair is kept.
# A fixed seed makes the draw repeatable across kernel restarts.
y=x.sampleByKey(withReplacement=False, fractions={'A':0.5, 'B':1, 'C':0.2}, seed=42)

In [90]:
x.collect()

[('A', 1), ('B', 2), ('C', 3), ('B', 4), ('A', 5)]

In [91]:
y.collect()

[('B', 2), ('C', 3), ('B', 4), ('A', 5)]

# subtractByKey

In [92]:
x=sc.parallelize([('C',1),('B',2),('A',3),('A',4)])

In [93]:
y=sc.parallelize([('A',5),('D',6),('A',7),('D',8)])

In [94]:
# Drop every pair of x whose key also appears in y, whatever the values are.
z=x.subtractByKey(y)

In [95]:
x.collect()

[('C', 1), ('B', 2), ('A', 3), ('A', 4)]

In [96]:
y.collect()

[('A', 5), ('D', 6), ('A', 7), ('D', 8)]

In [97]:
z.collect()

[('C', 1), ('B', 2)]

# subtract

In [98]:
x=sc.parallelize([('C',4),('B',3),('A',2),('A',1)])

In [99]:
y=sc.parallelize([('C',8),('A',2),('D',1)])

In [100]:
# Drop the elements of x that also occur in y as whole (key, value) pairs:
# ('A', 2) is removed, while ('C', 4) stays because y only holds ('C', 8).
z=x.subtract(y)

In [101]:
x.collect()

[('C', 4), ('B', 3), ('A', 2), ('A', 1)]

In [102]:
y.collect()

[('C', 8), ('A', 2), ('D', 1)]

In [103]:
z.collect()

[('C', 4), ('B', 3), ('A', 1)]

# keyBy

In [104]:
x=sc.parallelize([1,3,4])

In [105]:
# Key each element by its square, giving (square, element) pairs.
y = x.keyBy(lambda n: n ** 2)

In [106]:
x.collect()

[1, 3, 4]

In [107]:
y.collect()

[(1, 1), (9, 3), (16, 4)]

# repartition

In [109]:
x=sc.parallelize([1,2,3,4,5],2)

In [110]:
# Redistribute the five elements from 2 partitions into 3; in the recorded run
# one of the new partitions stayed empty.
y=x.repartition(numPartitions=3)

In [111]:
x.glom().collect()

[[1, 2], [3, 4, 5]]

In [112]:
y.glom().collect()

[[], [1, 2], [3, 4, 5]]

# coalesce

In [113]:
x=sc.parallelize([1,2,3,4,5],2)

In [114]:
# Merge the two partitions of x into a single partition.
y=x.coalesce(numPartitions=1)

In [115]:
x.glom().collect()

[[1, 2], [3, 4, 5]]

In [116]:
y.glom().collect()

[[1, 2, 3, 4, 5]]

# zip

In [117]:
x=sc.parallelize(['B','A','A'])

In [118]:
# Map each letter to its code point.
y = x.map(ord)

In [119]:
# Pair the elements of x and y position by position.
z=x.zip(y)

In [120]:
x.collect()

['B', 'A', 'A']

In [121]:
y.collect()

[66, 65, 65]

In [122]:
z.collect()

[('B', 66), ('A', 65), ('A', 65)]

# zipWithIndex

In [123]:
x=sc.parallelize(['B','A','A'],2)

In [124]:
# Attach a 0-based running index to each element; the numbering continues across partitions.
y=x.zipWithIndex()

In [125]:
print(x.glom().collect())

[['B'], ['A', 'A']]


In [126]:
print(y.glom().collect())

[[('B', 0)], [('A', 1), ('A', 2)]]


# zipWithUniqueId

In [127]:
x=sc.parallelize(['B','A','A'],2)

In [128]:
# Attach an id that is unique per element but not necessarily consecutive
# (the recorded run yields 0, 1, 3).
y=x.zipWithUniqueId()

In [129]:
x.glom().collect()

[['B'], ['A', 'A']]

In [130]:
y.collect()

[('B', 0), ('A', 1), ('A', 3)]