In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('c13').getOrCreate()

In [3]:
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")

In [4]:
words = spark.sparkContext.parallelize(myCollection,2)

In [5]:
# key-value basics (Key-Value RDDs)

In [7]:
words.map(lambda word: (word.lower(), 1)).collect()

[('spark', 1),
 ('the', 1),
 ('definitive', 1),
 ('guide', 1),
 (':', 1),
 ('big', 1),
 ('data', 1),
 ('processing', 1),
 ('made', 1),
 ('simple', 1)]

In [8]:
# keyBy

In [9]:
keyword = words.keyBy(lambda word: word.lower()[0])

In [10]:
keyword.collect()

[('s', 'Spark'),
 ('t', 'The'),
 ('d', 'Definitive'),
 ('g', 'Guide'),
 (':', ':'),
 ('b', 'Big'),
 ('d', 'Data'),
 ('p', 'Processing'),
 ('m', 'Made'),
 ('s', 'Simple')]

In [11]:
#mapping over values

In [12]:
keyword.mapValues(lambda word: word.upper()).collect()

[('s', 'SPARK'),
 ('t', 'THE'),
 ('d', 'DEFINITIVE'),
 ('g', 'GUIDE'),
 (':', ':'),
 ('b', 'BIG'),
 ('d', 'DATA'),
 ('p', 'PROCESSING'),
 ('m', 'MADE'),
 ('s', 'SIMPLE')]

In [13]:
# flatmap

In [14]:
keyword.flatMapValues(lambda word: word.upper()).collect()

[('s', 'S'),
 ('s', 'P'),
 ('s', 'A'),
 ('s', 'R'),
 ('s', 'K'),
 ('t', 'T'),
 ('t', 'H'),
 ('t', 'E'),
 ('d', 'D'),
 ('d', 'E'),
 ('d', 'F'),
 ('d', 'I'),
 ('d', 'N'),
 ('d', 'I'),
 ('d', 'T'),
 ('d', 'I'),
 ('d', 'V'),
 ('d', 'E'),
 ('g', 'G'),
 ('g', 'U'),
 ('g', 'I'),
 ('g', 'D'),
 ('g', 'E'),
 (':', ':'),
 ('b', 'B'),
 ('b', 'I'),
 ('b', 'G'),
 ('d', 'D'),
 ('d', 'A'),
 ('d', 'T'),
 ('d', 'A'),
 ('p', 'P'),
 ('p', 'R'),
 ('p', 'O'),
 ('p', 'C'),
 ('p', 'E'),
 ('p', 'S'),
 ('p', 'S'),
 ('p', 'I'),
 ('p', 'N'),
 ('p', 'G'),
 ('m', 'M'),
 ('m', 'A'),
 ('m', 'D'),
 ('m', 'E'),
 ('s', 'S'),
 ('s', 'I'),
 ('s', 'M'),
 ('s', 'P'),
 ('s', 'L'),
 ('s', 'E')]

In [15]:
# extracting keys and values

In [17]:
keyword.keys().collect()

['s', 't', 'd', 'g', ':', 'b', 'd', 'p', 'm', 's']

In [18]:
keyword.values().collect()

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [19]:
# lookup

In [20]:
keyword.lookup("s")

['Spark', 'Simple']

In [21]:
# sampleByKey

In [22]:
import random

In [23]:
distinctChars = words.flatMap(lambda word: list(word.lower())).distinct().collect()

In [25]:
sampleMap = dict(map(lambda c: (c, random.random()), distinctChars))

In [26]:
words.map(lambda word: (word.lower()[0], word)).sampleByKey(True, sampleMap, 6).collect()

[('t', 'The'), ('t', 'The'), ('m', 'Made')]

In [27]:
#Aggregations

In [28]:
chars = words.flatMap(lambda word: word.lower())

In [29]:
KVcharacters = chars.map(lambda letter: (letter, 1))

In [30]:
def maxFunc(left,right):
    return max(left,right)

In [31]:
def addFunc(left,right):
    return left + right

In [32]:
nums = spark.sparkContext.parallelize(range(1,31),5)

In [33]:
#countByKey

In [34]:
KVcharacters.countByKey()

defaultdict(int,
            {':': 1,
             'a': 4,
             'b': 1,
             'c': 1,
             'd': 4,
             'e': 7,
             'f': 1,
             'g': 3,
             'h': 1,
             'i': 7,
             'k': 1,
             'l': 1,
             'm': 2,
             'n': 2,
             'o': 1,
             'p': 3,
             'r': 2,
             's': 4,
             't': 3,
             'u': 1,
             'v': 1})

In [35]:
# understanding aggregation implementation

In [37]:
#groupByKey # problematic in out of memory

In [36]:
KVcharacters.groupByKey().map(lambda row: (row[0], reduce(addFunc,row[1])))

PythonRDD[21] at RDD at PythonRDD.scala:48

In [38]:
# reduceByKey

In [39]:
KVcharacters.reduceByKey(addFunc).collect()

[('g', 3),
 ('s', 4),
 ('l', 1),
 ('b', 1),
 ('h', 1),
 ('i', 7),
 ('c', 1),
 ('p', 3),
 ('d', 4),
 ('r', 2),
 ('o', 1),
 ('u', 1),
 ('k', 1),
 ('m', 2),
 ('f', 1),
 ('v', 1),
 ('a', 4),
 ('t', 3),
 ('n', 2),
 ('e', 7),
 (':', 1)]

In [42]:
# aggregate # maybe cause out of memory

In [41]:
nums.aggregate(0,maxFunc,addFunc)

90

In [43]:
# treeAggregate

In [44]:
depth = 3

In [45]:
nums.treeAggregate(0,maxFunc,addFunc,depth)

90

In [46]:
#aggregateByKey

In [48]:
KVcharacters.aggregateByKey(0,addFunc,maxFunc).collect()

[('g', 2),
 ('s', 3),
 ('l', 1),
 ('b', 1),
 ('h', 1),
 ('i', 4),
 ('c', 1),
 ('p', 2),
 ('d', 2),
 ('r', 1),
 ('o', 1),
 ('u', 1),
 ('k', 1),
 ('m', 2),
 ('f', 1),
 ('v', 1),
 ('a', 3),
 ('t', 2),
 ('n', 1),
 ('e', 4),
 (':', 1)]

In [49]:
#combineByKey

In [50]:
def valToCombiner(value):
    return [value]

In [51]:
def mergeValuesFunc(vals,valToAppend):
    vals.append(valToAppend)
    return vals

In [52]:
def mergeCombinerFunc(vals1, vals2):
    return vals1 + vals2

In [53]:
outputPartition = 6

In [54]:
KVcharacters.combineByKey(valToCombiner,mergeValuesFunc,mergeCombinerFunc,outputPartition).collect()

[('s', [1, 1, 1, 1]),
 ('l', [1]),
 ('d', [1, 1, 1, 1]),
 (':', [1]),
 ('v', [1]),
 ('r', [1, 1]),
 ('c', [1]),
 ('p', [1, 1, 1]),
 ('t', [1, 1, 1]),
 ('k', [1]),
 ('n', [1, 1]),
 ('o', [1]),
 ('u', [1]),
 ('g', [1, 1, 1]),
 ('i', [1, 1, 1, 1, 1, 1, 1]),
 ('b', [1]),
 ('h', [1]),
 ('a', [1, 1, 1, 1]),
 ('e', [1, 1, 1, 1, 1, 1, 1]),
 ('m', [1, 1]),
 ('f', [1])]

In [55]:
#foldByKey

In [56]:
KVcharacters.foldByKey(0,addFunc).collect()

[('g', 3),
 ('s', 4),
 ('l', 1),
 ('b', 1),
 ('h', 1),
 ('i', 7),
 ('c', 1),
 ('p', 3),
 ('d', 4),
 ('r', 2),
 ('o', 1),
 ('u', 1),
 ('k', 1),
 ('m', 2),
 ('f', 1),
 ('v', 1),
 ('a', 4),
 ('t', 3),
 ('n', 2),
 ('e', 7),
 (':', 1)]

In [57]:
# cogroups

In [58]:
import random

In [59]:
distinctChars = words.flatMap(lambda word: word.lower()).distinct()

In [60]:
charRDD = distinctChars.map(lambda c: (c,random.random()))

In [61]:
charRDD2 = distinctChars.map(lambda c: (c,random.random()))

In [62]:
charRDD.cogroup(charRDD2).take(5)

[('g',
  (<pyspark.resultiterable.ResultIterable at 0x7fb0da7707f0>,
   <pyspark.resultiterable.ResultIterable at 0x7fb0da770ba8>)),
 ('s',
  (<pyspark.resultiterable.ResultIterable at 0x7fb0da770978>,
   <pyspark.resultiterable.ResultIterable at 0x7fb0da609400>)),
 ('b',
  (<pyspark.resultiterable.ResultIterable at 0x7fb0da609c88>,
   <pyspark.resultiterable.ResultIterable at 0x7fb0da5e3898>)),
 ('l',
  (<pyspark.resultiterable.ResultIterable at 0x7fb0da770d68>,
   <pyspark.resultiterable.ResultIterable at 0x7fb0da5e3e48>)),
 ('c',
  (<pyspark.resultiterable.ResultIterable at 0x7fb0da5e36a0>,
   <pyspark.resultiterable.ResultIterable at 0x7fb0da5e32b0>))]

In [63]:
#joins

In [64]:
# inner joins

In [65]:
keyedChars = distinctChars.map(lambda c: (c, random.random()))

In [66]:
outputPartitions = 10

In [67]:
KVcharacters.join(keyedChars).count()

51

In [68]:
KVcharacters.join(keyedChars,outputPartitions).count()

51

In [69]:
# zip

In [70]:
numRange = spark.sparkContext.parallelize(range(10), 2)

In [71]:
words.zip(numRange).collect()

[('Spark', 0),
 ('The', 1),
 ('Definitive', 2),
 ('Guide', 3),
 (':', 4),
 ('Big', 5),
 ('Data', 6),
 ('Processing', 7),
 ('Made', 8),
 ('Simple', 9)]

In [72]:
# controlling partitions

In [73]:
# coalesce

In [74]:
words.coalesce(1).getNumPartitions()

1

In [75]:
# repartition

In [76]:
words.repartition(10)

MapPartitionsRDD[90] at coalesce at NativeMethodAccessorImpl.java:0

In [78]:
# custom partitioning

In [79]:
df = spark.read.option("header","true").option("inferSchema","true")\
.csv('retail-data/all/')

In [80]:
rdd = df.coalesce(10).rdd

In [81]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [82]:
#key skew

In [84]:
def partitionFunc(key):
    import random
    if key == 17850 and key == 12583:
        return 0
    else:
        return random.randint(1,2)

In [85]:
keyedRDD = rdd.keyBy(lambda row: row[6])

In [86]:
keyedRDD\
.partitionBy(3,partitionFunc)\
.map(lambda x: x[0])\
.glom()\
.map(lambda x: len(set(x)))\
.take(5)

[0, 4313, 4297]