In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('c12').getOrCreate()

In [3]:
spark.range(10).rdd

MapPartitionsRDD[4] at javaToPython at NativeMethodAccessorImpl.java:0

In [4]:
spark.range(10).toDF("id").rdd.map(lambda row: row[0])

PythonRDD[10] at RDD at PythonRDD.scala:48

In [5]:
#rdd to dataframe or dataset

In [6]:
spark.range(10).rdd.toDF()

DataFrame[id: bigint]

In [7]:
# from a local collection

In [15]:
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")

In [16]:
words = spark.sparkContext.parallelize(myCollection, 2)

In [10]:
# set name for SparkUI

In [17]:
words.setName("myWords")

myWords ParallelCollectionRDD[27] at parallelize at PythonRDD.scala:475

In [18]:
words.name()

'myWords'

In [13]:
# distinct

In [19]:
words.distinct().count()

10

In [21]:
# filter

In [20]:
def startsWithS(individual):
    return individual.startswith("S")

In [22]:
words.filter(lambda word: startsWithS(word)).collect()

['Spark', 'Simple']

In [23]:
# map

In [24]:
words2 = words.map(lambda word: (word, word[0], word.startswith("S")))

In [25]:
words2.filter(lambda record: record[2]).take(5)

[('Spark', 'S', True), ('Simple', 'S', True)]

In [26]:
# flatmap

In [32]:
words.flatMap(lambda word: list(word)).take(5)

['S', 'p', 'a', 'r', 'k']

In [33]:
# sort

In [34]:
words.sortBy(lambda word: len(word) * -1).take(2)

['Definitive', 'Processing']

In [35]:
# random split

In [36]:
fiftyFiftySplit = words.randomSplit([0.5,0.5])

In [40]:
#reduce

In [41]:
spark.sparkContext.parallelize(range(1, 21)).reduce(lambda x,y: x + y)

210

In [42]:
def wordLengthReducer(leftWord, rightWord):
    if len(leftWord) > len(rightWord):
        return leftWord
    else:
        return rightWord

In [43]:
words.reduce(wordLengthReducer)

'Processing'

In [44]:
words.count()

10

In [45]:
#countApprox

In [46]:
confidence = 0.95

In [47]:
timeoutMilliseconds = 400

In [48]:
words.countApprox(confidence,timeoutMilliseconds)

Py4JError: An error occurred while calling o264.sumApprox. Trace:
py4j.Py4JException: Method sumApprox([class java.lang.Double, class java.lang.Integer]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:272)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)



In [49]:
# countApproxDistinct

In [50]:
words.countApproxDistinct(0.05)

10

In [51]:
words.countApproxDistinct(4,10)

TypeError: countApproxDistinct() takes from 1 to 2 positional arguments but 3 were given

In [52]:
# countByValue

In [53]:
words.countByValue()

defaultdict(int,
            {':': 1,
             'Big': 1,
             'Data': 1,
             'Definitive': 1,
             'Guide': 1,
             'Made': 1,
             'Processing': 1,
             'Simple': 1,
             'Spark': 1,
             'The': 1})

In [54]:
# countByValueApprox

In [55]:
# words.countByValueApprox(1000, 0.95)

In [56]:
# first

In [57]:
words.first()

'Spark'

In [58]:
# max and min

In [59]:
words.max()

'The'

In [60]:
words.min()

':'

In [61]:
# take

In [62]:
words.take(5)

['Spark', 'The', 'Definitive', 'Guide', ':']

In [63]:
words.takeOrdered(5)

[':', 'Big', 'Data', 'Definitive', 'Guide']

In [65]:
withReplacement = True

In [66]:
numberToTake = 6

In [67]:
randomSeed = 100

In [69]:
words.takeSample(withReplacement=withReplacement, num=numberToTake, seed=randomSeed)

['Data', 'Definitive', 'Data', 'The', 'Definitive', 'Spark']

In [70]:
#cache

In [71]:
words.cache()

myWords ParallelCollectionRDD[27] at parallelize at PythonRDD.scala:475

In [73]:
#storagelevel

In [72]:
words.getStorageLevel()

StorageLevel(False, True, False, False, 1)

In [74]:
# checkpointing

In [75]:
spark.sparkContext.setCheckpointDir('/home/mack/spark-2.1.0-bin-hadoop2.7/python')

In [76]:
words.checkpoint()

In [77]:
# pipe RDDs to System Commands

In [78]:
words.pipe("wc -l").collect()

['5', '5']

In [79]:
#mapPartitions

In [80]:
words.mapPartitions(lambda part: [1]).sum()

2

In [81]:
#mapPartitionsWithIndex

In [82]:
def indexedFunc(partitionIndex, withinPartIterator):
    return ["partition: {} => {}".format(partitionIndex,
                                        x) for x in withinPartIterator]

In [83]:
words.mapPartitionsWithIndex(indexedFunc).collect()

['partition: 0 => Spark',
 'partition: 0 => The',
 'partition: 0 => Definitive',
 'partition: 0 => Guide',
 'partition: 0 => :',
 'partition: 1 => Big',
 'partition: 1 => Data',
 'partition: 1 => Processing',
 'partition: 1 => Made',
 'partition: 1 => Simple']

In [84]:
# glom

In [85]:
spark.sparkContext.parallelize(["Hello","World"], 2).glom().collect()

[['Hello'], ['World']]