In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Build (or reuse) a local SparkSession. getOrCreate() keeps this cell safe to
# re-run: constructing SparkContext('local') a second time in the same kernel
# fails with "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext  # the session's underlying SparkContext, kept under its original name

24/06/07 12:50:08 WARN Utils: Your hostname, Arbnors-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.11.6.58 instead (on interface en0)
24/06/07 12:50:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/07 12:50:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
spark.range(10).rdd.collect()

                                                                                

[Row(id=0),
 Row(id=1),
 Row(id=2),
 Row(id=3),
 Row(id=4),
 Row(id=5),
 Row(id=6),
 Row(id=7),
 Row(id=8),
 Row(id=9)]

## One of the easiest ways to get an RDD is from an existing DataFrame or Dataset. Converting one to an RDD is simple: just access the `rdd` attribute on any of these data types.

In [3]:
(
    spark.range(10)
    .toDF("id")
    .rdd
    .map(lambda row: row[0])  # unwrap each Row to its first field
    .collect()
)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [4]:
spark.range(10).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



To create an RDD from a collection, use the `parallelize` method on a SparkContext (reachable from a SparkSession as `spark.sparkContext`).

This turns a single-node (local) collection into a parallel collection distributed across partitions.

When creating this parallel collection, you can also explicitly state the number of partitions into which you would like to distribute this array. In this case, we are creating two partitions:

In [36]:
# Split the sentence on single spaces to get a plain Python list of words.
# NOTE(review): the recorded outputs further down show 'Nol,' (with a comma), so they
# were produced from an earlier version of this string; re-run the notebook to refresh them.
myCollection = "My name is Nol James Bond".split(" ")

In [37]:
# Distribute the local list across two partitions.
words = spark.sparkContext.parallelize(myCollection, numSlices=2)

In [38]:
words.collect()

['My', 'name', 'is', 'Nol,', 'James', 'Bond']

In [39]:
def startsWithB(oneWord):
    """Return True when ``oneWord`` begins with an uppercase "B", else False."""
    prefix = "B"
    return oneWord.startswith(prefix)

In [40]:
# The predicate already takes a single word, so it can be passed to filter directly.
words.filter(startsWithB).collect()

['Bond']

In [41]:
# Build a (word, first letter, starts-with-'B' flag) tuple for every word.
words2 = words.map(
    lambda w: (w, w[0], w.startswith('B'))
)

In [42]:
words2.collect()

[('My', 'M', False),
 ('name', 'n', False),
 ('is', 'i', False),
 ('Nol,', 'N', False),
 ('James', 'J', False),
 ('Bond', 'B', True)]

In [43]:
# row[2] is the third tuple field (the starts-with-'B' flag built in words2);
# keep only rows where it is truthy and fetch at most five of them.
words2.filter(lambda row: row[2]).take(5)

[('Bond', 'B', True)]

In [44]:
# list(word) yields the word's characters; flatMap flattens them into one RDD of characters.
words.flatMap(list).take(10)

['M', 'y', 'n', 'a', 'm', 'e', 'i', 's', 'N', 'o']

In [46]:
words.sortBy(lambda row: len(row) * -1).take(5) # Negating the length makes the default ascending sort put the longest words first (i.e. descending by length)

['James', 'name', 'Nol,', 'Bond', 'My']

In [47]:
# Sum the integers 1..20 by folding pairs of values together.
(
    spark.sparkContext
    .parallelize(range(1, 21))
    .reduce(lambda total, value: total + value)
)

210

In [48]:
spark.sparkContext.parallelize(range(1,21)).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [50]:
def longestWord(leftWord, rightWord):
    """Return whichever of the two words is longer.

    When both words have the same length, ``rightWord`` is returned.
    """
    return leftWord if len(leftWord) > len(rightWord) else rightWord

In [51]:
words.reduce(longestWord)

'James'

In [52]:
words.count()

6

In [53]:
confidence = 0.9  # confidence level passed to countApprox below
timeOutMiliSec = 400  # timeout, in milliseconds, passed to countApprox below

In [54]:
# Approximate count: returns whatever estimate is available once the timeout elapses.
words.countApprox(timeout=timeOutMiliSec, confidence=confidence)

6

In [61]:
# glom() gathers each partition's elements into a list, showing how the
# five strings were spread over the three partitions.
(
    spark.sparkContext
    .parallelize(["Hello", "How", "are", "you", "today"], numSlices=3)
    .glom()
    .collect()
)

[['Hello'], ['How', 'are'], ['you', 'today']]

In [63]:
# Pair each lower-cased word with the length of the original word.
words.map(
    lambda w: (w.lower(), len(w))
).collect()

[('my', 2), ('name', 4), ('is', 2), ('nol,', 4), ('james', 5), ('bond', 4)]

In [68]:
# Key every word by the first two characters of its lower-cased form.
keyword = words.keyBy(lambda w: w.lower()[:2])

In [69]:
keyword.collect()

[('my', 'My'),
 ('na', 'name'),
 ('is', 'is'),
 ('no', 'Nol,'),
 ('ja', 'James'),
 ('bo', 'Bond')]