In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext

In [3]:
pyspark = SparkSession.builder \
.master('local[4]') \
.appName('BasicTransformations') \
.config('spark.executor.memory', '4g') \
.config('spark.driver.memory', '2g') \
.getOrCreate()

In [4]:
sc = pyspark.sparkContext

## 1) Some Queries

In [5]:
ages = [('Amanda', 35), ('Jessie', 20), ('Jack', 25)]

In [6]:
ages_rdd = sc.parallelize(ages)
ages_rdd.take(3)

[('Amanda', 35), ('Jessie', 20), ('Jack', 25)]

In [7]:
# younger than 30
ages_rdd.filter(lambda key_value: key_value[1] < 30).take(3)

[('Jessie', 20), ('Jack', 25)]

In [8]:
# named Jack
ages_rdd.filter(lambda key_value: key_value[0] == 'Jack').take(3)

[('Jack', 25)]

## 2) Some Operations

In [14]:
rdd = sc.parallelize([(3,4),(1,2),(3,6)])

In [15]:
rdd.reduceByKey(lambda x,y: x+y).take(3)

[(1, 2), (3, 10)]

In [16]:
rdd.groupByKey().take(3)

[(1, <pyspark.resultiterable.ResultIterable at 0x1b52022beb8>),
 (3, <pyspark.resultiterable.ResultIterable at 0x1b52022bdd8>)]

In [17]:
rdd.mapValues(lambda x: x*100).collect()

[(3, 400), (1, 200), (3, 600)]

In [18]:
rdd.values().collect()

[4, 2, 6]

In [19]:
rdd.sortByKey().collect()

[(1, 2), (3, 4), (3, 6)]

In [21]:
rdd2 = sc.parallelize([(3,9)])
rdd.subtractByKey(rdd2).collect()

[(1, 2)]

In [22]:
rdd.join(rdd2).collect()

[(3, (4, 9)), (3, (6, 9))]

In [23]:
rdd.rightOuterJoin(rdd2).collect()

[(3, (4, 9)), (3, (6, 9))]

In [24]:
rdd.leftOuterJoin(rdd2).collect()

[(1, (2, None)), (3, (4, 9)), (3, (6, 9))]

In [25]:
rdd.cogroup(rdd2).collect()

[(1,
  (<pyspark.resultiterable.ResultIterable at 0x1b51ea84f60>,
   <pyspark.resultiterable.ResultIterable at 0x1b5201da550>)),
 (3,
  (<pyspark.resultiterable.ResultIterable at 0x1b5201da3c8>,
   <pyspark.resultiterable.ResultIterable at 0x1b5201da358>))]