# RDD - Transformations

## Setup Spark environment

In [1]:
from pathlib import Path

installation_folder = Path("/content/spark-3.5.0-bin-hadoop3")

if not installation_folder.exists():

  # Install Java locally
  !apt-get install openjdk-8-jdk-headless -qq > /dev/null

  # Download & decompress Spark
  !wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz -O spark-3.5.0-bin-hadoop3.tgz
  !tar xf spark-3.5.0-bin-hadoop3.tgz

  # Install finspark
  !pip install -q findspark

  # Setup required environment variables
  import os
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

  print("Spark setup finished!")

else:
  print("Skipping Spark setup")

Spark setup finished!


## Prepare the Spark context

In [2]:
# Import findpsark
import findspark

# Configure the environment
findspark.init()

# Import the Spark components required for the context creation
from pyspark import SparkConf, SparkContext

# Configure and create the context
conf = SparkConf()
conf = conf.setAppName('mds-session')
conf = conf.setMaster('local[*]')
sc = SparkContext.getOrCreate(conf=conf)

## map

In [3]:
rdd1 = sc.parallelize(['a', 'b', 'c', 'a', 'c'])
rdd2 = rdd1.map(lambda element: (element, 1))
print(rdd1.collect())
print(rdd2.collect())

['a', 'b', 'c', 'a', 'c']
[('a', 1), ('b', 1), ('c', 1), ('a', 1), ('c', 1)]


In [4]:
def createTuple(element):
    return (element, 1)

rdd1 = sc.parallelize(['a', 'b', 'c', 'a', 'c'])
rdd2 = rdd1.map(createTuple)
print(rdd1.collect())
print(rdd2.collect())

['a', 'b', 'c', 'a', 'c']
[('a', 1), ('b', 1), ('c', 1), ('a', 1), ('c', 1)]


## flatMap

In [5]:
def splitLine(line):
    return line.split()

rdd1 = sc.parallelize(['en un lugar', 'de la mancha', 'de cuyo nombre', 'no quiero acordarme'])
rdd2 = rdd1.map(splitLine)
print(rdd1.collect())
print(rdd2.collect())

['en un lugar', 'de la mancha', 'de cuyo nombre', 'no quiero acordarme']
[['en', 'un', 'lugar'], ['de', 'la', 'mancha'], ['de', 'cuyo', 'nombre'], ['no', 'quiero', 'acordarme']]


In [6]:
rdd1 = sc.parallelize(['en un lugar', 'de la mancha', 'de cuyo nombre', 'no quiero acordarme'])
rdd2 = rdd1.flatMap(splitLine)
print(rdd1.collect())
print(rdd2.collect())

['en un lugar', 'de la mancha', 'de cuyo nombre', 'no quiero acordarme']
['en', 'un', 'lugar', 'de', 'la', 'mancha', 'de', 'cuyo', 'nombre', 'no', 'quiero', 'acordarme']


In [8]:
def splitLineAndRemove(line):
    if line != 'de la mancha':
        return line.split()
    else:
        return []

In [9]:
rdd1 = sc.parallelize(['en un lugar', 'de la mancha', 'de cuyo nombre', 'no quiero acordarme'])
rdd2 = rdd1.map(splitLineAndRemove)
print(rdd1.collect())
print(rdd2.collect())

['en un lugar', 'de la mancha', 'de cuyo nombre', 'no quiero acordarme']
[['en', 'un', 'lugar'], [], ['de', 'cuyo', 'nombre'], ['no', 'quiero', 'acordarme']]


In [10]:
rdd1 = sc.parallelize(['en un lugar', 'de la mancha', 'de cuyo nombre', 'no quiero acordarme'])
rdd2 = rdd1.flatMap(splitLineAndRemove)
print(rdd1.collect())
print(rdd2.collect())

['en un lugar', 'de la mancha', 'de cuyo nombre', 'no quiero acordarme']
['en', 'un', 'lugar', 'de', 'cuyo', 'nombre', 'no', 'quiero', 'acordarme']


## filter

In [12]:
rdd1 = sc.parallelize(list(range(10)))
rdd2 = rdd1.map(lambda element: element % 2 == 0)
print(rdd1.collect())
print(rdd2.collect())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[True, False, True, False, True, False, True, False, True, False]


## groupBy

In [14]:
rdd1 = sc.parallelize(['2014-12-31', '2015-01-25', '2016-05-17', '2016-11-08', '2017-01-05', '2014-08-06'])
rdd2 = rdd1.groupBy(lambda element: element.split('-')[0])
print(rdd1.collect())
print(rdd2.collect())
print([(key, list(value)) for (key, value) in rdd2.collect()])

['2014-12-31', '2015-01-25', '2016-05-17', '2016-11-08', '2017-01-05', '2014-08-06']
[('2014', <pyspark.resultiterable.ResultIterable object at 0x7852dcc37c10>), ('2015', <pyspark.resultiterable.ResultIterable object at 0x7852dcc34220>), ('2016', <pyspark.resultiterable.ResultIterable object at 0x7852dcc37d00>), ('2017', <pyspark.resultiterable.ResultIterable object at 0x7852dcc351b0>)]
[('2014', ['2014-12-31', '2014-08-06']), ('2015', ['2015-01-25']), ('2016', ['2016-05-17', '2016-11-08']), ('2017', ['2017-01-05'])]


## sortBy

In [15]:
rdd1 = sc.parallelize([('a', 1), ('c', 10), ('b', 2)])
rdd2 = rdd1.sortBy(lambda element: element[0])
rdd3 = rdd1.sortBy(lambda element: element[1], ascending = False)
print(rdd1.collect())
print(rdd2.collect())
print(rdd3.collect())

[('a', 1), ('c', 10), ('b', 2)]
[('a', 1), ('b', 2), ('c', 10)]
[('c', 10), ('b', 2), ('a', 1)]


## union

In [16]:
rdd1 = sc.parallelize(['daniel', 'jose', 'miguel', 'juan'])
rdd2 = sc.parallelize(['maria', 'rocio', 'cristina', 'lucia', 'juan'])
rdd3 = rdd1.union(rdd2)
print(rdd3.collect())

['daniel', 'jose', 'miguel', 'juan', 'maria', 'rocio', 'cristina', 'lucia', 'juan']


## intersection

In [24]:
rdd1 = sc.parallelize(['daniel', 'jose', 'miguel', 'maria', 'lucia'])
rdd2 = sc.parallelize(['maria', 'rocio', 'cristina', 'lucia'])
rdd3 = rdd1.intersection(rdd2)
print(rdd3.collect())

['maria', 'lucia']


## cartesian

In [18]:
rdd1 = sc.parallelize(['daniel', 'jose', 'miguel', 'maria', 'lucia'])
rdd2 = sc.parallelize(['maria', 'rocio', 'cristina', 'lucia'])
rdd3 = rdd1.cartesian(rdd2)
print(rdd3.collect())

[('daniel', 'maria'), ('daniel', 'rocio'), ('jose', 'maria'), ('jose', 'rocio'), ('daniel', 'cristina'), ('daniel', 'lucia'), ('jose', 'cristina'), ('jose', 'lucia'), ('miguel', 'maria'), ('miguel', 'rocio'), ('maria', 'maria'), ('maria', 'rocio'), ('lucia', 'maria'), ('lucia', 'rocio'), ('miguel', 'cristina'), ('miguel', 'lucia'), ('maria', 'cristina'), ('maria', 'lucia'), ('lucia', 'cristina'), ('lucia', 'lucia')]


## distinct

In [19]:
rdd1 = sc.parallelize(['madrid', 'barcelona', 'madrid', 'valencia', 'sevilla', 'valencia'])
rdd2 = rdd1.distinct()
print(rdd1.collect())
print(rdd2.collect())

['madrid', 'barcelona', 'madrid', 'valencia', 'sevilla', 'valencia']
['madrid', 'barcelona', 'sevilla', 'valencia']


## sample

In [21]:
rdd1 = sc.parallelize(list(range(100)))
rdd2 = rdd1.sample(False, 0.1)
print(rdd2.collect())

[2, 13, 14, 24, 26, 27, 40, 49, 52, 55, 67, 75, 83, 96, 98]


## Stop the Spark context

In [None]:
sc.stop()