# RDD - Actions

## Setup Spark environment

In [1]:
from pathlib import Path

installation_folder = Path("/content/spark-3.5.0-bin-hadoop3")

if not installation_folder.exists():

  # Install Java locally
  !apt-get install openjdk-8-jdk-headless -qq > /dev/null

  # Download & decompress Spark
  !wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz -O spark-3.5.0-bin-hadoop3.tgz
  !tar xf spark-3.5.0-bin-hadoop3.tgz

  # Install finspark
  !pip install -q findspark

  # Setup required environment variables
  import os
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

  print("Spark setup finished!")

else:
  print("Skipping Spark setup")

Spark setup finished!


## Prepare the Spark context

In [30]:
# Import findpsark
import findspark

# Configure the environment
findspark.init()

# Import the Spark components required for the context creation
from pyspark import SparkConf, SparkContext

# Configure and create the context
conf = SparkConf()
conf = conf.setAppName('mds-session')
conf = conf.setMaster('local[32]')
sc = SparkContext.getOrCreate(conf=conf)

## reduce

In [5]:
rdd1 = sc.parallelize([('a', 1), ('b', 2), ('c', 3)])
rdd1.reduce(lambda el1, el2: len(el2))

TypeError: ignored

## count, max, min, sum, mean, stdev, variance

In [6]:
# Actions: count, max, min, sum, mean, stdev, variance
rdd1 = sc.parallelize(list(range(100)))

In [7]:
rdd1.count()

100

In [8]:
rdd1.max()

99

In [9]:
rdd1.min()

0

In [10]:
rdd1.sum()

4950

In [11]:
rdd1.mean()

49.5

In [12]:
rdd1.stdev()

28.86607004772212

In [13]:
rdd1.variance()

833.25

## collect, take, first

In [14]:
rdd1 = sc.parallelize([('a', 1), ('c', 10), ('b', 2)])

In [17]:
type(rdd1)

pyspark.rdd.RDD

In [21]:
rdd1.take(1)

[('a', 1)]

In [20]:
rdd1.first()

('a', 1)

## saveAsTextFile

In [31]:
# Actions: saveAsTextFile
rdd1 = sc.parallelize(range(100))
rdd1.saveAsTextFile('/content/output1')

## Close the Spark context

In [29]:
sc.stop()