# RDD - Creation

## Setup Spark environment

In [1]:
from pathlib import Path

installation_folder = Path("/content/spark-3.5.0-bin-hadoop3")

if not installation_folder.exists():

  # Install Java locally
  !apt-get install openjdk-8-jdk-headless -qq > /dev/null

  # Download & decompress Spark
  !wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz -O spark-3.5.0-bin-hadoop3.tgz
  !tar xf spark-3.5.0-bin-hadoop3.tgz

  # Install finspark
  !pip install -q findspark

  # Setup required environment variables
  import os
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

  print("Spark setup finished!")

else:
  print("Skipping Spark setup")

Spark setup finished!


## Prepare the Spark Context

In [2]:
# Import findpsark
import findspark

# Configure the environment
findspark.init()

# Import the Spark components required for the context creation
from pyspark import SparkConf, SparkContext

# Configure and create the context
conf = SparkConf()
conf = conf.setAppName('mds-session')
conf = conf.setMaster('local[*]')
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Display the Spark context created in the previous cell
sc

## Parallelize collections

In [5]:
# Read a local file and create a collection of lines.
# The `with` block guarantees the file is closed even if reading fails.
with open('/content/spark-3.5.0-bin-hadoop3/README.md', encoding='utf-8') as file:
    # Iterating a text file yields its lines (line endings included)
    lineCollection = list(file)

# Distribute the collection of lines through the cluster
linesRDD = sc.parallelize(lineCollection)

pyspark.rdd.RDD

In [6]:

# Count the words in the file: map every line to its number of
# whitespace-separated tokens, then add the per-line counts together
(
    linesRDD
    .map(lambda text: len(text.split()))
    .sum()
)

521

## Load text files

In [7]:
# Load a local file directly as an RDD
# IMPORTANT: the file must be accessible by all the worker nodes
file = sc.textFile('/content/spark-3.5.0-bin-hadoop3/README.md')

# Count the words in the file: per-line word counts, summed over the RDD
(
    file
    .map(lambda text: len(text.split()))
    .sum()
)

521

## Close the Spark context

In [8]:
# Stop the Spark context now that the session is finished
sc.stop()