# Shared variables

## Setup Spark Environment

In [1]:
from pathlib import Path

installation_folder = Path("/content/spark-3.5.0-bin-hadoop3")

if not installation_folder.exists():

  # Install Java locally
  !apt-get install openjdk-8-jdk-headless -qq > /dev/null

  # Download & decompress Spark
  !wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz -O spark-3.5.0-bin-hadoop3.tgz
  !tar xf spark-3.5.0-bin-hadoop3.tgz

  # Install finspark
  !pip install -q findspark

  # Setup required environment variables
  import os
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

  print("Spark setup finished!")

else:
  print("Skipping Spark setup")

Spark setup finished!


## Prepare the Spark context

In [2]:
# Import findspark and let it configure the environment for pyspark
import findspark

findspark.init()

# Classes needed to build the Spark context
from pyspark import SparkConf, SparkContext

# Build the configuration in a single chained expression, then create the
# context (or obtain the existing one) from it
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf=conf)

## Accumulators

In [5]:
# Accumulator starting at 0; parseDate adds 1 to it for every value it fails
# to parse, and the total is read back through errors.value.
errors = sc.accumulator(0)

def parseDate(date):
    """Split a 'YYYY-MM-DD' string into a ``(year, 'MM-DD')`` tuple.

    Args:
        date: String expected to contain exactly three '-'-separated fields.

    Returns:
        ``(year, month + '-' + day)`` as strings, or ``None`` when ``date``
        is malformed. Every malformed value adds 1 to the ``errors``
        accumulator.

    Note:
        Spark only guarantees exactly-once accumulator updates inside
        actions; when this function runs inside a transformation such as
        ``map``, a re-executed task may apply the update again.
    """
    try:
        year, month, day = date.split('-')
        return (year, month + '-' + day)
    except (AttributeError, TypeError, ValueError):
        # ValueError: wrong number of fields. AttributeError / TypeError: the
        # value is not a usable string. Any other exception is a genuine bug
        # and must surface instead of being counted as a bad record.
        errors.add(1)
        return None

# Sample input: four well-formed dates and two truncated ones
raw_dates = [
    '2014-12-31',
    '2015-01-25',
    '2016-05-17',
    '2016-',
    '2017-01-05',
    '2014-06',
]
rdd1 = sc.parallelize(raw_dates)
rdd2 = rdd1.map(parseDate)

# collect() is the action that actually runs parseDate over the data, so the
# accumulator is only read after it
results = rdd2.collect()

print(errors.value)

2


## Broadcast variables

In [4]:
# Lookup tables (string id -> name), each wrapped in a broadcast variable
students = {'25': 'Cristina', '12': 'Lucia'}
studentsBC = sc.broadcast(students)

subjects = {'0': 'Lengua', '1': 'Matematicas'}
subjectsBC = sc.broadcast(subjects)

# Records of (student id, subject id, third value -- presumably a grade).
# The ids are ints here while the lookup keys above are strings.
records = [(25, 0, 10), (25, 1, 8), (12, 0, 7), (12, 1, 9)]
rdd1 = sc.parallelize(records)

def translate(element):
    """Replace the ids in a record with the names held in the broadcast tables.

    Args:
        element: Indexable record; position 0 is looked up in
            ``studentsBC``, position 1 in ``subjectsBC`` (both converted
            with ``str`` first), and position 2 is passed through unchanged.

    Returns:
        ``(student name, subject name, element[2])``.

    Raises:
        KeyError: If either id has no entry in its lookup table.
    """
    student_name = studentsBC.value[str(element[0])]
    subject_name = subjectsBC.value[str(element[1])]
    return (student_name, subject_name, element[2])

# Run every record through translate, bring the results back and print them
rdd2 = rdd1.map(translate)
translated = rdd2.collect()
print(translated)

[('Cristina', 'Lengua', 10), ('Cristina', 'Matematicas', 8), ('Lucia', 'Lengua', 7), ('Lucia', 'Matematicas', 9)]


## Close the Spark context

In [None]:
# Stop the SparkContext created earlier in the notebook
sc.stop()