## Setting up Spark Session / Context

In [1]:
from pyspark.sql import SparkSession
from operator import add

# Build (or reuse) a session against the standalone cluster master.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.251:7077")
    .appName("Lecture1_Example2_with_spark")
    # Dynamic allocation with shuffle tracking; the external shuffle service is disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    # Fixed ports for the driver and its block manager.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Handle for the lower-level RDD API; only ERROR-level log messages are kept.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/09 16:51:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Loading data from local file system 

In [2]:
# Register a file from the driver machine's disk with the Spark cluster.
# It can afterwards be located by its base name through SparkFiles.get().
spark_context.addFile("/home/ubuntu/DE1-Spark1/DE-2025/data/others/i_have_a_dream.txt")

In [3]:
# Resolve the absolute local filesystem path of the file registered with addFile()
# (a path inside Spark's temporary directory, not a URL).
from pyspark import SparkFiles
SparkFiles.get("i_have_a_dream.txt")

'/tmp/spark-dfe3f312-a971-44f2-b41a-170397d1c0df/userFiles-9d687b99-3a07-4e73-a293-ec8b545cfb11/i_have_a_dream.txt'

In [4]:
# Read the file that was distributed with addFile(). This function is executed
# by Spark tasks, so SparkFiles.get() resolves to the task's local copy.
def func_read_file(iterator):
    """Return the lines of ``i_have_a_dream.txt`` for a non-empty partition.

    Designed for ``RDD.mapPartitions``, which calls this function once per
    partition. The partition's elements carry no data of interest; they only
    act as a trigger. Empty partitions yield nothing, so a one-element RDD
    spread over several partitions produces the file's lines exactly once
    instead of once per partition.

    Args:
        iterator: Iterable over the elements of a single partition.

    Returns:
        list[str]: The file's lines (newline characters included) when the
        partition holds at least one element, otherwise an empty list.
    """
    for _ in iterator:
        # First element seen: read the file once and stop consuming the partition.
        with open(SparkFiles.get("i_have_a_dream.txt")) as f:
            return f.readlines()
    # Empty partition: contribute no lines, avoiding duplicated output.
    return []

# A one-element RDD is used purely as a vehicle to run func_read_file on the cluster;
# mapPartitions invokes the function once for every partition of that RDD.
lines = spark_context.parallelize([1]).mapPartitions(func_read_file)
# Alternative: read the file straight from HDFS instead.
# lines = spark_context.textFile("hdfs://de1-spark-host-180:9999/files/i_have_a_dream.txt")
lines.first()

                                                                                

'I am happy to join with you today in what will go down in history as\n'

In [5]:
# Number of partitions backing the RDD built in the previous cell.
lines.getNumPartitions()

2

## Loading the data from HDFS

In [6]:
# The same example, this time using map and reduce from the Spark API, and loading the text file from HDFS.
# Note: this rebinds `lines`, replacing the RDD that was built from the local file.

lines = spark_context.textFile("hdfs://192.168.2.251:9000/data/others/i_have_a_dream.txt")
print(lines.first())

[Stage 1:>                                                          (0 + 1) / 1]

I am happy to join with you today in what will go down in history as


                                                                                

## Performing word count using MapReduce

In [7]:
# Map step 1: tokenise every line on single spaces (one token list per line).
words = lines.map(lambda text_line: text_line.split(" "))

# Map step 2: replace each token list by the number of tokens it holds.
word_counts = words.map(len)

# Reduce step: add up the per-line counts.
total_words = word_counts.reduce(add)

print(f"total words= {total_words}")

# ... the same number of words?

total words= 1681


## More Operations

In [8]:
# Peek at the first ten elements (lines of text) of the RDD.
lines.take(10)

['I am happy to join with you today in what will go down in history as',
 'the greatest demonstration for freedom in the history of our nation.',
 'Five score years ago, a great American, in whose symbolic shadow we',
 'stand today, signed the Emancipation Proclamation. This momentous',
 'decree came as a great beacon light of hope to millions of Negro',
 'slaves who had been seared in the flames of withering injustice. It',
 'came as a joyous daybreak to end the long night of their captivity.',
 'But one hundred years later, the Negro still is not free. One hundred',
 'years later, the life of the Negro is still sadly crippled by the',
 'manacles of segregation and the chains of discrimination. One hundred']

In [9]:
# map keeps one output element per input line: each line becomes its own list of tokens.
lines_splitted = lines.map(lambda text_line: text_line.split(" "))
print(lines_splitted.first())

['I', 'am', 'happy', 'to', 'join', 'with', 'you', 'today', 'in', 'what', 'will', 'go', 'down', 'in', 'history', 'as']


In [10]:
# Note, we're in Python, but using Java naming conventions!
# (e.g. the camelCase method name `flatMap`).
# Unlike map, flatMap flattens the per-line token lists into a single RDD of words.

all_words = lines.flatMap(lambda line: line.split(" "))
all_words.take(5)

['I', 'am', 'happy', 'to', 'join']

In [11]:
# Keep only the tokens that begin with a lowercase "d" and show the first 20 matches.
(
    all_words
    .filter(lambda token: token.startswith("d"))
    .take(20)
)

['down',
 'demonstration',
 'decree',
 'daybreak',
 'discrimination.',
 'dramatize',
 'defaulted',
 'demand',
 'drug',
 'democracy.',
 'dark',
 'desolate',
 'discontent',
 'day',
 'deeds.',
 'drinking',
 'dignity',
 'discipline.',
 'degenerate',
 'distrust']

In [12]:
# Stop the SparkContext so the cores held by this application are released
# for other applications on the cluster.
spark_context.stop()