In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# We are not running on a Spark cluster, so this session is only for development.
# getOrCreate() hands back the session built from the settings chained before it.

spark = (
    SparkSession.builder
    .appName("Maps and Lazy Evaluation Example")
    .getOrCreate()
)

23/07/10 10:25:59 WARN Utils: Your hostname, Phoebes-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.101.2 instead (on interface en0)
23/07/10 10:25:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/10 10:26:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Begin with an ordinary Python list of song titles.
# Some titles repeat, and one entry ("despacito") is already all lowercase.

log_of_songs = [
    "Despacito",
    "Nice for what",
    "No tears left to cry",
    "Despacito",
    "Havana",
    "In my feelings",
    "Nice for what",
    "despacito",
    "All the stars",
]

In [5]:
# Parallelize log_of_songs so that Spark can work with it.
# distributed_song_log_rdd is an RDD (Resilient Distributed Dataset).

distributed_song_log_rdd = (
    spark.sparkContext
    .parallelize(log_of_songs)
)

In [6]:
# Note that we deliberately use .foreach() here rather than .collect().
# What is the difference between the two?
# .collect() pulls every element of the RDD from ALL the nodes back into this
# one Python process, which is slow and could crash it on a large dataset.
# .foreach() applies the function where the data lives, so the data stays on
# each of the independent nodes.
# NOTE(review): because print runs next to the data, the lines show up here only
# because this session is local; on a real cluster they would presumably go to
# the executors' stdout instead -- confirm against the Spark docs.

# Show that the original input data is preserved. The printed order can differ
# from the list order, as the output below shows.

distributed_song_log_rdd.foreach(print)

Nice for what                                                       (0 + 8) / 8]
Despacito
No tears left to cry
In my feelings
Nice for what
Despacito
Havana
despacito
All the stars
                                                                                

In [7]:
# create a python function to convert strings to lowercase

def convert_song_to_lowercase(song):
    """Return the given song title converted to lowercase.

    The input is not modified; the result of calling ``song.lower()`` is
    handed back to the caller.
    """
    lowered_title = song.lower()
    return lowered_title

In [8]:
# Quick check of the helper on a single title; the expected output is: havana
print(convert_song_to_lowercase("Havana"))

havana


In [9]:
# Apply the Python lowercase helper to every song title with map().
# map() is a lazy transformation: it only describes the new RDD, and no work is
# done until an action such as foreach() asks for the results.

lower_case_songs = distributed_song_log_rdd.map(convert_song_to_lowercase)
lower_case_songs.foreach(print)

nice for what
despacito
in my feelings
havana
despacito
no tears left to cry
nice for what
despacito
all the stars


In [10]:
# Show that the original input data is still mixed case: the map above produced
# a new RDD (lower_case_songs) and left distributed_song_log_rdd unchanged.

distributed_song_log_rdd.foreach(print)

Nice for what
Despacito
Havana
No tears left to cry
despacito
All the stars
In my feelings
Nice for what
Despacito


In [11]:
# The same map operation, written with an anonymous (lambda) function
# instead of the named helper.

(
    distributed_song_log_rdd
    .map(lambda song: song.lower())
    .foreach(print)
)

havana
no tears left to cry
in my feelings
nice for what
despacito
despacito
all the stars
despacito
nice for what


In [12]:
# Same map as the previous cell; only the lambda's parameter name differs
# (x instead of song), which does not change the result.
(
    distributed_song_log_rdd
    .map(lambda x: x.lower())
    .foreach(print)
)

despacito
nice for what
no tears left to cry
despacito
in my feelings
havana
despacito
all the stars
nice for what
