## Coder Errors

In [1]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one; otherwise build a new one,
# requesting port 3000 for the Spark web UI.
spark = (
    SparkSession.builder
    .config("spark.ui.port", 3000)
    .getOrCreate()
)

In [2]:
# Location of the sample event log, then load it into a DataFrame.
data_path = "../data/sparkify_log_small.json"
logs = spark.read.json(path=data_path)

In [3]:
# Peek at a single record to see which fields each log entry carries.
logs.take(num=1)

[Row(artist='Showaddywaddy', auth='Logged In', firstName='Kenneth', gender='M', itemInSession=112, lastName='Matthews', length=232.93342, level='paid', location='Charlotte-Concord-Gastonia, NC-SC', method='PUT', page='NextSong', registration=1509380319284, sessionId=5132, song='Christmas Tears Will Fall', status=200, ts=1513720872284, userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='1046')]

In [4]:
# Common mistakes and the errors they produce:
# - Calling a method that does not exist (e.g. misspelled): AttributeError
# - Referencing a column that does not exist: AnalysisException
# - Bringing back a result too big to fit in memory: Py4JJavaError wrapping
#   java.lang.OutOfMemoryError
# - Column names are case-INsensitive by default (spark.sql.caseSensitive is
#   "false" out of the box), which is why "firstname" below resolves to the
#   `firstName` column. Strict matching must be enabled explicitly with
#   spark.conf.set("spark.sql.caseSensitive", "true").
# - Passing the wrong kind of argument to a function: PySparkTypeError
#   (recent PySpark versions)
# - Forgetting a closing parenthesis: SyntaxError (e.g. unexpected EOF while parsing)
log = (
    logs
    .select("userId", "firstname", "page", "song")
    .filter(logs["userId"] == "1046")
)

In [5]:
# Evaluating a DataFrame only displays its schema; no rows are shown.
log

DataFrame[userId: string, firstname: string, page: string, song: string]

In [6]:
# Keep only the log entries whose page is "NextSong".
songs = logs.filter(logs["page"] == "NextSong")

In [7]:
from pyspark.sql import functions as F

In [8]:
# Sum the `length` column per user and print the first rows of the result.
(
    songs
    .groupBy("userId")
    .agg(F.sum("length"))
    .show()
)

+------+------------------+
|userId|       sum(length)|
+------+------------------+
|  2904|         348.57751|
|   691|         808.98476|
|  2294|13926.819139999998|
|  2162|        8289.81315|
|  1436|         633.39011|
|  2088|3310.0480000000002|
|  2275|         1172.1913|
|  2756|1076.6344800000002|
|   800|         517.17134|
|  1394| 5989.630679999999|
|   926|1087.8414400000001|
|  2696|         200.95955|
|   870|         463.51583|
|     7| 533.9419499999999|
|  1903|        1058.81895|
|   591|         219.79383|
|   613|         419.26439|
|   574|        1286.55491|
|   307|         281.28608|
|   577|         374.20363|
+------+------------------+
only showing top 20 rows



## Accumulators

In [9]:
# The SparkContext behind the session gives access to the RDD-level API.
sc = spark.sparkContext

# Accumulator starting at 0; its total is read back through .value.
acc = sc.accumulator(value=0)

In [10]:
# Distribute the integers 1 through 5 as an RDD.
rdd = sc.parallelize(list(range(1, 6)))

def process(x):
    """Double ``x``, counting it in the ``acc`` accumulator when it is even.

    Args:
        x: an integer element of the RDD.

    Returns:
        ``x * 2``.
    """
    is_even = x % 2 == 0
    if is_even:
        # Accumulator.add is the in-place equivalent of `acc += 1`; because the
        # name is never rebound here, no `global` declaration is needed.
        acc.add(1)
    return x * 2

# collect() is the action that actually executes the map; the accumulator is
# only updated once an action like this runs.
rdd.map(process).collect()

[2, 4, 6, 8, 10]

In [11]:
# Accumulator caveats:
# - The value cannot be read reliably inside transformations (map, filter, ...).
# - Read .value only on the driver, after all actions have completed.
# - If a task fails and is re-run, the accumulator may be incremented again,
#   causing overcounting.
# - Use them as logging/diagnostic variables; they are not suited for
#   conditionals and similar control logic.
# NOTE: accumulators are not immune to lazy evaluation! Nothing is added until
# an action (here .collect()) runs the transformation that updates them.
print(f"Number of even numbers: {acc.value}")

Number of even numbers: 2


## Broadcasts

In [12]:
from pyspark import SparkContext

# Fetch the SparkContext that is already running rather than creating another.
sc = SparkContext.getOrCreate()

# Lookup table to broadcast, plus the keys that will be resolved against it.
my_dict = {
    "item1": 1,
    "item2": 2,
    "item3": 3,
    "item4": 4,
}
my_list = list(my_dict)  # ["item1", "item2", "item3", "item4"]

# Wrap the dictionary in a broadcast variable.
my_dict_bc = sc.broadcast(my_dict)

def my_func(letter):
    """Return the entry stored under ``letter`` in the broadcast dictionary.

    ``.value`` gives access to the broadcast object, which is read-only.
    Broadcasts are best used for lookups over small-to-medium data
    (less than a few hundred MB).

    Raises:
        KeyError: if ``letter`` is not a key of the broadcast dictionary.
    """
    lookup_table = my_dict_bc.value
    return lookup_table[letter]

# Distribute the keys, then resolve each one through the broadcast lookup.
my_list_rdd = sc.parallelize(my_list)
result = my_list_rdd.map(my_func).collect()

print(result)
# [1, 2, 3, 4]

[1, 2, 3, 4]


## Spark Web UI