# Setup

In [1]:
# import modules
from pyspark import SparkConf, SparkContext
import re
import collections

In [2]:
# manually create (or reuse) the spark context for this notebook
conf = SparkConf() \
    .setMaster("local") \
    .setAppName("RDD")

# getOrCreate() hands back the already-running context when this cell is
# re-run; calling SparkContext(conf=conf) a second time in the same kernel
# fails because only one SparkContext may be active at a time
sc = SparkContext.getOrCreate(conf=conf)

24/10/24 12:13:31 WARN Utils: Your hostname, Nicks-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 192.168.0.73 instead (on interface en0)
24/10/24 12:13:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/24 12:13:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Spark
- Fast and general engine for large-scale data processing.
- Capable of splitting workloads across multiple machines and running in parallel. 
- DAG Engine (directed acyclic graph) optimizes workflow.
- Spark Components
    - Spark Core
        - Spark Streaming
        - Spark SQL
        - MLlib
        - GraphX

# Resilient Distributed Dataset (RDD)
- Resilient
- Distributed
- Dataset
- Lazy Evaluation: Driver program does not begin execution until an **action** is called.

## Spark Context
- Created by driver program
- Creates RDDs
- Responsible for making RDDs resilient and distributed
- SparkContext (sc) object is created automatically in the Spark shell (in this notebook it is created manually in the setup cell)

## Common RDD Transformations
- map
- flatMap
- filter
- distinct
- sample
- union
- intersection
- subtract
- cartesian

## Common RDD Actions
- collect
- count
- countByValue
- take
- top
- reduce

## Simple Example

In [3]:
# distinct() + map() demo using an inline lambda
rdd1 = sc.parallelize([1, 1, 2, 2, 3, 3, 4, 4])
print(rdd1.collect())

# de-duplicate rdd1, then square every remaining value (method chaining)
result1 = (
    rdd1
    .distinct()
    .map(lambda value: value * value)
)

print(result1.collect())

                                                                                

[1, 1, 2, 2, 3, 3, 4, 4]


[Stage 1:>                                                          (0 + 1) / 1]

[1, 4, 9, 16]


                                                                                

In [4]:
# the lambda used in the previous example can also be written as a named function
def squareIt(x):
    """Return x multiplied by itself."""
    squared = x * x
    return squared

# build a small RDD and apply the named function to every element
rdd2 = sc.parallelize([1, 2, 3, 4])
result2 = rdd2.map(squareIt)
# collect() is the action here; its return value is shown as the cell output
result2.collect()

[1, 4, 9, 16]

## Key Value RDDs
- Similar to a NoSQL store
- Allows aggregation by keys

## Common Key-Value Functions
- mapValues()
- flatMapValues()
- reduceByKey()
- groupByKey()
- sortByKey()
- keys(), values()

In [5]:
# load data as an RDD of raw text lines (parsed in the next cell)
# schema: userId, userName, age, numFriends
friends = sc.textFile('../data/fakefriends.csv')

In [6]:
# parse (map) one raw input line into an (age, numFriends) key-value pair
def parseFriends(line):
    """Return (age, numFriends) as ints, read from columns 2 and 3 of a comma-separated line."""
    columns = line.split(',')
    return (int(columns[2]), int(columns[3]))

# NOTE: this rebinds `friends` from raw text lines to (age, numFriends) tuples,
# so re-running this cell without first re-running the load cell would hand
# tuples (which have no .split) to parseFriends
friends = friends.map(parseFriends)

# preview results
friends.take(5)

[(33, 385), (26, 2), (55, 221), (40, 465), (68, 21)]

In [7]:
# sum of friends and number of people, keyed by age
totalsByAge = (
    friends
    .mapValues(lambda numFriends: (numFriends, 1))  # (numFriends, 1) so counts can be summed too
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
)

sortedTotalsByAge = totalsByAge.sortBy(lambda pair: pair[0])

results = sortedTotalsByAge.collect()

# each row is: age, (sum friends, sum occurrences)
for row in results:
    print(row)

(18, (2747, 8))
(19, (2346, 11))
(20, (825, 5))
(21, (2807, 8))
(22, (1445, 7))
(23, (2463, 10))
(24, (1169, 5))
(25, (2172, 11))
(26, (4115, 17))
(27, (1825, 8))
(28, (2091, 10))
(29, (2591, 12))
(30, (2594, 11))
(31, (2138, 8))
(32, (2287, 11))
(33, (3904, 12))
(34, (1473, 6))
(35, (1693, 8))
(36, (2466, 10))
(37, (2244, 9))
(38, (2903, 15))
(39, (1185, 7))
(40, (4264, 17))
(41, (2417, 9))
(42, (1821, 6))
(43, (1614, 7))
(44, (3386, 12))
(45, (4024, 13))
(46, (2908, 13))
(47, (2099, 9))
(48, (2814, 10))
(49, (1108, 6))
(50, (1273, 5))
(51, (2115, 7))
(52, (3747, 11))
(53, (1560, 7))
(54, (3615, 13))
(55, (3842, 13))
(56, (1840, 6))
(57, (3106, 12))
(58, (1282, 11))
(59, (1980, 9))
(60, (1419, 7))
(61, (2306, 9))
(62, (2870, 13))
(63, (1536, 4))
(64, (3376, 12))
(65, (1491, 5))
(66, (2488, 9))
(67, (3434, 16))
(68, (2696, 10))
(69, (2352, 10))


In [8]:
# average number of friends, keyed by age
totalsByAge = (
    friends
    .mapValues(lambda numFriends: (numFriends, 1))  # (numFriends, 1) so counts can be summed too
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
)

# sum / count per age; int() truncates the average to a whole number
averagesByAge = totalsByAge.mapValues(lambda total: int(total[0] / total[1]))
sortedAveragesByAge = averagesByAge.sortBy(lambda pair: pair[0])

results = sortedAveragesByAge.collect()

# each row is: age, avg friends
for row in results:
    print(row)

(18, 343)
(19, 213)
(20, 165)
(21, 350)
(22, 206)
(23, 246)
(24, 233)
(25, 197)
(26, 242)
(27, 228)
(28, 209)
(29, 215)
(30, 235)
(31, 267)
(32, 207)
(33, 325)
(34, 245)
(35, 211)
(36, 246)
(37, 249)
(38, 193)
(39, 169)
(40, 250)
(41, 268)
(42, 303)
(43, 230)
(44, 282)
(45, 309)
(46, 223)
(47, 233)
(48, 281)
(49, 184)
(50, 254)
(51, 302)
(52, 340)
(53, 222)
(54, 278)
(55, 295)
(56, 306)
(57, 258)
(58, 116)
(59, 220)
(60, 202)
(61, 256)
(62, 220)
(63, 384)
(64, 281)
(65, 298)
(66, 276)
(67, 214)
(68, 269)
(69, 235)


## Filtering RDDs
- Removes data from RDD based on boolean condition

In [9]:
# load data as an RDD of raw text lines (parsed in the next cell)
# schema: weatherStationId, date, entryType, temperature
temps = sc.textFile('../data/1800.csv')

In [10]:
# parse one raw input line into a (stationId, entryType, temperature) tuple
def parseTemps(line):
    """Return (stationId, entryType, temperature) from columns 0, 2 and 3 of a comma-separated line.

    Column 3 is scaled by 0.1 and then passed through the Celsius-to-Fahrenheit
    formula, so the raw reading is presumably in tenths of a degree Celsius
    (TODO confirm against the data source).
    """
    columns = line.split(',')
    scaled = float(columns[3]) * 0.1
    fahrenheit = scaled * (9.0 / 5.0) + 32.0  # converts from celsius
    return (columns[0], columns[2], fahrenheit)

# NOTE: this rebinds `temps` from raw text lines to parsed tuples, so
# re-running this cell without first re-running the load cell would hand
# tuples (which have no .split) to parseTemps
temps = temps.map(parseTemps)

# preview results
temps.take(5)

[('ITE00100554', 'TMAX', 18.5),
 ('ITE00100554', 'TMIN', 5.359999999999999),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 16.52),
 ('EZE00100082', 'TMIN', 7.699999999999999)]

In [11]:
# calculate min temp for each weather station
# entryType is matched with == so only exact 'TMIN' rows are kept; a substring
# test would also accept any longer entry type that merely contains 'TMIN'
minTemps = temps.filter(lambda x: x[1] == 'TMIN')
minStationTemps = minTemps.map(lambda x: (x[0], x[2]))  # (stationId, temperature)
minStationTempsPer = minStationTemps.reduceByKey(lambda x, y: min(x, y))

minResults = minStationTempsPer.collect()

for stationId, temperature in minResults:
    print(f'MIN {stationId}\t{round(temperature, 2)}F')

# calculate max temp for each weather station (same pipeline, exact 'TMAX' rows)
maxTemps = temps.filter(lambda x: x[1] == 'TMAX')
maxStationTemps = maxTemps.map(lambda x: (x[0], x[2]))  # (stationId, temperature)
maxStationTempsPer = maxStationTemps.reduceByKey(lambda x, y: max(x, y))

maxResults = maxStationTempsPer.collect()

for stationId, temperature in maxResults:
    print(f'MAX {stationId}\t{round(temperature, 2)}F')

MIN ITE00100554	5.36F
MIN EZE00100082	7.7F
MAX ITE00100554	90.14F
MAX EZE00100082	90.14F


## Applying map() and flatMap() to RDDs
- map() transforms each element of an RDD into one new element
- flatMap() can create many new elements from each element

In [12]:
# load data as an RDD of raw text lines
book = sc.textFile('../data/book.txt')

In [13]:
# normalize words function
# extract words ignoring punctuation and transform to lower case
def normalizeWords(text):
    r"""Return the lower-cased words found in text, in order of appearance.

    A word is a maximal run of word characters (regex \w+). Splitting on \W+
    instead would yield empty strings for blank lines and for text that starts
    or ends with punctuation; findall never produces those, so '' is not
    counted as a word downstream.
    """
    return re.findall(r'\w+', text.lower(), re.UNICODE)

In [14]:
# count occurrences of each word, then list them most frequent first
words = book.flatMap(normalizeWords)
wordCounts = words.countByValue()
sortedWordCounts = list(wordCounts.items())
sortedWordCounts.sort(key=lambda entry: entry[1], reverse=True)

for word, count in sortedWordCounts:
    # drop any non-ASCII characters; skip words that end up empty
    asciiWord = word.encode('ascii', 'ignore').decode()
    if not asciiWord:
        continue
    print(f'{asciiWord} {count}')

you 1878
to 1828
your 1420
the 1292
a 1191
of 970
and 934
that 747
it 649
in 616
is 560
for 537
on 428
are 424
if 411
s 391
i 387
business 383
can 376
be 369
as 343
have 321
with 315
t 301
this 280
or 278
time 255
but 242
they 234
will 231
what 229
at 220
my 215
re 214
do 207
not 203
about 202
more 200
product 182
an 178
up 177
need 174
them 166
from 166
how 163
there 162
out 161
new 153
people 145
work 144
so 143
just 142
own 140
all 137
don 133
get 123
customers 123
by 122
want 122
company 122
their 122
some 121
ll 114
self 111
website 109
make 108
may 107
even 104
when 102
one 100
ve 95
than 92
also 91
job 90
much 90
who 88
money 86
was 85
these 82
find 81
sales 80
only 79
into 79
yourself 78
other 78
like 78
no 76
probably 76
employment 75
ads 75
day 73
good 72
many 71
before 70
most 70
might 70
ad 70
should 69
those 68
products 67
market 66
well 66
sure 65
still 65
plan 64
google 63
someone 62
over 62
any 62
software 60
idea 60
enough 59
once 59
then 59
very 58
working 58
think 58

# Exercise: Find Total Amount by Customer

In [15]:
# load data as an RDD of raw text lines
# schema: customerId, itemId, amountSpent
orders = sc.textFile('../data/customer-orders.csv')

In [16]:
# extract customer price pair function
def extractCustomerPricePairs(line):
    """Return (customerId, amountSpent) from columns 0 and 2 of a comma-separated line.

    customerId is converted to int and amountSpent to float.
    """
    columns = line.split(',')
    return (int(columns[0]), float(columns[2]))

In [17]:
# calculate total amount spent by each customer, largest total first
mappedOrders = orders.map(extractCustomerPricePairs)
totalByCustomer = mappedOrders.reduceByKey(lambda x, y: x + y)
sortedTotalByCustomer = totalByCustomer.sortBy(lambda x: x[1], ascending=False)

results = sortedTotalByCustomer.collect()

for customer, totalAmount in results:
    # :.2f always prints two decimals (e.g. $6206.20); formatting the result
    # of round() instead drops trailing zeros and would print $6206.2
    print(f'CustomerId: {customer},\tTotal Amt: ${totalAmount:.2f}')

CustomerId: 68,	Total Amt: $6375.45
CustomerId: 73,	Total Amt: $6206.2
CustomerId: 39,	Total Amt: $6193.11
CustomerId: 54,	Total Amt: $6065.39
CustomerId: 71,	Total Amt: $5995.66
CustomerId: 2,	Total Amt: $5994.59
CustomerId: 97,	Total Amt: $5977.19
CustomerId: 46,	Total Amt: $5963.11
CustomerId: 42,	Total Amt: $5696.84
CustomerId: 59,	Total Amt: $5642.89
CustomerId: 41,	Total Amt: $5637.62
CustomerId: 0,	Total Amt: $5524.95
CustomerId: 8,	Total Amt: $5517.24
CustomerId: 85,	Total Amt: $5503.43
CustomerId: 61,	Total Amt: $5497.48
CustomerId: 32,	Total Amt: $5496.05
CustomerId: 58,	Total Amt: $5437.73
CustomerId: 63,	Total Amt: $5415.15
CustomerId: 15,	Total Amt: $5413.51
CustomerId: 6,	Total Amt: $5397.88
CustomerId: 92,	Total Amt: $5379.28
CustomerId: 43,	Total Amt: $5368.83
CustomerId: 70,	Total Amt: $5368.25
CustomerId: 72,	Total Amt: $5337.44
CustomerId: 34,	Total Amt: $5330.8
CustomerId: 9,	Total Amt: $5322.65
CustomerId: 55,	Total Amt: $5298.09
CustomerId: 90,	Total Amt: $5290.41