In [1]:
# foreach - custom code you write to process the result item by item
# foreachPartition - custom code you write to process the data in each partition
# foreach and foreachPartition are action methods
# foreach and foreachPartition run on the executors

In [2]:
# findspark.init() runs before pyspark is imported in the next cell; it is expected to
# put the local Spark installation on sys.path so that import succeeds (see findspark docs).
import findspark
findspark.init()

In [3]:
from pyspark import SparkConf, SparkContext

# Reuse the active SparkContext when this cell is re-run: constructing a second
# SparkContext in the same kernel raises an error. The master ("local") and the
# app name ("ForEach") are unchanged and apply when no context exists yet.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("ForEach")
)

22/05/06 22:20:46 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/06 22:20:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/06 22:20:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/06 22:20:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/06 22:20:59 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/06 22:20:59 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/05/06 22:20:59 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [4]:
# foreach 
# apply a function to each element in the rdd [all partitions]
# write any custom functions that deal with db, datastore, cache etc


# Sample stock orders as (symbol, qty) tuples. INFY and TSLA each appear twice.
orders = [
    ('INFY', 200), ('TSLA', 50), ('EMR', 20),
    ('INFY', 100), ('TSLA', 25),
]

def add(acc, value):
    """Return ``acc + value`` and print the operands and the sum as a trace line.

    Used as the combining function for ``foldByKey`` in this cell.
    """
    total = acc + value
    print("acc", acc, "value", value, "output", total)
    return total

# Spread the orders over 2 partitions.
orderRdd = sc.parallelize(orders, numSlices=2)
# Sum the quantities per symbol, starting from 0 (refer to S015-Fold for a fold example).
resultRdd = orderRdd.foldByKey(zeroValue=0, func=add)
# collect() brings the whole result back to the driver, which is not efficient:
# in Spark the driver is not distributed and not scalable, it runs on only one system,
# whereas the executors are distributed and run in parallel across the cluster.
resultRdd.collect()

acc 0 value 200 output 200                                          (0 + 1) / 2]
acc 0 value 50 output 50
acc 0 value 20 output 20
acc 0 value 100 output 100
acc 0 value 25 output 25
acc 200 value 100 output 300                                        (0 + 1) / 2]
                                                                                

[('INFY', 300), ('EMR', 20), ('TSLA', 75)]

In [5]:
# now let us process the data on the executors using foreach
# resultRdd holds the result; the function we pass to foreach may store each item in a data store
# good for processing ONE RESULT at a time
# foreach is executed in the executor process, not in the driver
def updateDB(stock):
    """Stand-in for a database write: only prints the record it receives.

    TODO: perform the real update / insert / delete of the record here.
    """
    message = " ".join(("Saving ", str(stock), " to db "))
    print(message)
    
# foreach is an ACTION method: it runs on the executors and calls updateDB once for every item in resultRdd
resultRdd.foreach(updateDB)

acc 200 value 100 output 300
Saving  ('INFY', 300)  to db 
Saving  ('EMR', 20)  to db 
acc 50 value 25 output 75
Saving  ('TSLA', 75)  to db 


In [6]:
# foreachPartition
# custom logic to handle the data in each partition
# runs inside the executors
# foreach processes 1 element at a time,
# whereas foreachPartition can process all
# the data of a partition in bulk,
# e.g. bulk insert/update/delete

# the iterator holds the data of one whole partition
# e.g. if part0 has 5 records, the iterator yields those 5 records
# processResultData is called by foreachPartition on the executor, once for each partition
# the partition's iterator is passed to it as the argument
def processResultData(iterator):
    """Print every record produced by ``iterator``, framed by a banner and a rule.

    ``iterator`` is consumed one record at a time; nothing is returned.
    """
    print("Process data called ")
    for text in map(str, iterator):
        print("Processing ", text)
    print('-' * 30)
# Action method: processResultData is invoked once for each partition of resultRdd
resultRdd.foreachPartition(processResultData)

acc 200 value 100 output 300
Process data called 
Processing  ('INFY', 300)
Processing  ('EMR', 20)
------------------------------
acc 50 value 25 output 75
Process data called 
Processing  ('TSLA', 75)
------------------------------
