In [None]:
import os
import string
import sys
from pprint import pprint

# Location of the local Spark distribution; exported as SPARK_HOME below.
spark_home_folder = r'C:\Spark\spark-1.6.1-bin-hadoop2.6'
os.environ['SPARK_HOME'] = spark_home_folder

# Make the PySpark sources shipped with the distribution importable.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
pyspark_path = os.path.join(spark_home_folder, 'python')
if pyspark_path not in sys.path:
    sys.path.append(pyspark_path)

try:
    from pyspark import SparkContext
    from pyspark import SparkConf

except ImportError as e:
    # Without pyspark nothing else in this notebook can run, so stop here.
    print("Error: ", e)
    sys.exit(1)

# Local master, application name "spark_wc".
conf = SparkConf().setMaster("local").setAppName("spark_wc")

# getOrCreate() returns the already running context if there is one, so
# re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [44]:
# Display the SparkContext object as the cell output.
sc

<pyspark.context.SparkContext at 0x1ee3ceb588>

In [None]:
# flatMapValues(func)
# func is applied to the value of each pair; every item it returns is
# emitted as its own pair under the original key.

pairs = [(1,2), (1,3), (2,4)]
rdd = sc.parallelize(pairs)
print("Original RDD: ")
print(rdd.collect())

# Each value x is expanded into the two values x and x+1.
expanded = rdd.flatMapValues(lambda value: range(value, value + 2))
rdd2 = expanded.collect()
print("After transformation: ")
print(rdd2)

In [16]:
# Read the data file and map each line to a key-value pair.
# Each line has ',' separated values. The key is the field at index 41
# (the last field of the records printed below) and the value is the list
# of all fields of the line.

import urllib.request
# Uncomment to download the data file next to the notebook:
# f = urllib.request.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")
data_file = "./kddcup.data_10_percent.gz"
raw_data = sc.textFile(data_file)
# print(raw_data.take(5))

# top(1) returns the largest line in sort order, not the first line of the file.
print(raw_data.top(1))


def to_key_value(line):
    """Turn one comma-separated record into a (key, fields) pair.

    The line is split only once and the result is reused for both the key
    and the value.

    Args:
        line: one text line of the data file.

    Returns:
        A tuple (fields[41], fields) where fields is the list of all
        comma-separated items of the line.

    Raises:
        IndexError: if the line has fewer than 42 fields.
    """
    fields = line.split(',')
    return (fields[41], fields)


# Creating Key - Value pairs
rdd = raw_data.map(to_key_value)

# Printing generated KV pair
print("Generated Key-Value pairs:")
print(rdd.top(1))

['9949,udp,other,SF,146,105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,3,0.01,0.73,0.98,0.00,0.00,0.00,0.00,0.00,normal.']
Generated Key-Value pairs:
[('warezmaster.', ['9', 'tcp', 'ftp_data', 'SF', '0', '5153771', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '12', '12', '1.00', '0.00', '1.00', '0.00', '0.00', '0.00', '0.00', '0.00', 'warezmaster.'])]


In [20]:
# collectAsMap()
# Collects the key-value pairs into a dictionary.
# If a key has many values, only one of them is kept in the dictionary
# (the latest one, as the printed result shows for key 1).

rdd = sc.parallelize([(1,2), (1,3), (2,4)])
pairs_as_dict = rdd.collectAsMap()
print(pairs_as_dict)


{1: 3, 2: 4}


In [25]:
# lookup(key)
# Gives back, as a list, every value that is paired with the given key.

rdd = sc.parallelize([(1,2), (1,3), (2,4)])
values_for_key = rdd.lookup(1)
print(values_for_key)

[2, 3]


### aggregateByKey(aggregation_variable, combining_function, merging_function)

aggregateByKey() is used to perform operations on key-value pairs.

This can also be achieved by grouping all values of a key and then applying an aggregation function
But grouping can be expensive as data needs to be shuffled across nodes.
With aggregateByKey() we first perform operations to get a partial aggregate value in each partition
and then use the partial values from each partition to compute the final value.
This is analogous to the combiner step in Hadoop MapReduce jobs.

The function takes 3 parameters:
    1. `aggregateVariable`: initial value of aggregation variable
    2. `combining function`: 
        This function is used to compute partial aggregation value in each partition
        Takes 2 input parameters.
        a. aggregation variable
        b. value from key-value pair
    3. `merging function`: 
        This function merges partial aggregate values from each partition
        Takes 2 input parameters
        a. partial value from partition 1
        b. partial value from partition 2

The following example illustrates the usage of aggregateByKey()


In [37]:
# aggregateByKey(aggregation_variable, combining_function, merging_function)


# Count how many times each letter occurs

def add_within_partition(count, value):
    # Combining function: folds one value into the running count of its key.
    # count starts at the initial value (0) passed to aggregateByKey.
    return count + value


def add_across_partitions(left, right):
    # Merging function: adds up the partial counts coming from two partitions.
    return left + right


letters = sc.parallelize(list("aaaaabbbbbbdfdfsssdfdfdsddfsfefds"))
letter_ones = letters.map(lambda letter: (letter, 1))
# First argument: initial value of the count for each key.
rdd = letter_ones.aggregateByKey(0, add_within_partition, add_across_partitions)
print(rdd.collect())

[('s', 6), ('b', 6), ('a', 5), ('e', 1), ('f', 7), ('d', 8)]


### combineByKey()

combineByKey() is a transformation similar to aggregateByKey(). 


combineByKey is more general than aggregateByKey. aggregateByKey is suitable for computing aggregations for keys, for example sums. aggregateByKey() performs an additional computation after the map on local partitions to reduce the amount of data sent out to other nodes and the driver. 

combineByKey is more general and offers the flexibility to specify any map side combine function. 

Usage:

`combineByKey(createCombiner, mergeValue, mergeCombiner)`

combineByKey() takes 3 other functions as input parameters.
    1. `createCombiner`: 
        This is the very first aggregation step for each key. 
        All required variables can be initialized here. 
        Ex: ```python lambda value: (value, 1)```
        For average, we need to maintain sum and count variables (sum, count)
            
            
    2. `mergeValue`:
        Given a new value for a key, this function defines how to manipulate the data structure 
        created in createCombiner. Takes 2 input parameters. The first parameter is the combiner data structure and
        second parameter is the value.
        This operation takes place in each partition and partial values are computed.
        Ex: ```python lambda x, val: (x[0] + val, x[1] + 1)```
        x is the combiner data structure, where x[0] is sum and x[1] is count
        val is the new value
   
    3. `mergeCombiner`:
        This function defines how to merge combiners. It takes partial combiner values from 2 partitions as inputs. 
        Each input is obtained from a different partition and the two are merged together.
        
        Ex: ```python lambda x, y: (x[0] + y[0], x[1] + y[1])```


Reference: http://abshinn.github.io/python/apache-spark/2014/10/11/using-combinebykey-in-apache-spark/

In [41]:
# Mean of a list of numbers, computed with combineByKey()

def create_combiner(first_value):
    # First value seen for a key: start a (sum, count) pair.
    return (first_value, 1)


def merge_value(acc, value):
    # Fold another value into an existing (sum, count) pair.
    return (acc[0] + value, acc[1] + 1)


def merge_combiners(left, right):
    # Combine the (sum, count) pairs of two partitions.
    return (left[0] + right[0], left[1] + right[1])


def to_average(pair):
    # pair is (key, (sum, count)); multiplying by 1.0 forces float division.
    total, count = pair[1]
    return ("average", total*1.0/count)


rdd = sc.parallelize([1,2,3,4,5,6,7])
# Every number gets the same key so that a single average is produced.
keyed = rdd.map(lambda number: ('key', number))
combiner_rdd = keyed.combineByKey(create_combiner, merge_value, merge_combiners)

avg_rdd = combiner_rdd.map(to_average)

avg_rdd.collect()

[('average', 4.0)]

In [42]:
# Shut down the SparkContext used by this notebook.
sc.stop()