In [1]:
import pyspark
import numpy as np # we'll be using numpy for some numeric operations
# Reuse the active SparkContext when one already exists, so that re-running
# this cell in the same kernel does not fail (only one SparkContext may be
# active per process); a new context is created only on the first run.
sc = pyspark.SparkContext.getOrCreate()

In [2]:
# Toy web graph as an adjacency list: one (page, outgoing links) tuple per
# page, with pages identified by integers. Every page has exactly one entry.
links = [
    (1, [2, 3, 4, 5, 6]),
    (2, [3, 5]),
    (3, [2, 6]),
    (4, [1, 3, 5]),
    (5, [2, 4]),
    (6, [1, 5]),
]

# Turn the list into an RDD and ask Spark to keep it in memory
# (persist() returns the RDD itself, so the calls can be chained).
linksRDD = sc.parallelize(links).persist()

# count() is an action; N is the number of pages in the graph.
N = linksRDD.count()

In [3]:
# Start from a uniform distribution: every page gets the same rank 1/N.
# mapValues keeps the page id as key and replaces the link list by the rank.
ranks = linksRDD.mapValues(lambda _out_links: 1/N)

In [4]:
## constants and auxiliary functions

ITERATIONS = 10  # number of PageRank update rounds run by the loop below
alpha = 0.15     # weight of the uniform alpha / N term used in smoothen()

def contr(x):
    """
    Split the rank of one page evenly among the pages it links to.

    x is a pair (page_u, (links, rank)), where links is the list of
    pages that page_u points to and rank is page_u's current score.

    Returns a list with one (node_v, rank / len(links)) pair per
    outgoing link; the list is empty when page_u has no links.
    """
    page_u, (out_links, rank) = x  # x is a pair whose second item is a pair

    # the share is computed per link, so an empty link list never divides
    return [(node_v, rank / len(out_links)) for node_v in out_links]

def smoothen(x):
    """
    Turn the summed contributions of node v into its pagerank score.

    x is a pair (node_v, sum_of_contributions). The result is the pair
    (node_v, score), where score combines a uniform share alpha / N
    with the received contributions weighted by (1 - alpha).
    alpha and N are read from the module-level names.
    """
    node_v, received = x[0], x[1]
    score = alpha / N + (1 - alpha) * received
    return (node_v, score)

def add(x, y):
    """Combine two values with `+` and return the result."""
    total = x + y
    return total

In [5]:
# pagerank computation
for i in range(ITERATIONS):
    # pair each page's links with its current rank, then let contr()
    # emit one (target page, share of rank) pair per outgoing link
    contribs = (
        linksRDD
        .join(ranks)
        .flatMap(contr)
    )
    # sum the shares received by each page and turn the sum into a score;
    # only transformations are chained here, so no action is triggered yet
    ranks = (
        contribs
        .reduceByKey(add)
        .map(smoothen)
    )

In [6]:
# collect() is an action: it runs the transformations chained above and
# returns the (page, rank) pairs as a Python list
ranks.collect() # action - collect result

[(1, 0.11552302464988459),
 (2, 0.21821520066933991),
 (3, 0.17624644596941755),
 (4, 0.14248077942041987),
 (5, 0.22691621485653363),
 (6, 0.1206183344344042)]

***

In [7]:
import time

Let's say we change the pagerank computation to add an action on `ranks` (like below).
What is the effect of this?

In [None]:
# Time the PageRank loop when an action is forced on every iteration.
# time.perf_counter() is used because time.clock() was removed in Python 3.8
# (and on Unix it reported process CPU time rather than elapsed time).
t0 = time.perf_counter()
for i in range(ITERATIONS):
    contribs = linksRDD.join(ranks).flatMap(contr)
    ranks = contribs.reduceByKey(add).map(smoothen)

    # what if we add the following action?
    ranks.take(1) # try to run the cell again after you comment this line out

t1 = time.perf_counter()
dt = t1 - t0  # elapsed time in seconds
print("It took", dt, "seconds.")