In [1]:
from __future__ import print_function

Import Spark and start a local Spark context

In [2]:
from pyspark import SparkConf, SparkContext
# Use getOrCreate so that re-running this cell reuses the already-running
# context instead of failing because only one SparkContext may be active.
# The conf is only applied when a new context is actually created.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

Define mapping functions

In [3]:
def parse_edge(s):
  """Parse one tab-separated line into an integer ``(user, follower)`` pair.

  Raises ValueError when the line does not hold exactly two tab-separated
  fields or when a field is not an integer literal.
  """
  fields = s.split("\t")
  user_field, follower_field = fields
  user_id = int(user_field)
  follower_id = int(follower_field)
  return (user_id, follower_id)

def step(item):
  """Advance one hop along an edge.

  ``item`` is a joined record ``(vertex, (distance, neighbour))``; the result
  is ``(neighbour, distance + 1)``, i.e. the neighbour paired with a distance
  one larger than that of the vertex it was reached from.
  """
  joined = item[1]
  distance_so_far = joined[0]
  neighbour = joined[1]
  return (neighbour, distance_so_far + 1)

def complete(item):
  """Pick the distance to keep for a vertex after an outer join.

  ``item`` is ``(vertex, (old_distance, new_distance))``. The old distance
  wins whenever it is present (including 0); the new distance is used only
  when the old one is ``None``.
  """
  vertex = item[0]
  known, proposed = item[1][0], item[1][1]
  if known is None:
    return (vertex, proposed)
  return (vertex, known)



Global settings

In [4]:
# Number of partitions; this same value is passed to partitionBy(), join()
# and fullOuterJoin() in the cells below.
n = 400


Loading data


In [5]:
# Each input line is parsed by parse_edge into a (user, follower) pair.
edges = (
    sc.textFile("/data/twitter/twitter_sample_small.txt")
    .map(parse_edge)
    .cache()
)
# Swap every pair to (follower, user) so the RDD is keyed by follower, then
# partition it into n partitions and keep it persisted for repeated joins.
forward_edges = (
    edges
    .map(lambda pair: (pair[1], pair[0]))
    .partitionBy(n)
    .persist()
)

Define start node

In [6]:
# x is the start vertex and d the current distance level (0 at the start).
x, d = 12, 0
# Seed the distance table with the single (vertex, distance) record for the
# start vertex, split into n partitions.
distances = (
    sc.parallelize([(x, d)])
    .partitionBy(n)
)

Compute the distances from the start node to every reachable node

In [None]:
# Level-by-level search: each pass extends the known distances by one hop and
# stops as soon as a pass discovers no vertex at distance d + 1.
while True:
    print("=== Iteration ", d)
    print("Computing candidates...")
    # A vertex reached from several already-known vertices yields several
    # candidate records. Keep only the smallest one per vertex: fullOuterJoin
    # emits the cross product of matching records, so duplicate keys would
    # otherwise multiply on every iteration and inflate the count below.
    candidates = distances.join(forward_edges, n).map(step).reduceByKey(min, n)
    print("Computing new distances...")
    # complete() never changes the key, so the partitioning can be preserved.
    new_distances = distances.fullOuterJoin(candidates, n).map(complete, True).persist()
    print("Counting new nodes discovered...")
    count = new_distances.filter(lambda i: i[1] == d + 1).count()
    if count > 0:
        d += 1
        # new_distances is materialized by the count above; release the
        # superseded table (a no-op for the initial, never-persisted RDD).
        distances.unpersist()
        distances = new_distances
    else:
        # Nothing new was found: `distances` is already final, so drop the
        # cached copy that is being discarded.
        new_distances.unpersist()
        break

=== Iteration  0
Computing candidates...
Computing new distances...
Counting new nodes discovered...
=== Iteration  1
Computing candidates...
Computing new distances...
Counting new nodes discovered...
