In [1]:
import time
from pyspark.sql import SparkSession

In [14]:
file = "data/test-graph.txt"
max_iter = 100
num_partition = 4

In [3]:
# Load the data
textFile = sc.textFile(file)
textFile.count()

16

In [4]:
dataFile = textFile.filter(lambda l: len(l) > 0 and l[0] != '#')
dataFile.count()

11

In [17]:
def to_int_tuple(line, delim='\t'):
    strings = line.split(delim)[:2]
    return (int(strings[0]), int(strings[1]))
    
edges = dataFile.map(to_int_tuple)
edges.partitionBy(num_partition)
edges.setName('Edges')
edges.cache()
assert edges.filter(lambda t: len(t) == 2).count() == dataFile.count()

In [18]:
def switch_key_value(kv):
    return (kv[1], kv[0])

def compose(r1, r2):
    """ Compose 2 relations represented by PairRDDs. """
    r1_flip = r1.map(switch_key_value)
    joined = r1_flip.join(r2, num_partition)
    return joined.values()

paths = edges
for i in range(2):
    print("###########")
    paths = compose(paths, edges)
    paths.cache()
    for p in paths.collect():
        print(p)

###########
(3, 5)
(0, 2)
(4, 6)
(6, 10)
(1, 3)
(9, 6)
(5, 7)
(5, 8)
(5, 9)
(10, 7)
(10, 8)
(10, 9)
(2, 4)
###########
(2, 5)
(5, 10)
(10, 10)
(3, 6)
(0, 3)
(6, 6)
(4, 7)
(4, 8)
(4, 9)
(9, 7)
(9, 8)
(9, 9)
(1, 4)
CPU times: user 64 ms, sys: 16 ms, total: 80 ms
Wall time: 569 ms


In [20]:
%%time
paths = edges
all_paths = edges

start = time.time()

for i in range(max_iter):
    print("###########")
    print("NUM e: ", edges.getNumPartitions())
    print("NUM p: ", paths.getNumPartitions())
    paths = compose(paths, edges)
    print("NUM e: ", edges.getNumPartitions())
    print("NUM p: ", paths.getNumPartitions())
    paths.cache()
    # Leave only new paths
    paths = paths.subtract(all_paths, num_partition)
    print("NUM p: ", paths.getNumPartitions())
    # Finish, when no more paths added
    if paths.isEmpty():
        break
    
    # Add new paths to all paths
    print("NUM a: ", all_paths.getNumPartitions())
    all_paths = all_paths.union(paths)
    all_paths = all_paths.partitionBy(num_partition)
    print("NUM a: ", all_paths.getNumPartitions())
    all_paths.cache()
    
    for p in paths.collect():
        print(p)
    end = time.time()
    print("Time elapsed: %f sec" % (end - start,))
    start = end

        
print("$$$$$$$$$$$$$ ALL PATHS $$$$$$$$$$$$$$")
for p in all_paths.collect():
    print(p)
    
print()
end = time.time()
print("Time elapsed: %f sec" % (end - start,))    

###########
NUM e:  2
NUM p:  2
NUM e:  2
NUM p:  4
NUM p:  4
NUM a:  2
NUM a:  4
(5, 9)
(4, 6)
(10, 8)
(2, 4)
(0, 2)
(10, 7)
(9, 6)
(1, 3)
(5, 7)
(6, 10)
(3, 5)
(5, 8)
(10, 9)
Time elapsed: 0.594687 sec
###########
NUM e:  2
NUM p:  4
NUM e:  2
NUM p:  4
NUM p:  4
NUM a:  4
NUM a:  4
(9, 9)
(4, 9)
(5, 10)
(4, 8)
(6, 6)
(10, 10)
(9, 7)
(4, 7)
(1, 4)
(3, 6)
(9, 8)
(2, 5)
(0, 3)
Time elapsed: 0.903518 sec
###########
NUM e:  2
NUM p:  4
NUM e:  2
NUM p:  4
NUM p:  4
NUM a:  4
NUM a:  4
(1, 5)
(4, 10)
(3, 7)
(3, 8)
(2, 6)
(3, 9)
(0, 4)
Time elapsed: 0.882334 sec
###########
NUM e:  2
NUM p:  4
NUM e:  2
NUM p:  4
NUM p:  4
NUM a:  4
NUM a:  4
(2, 8)
(2, 7)
(0, 5)
(1, 6)
(2, 9)
(3, 10)
Time elapsed: 0.953485 sec
###########
NUM e:  2
NUM p:  4
NUM e:  2
NUM p:  4
NUM p:  4
NUM a:  4
NUM a:  4
(0, 6)
(1, 9)
(1, 7)
(2, 10)
(1, 8)
Time elapsed: 0.929409 sec
###########
NUM e:  2
NUM p:  4
NUM e:  2
NUM p:  4
NUM p:  4
NUM a:  4
NUM a:  4
(1, 10)
(0, 9)
(0, 8)
(0, 7)
Time elapsed: 0.941687 sec