In [2]:
import time

In [3]:
# Config
file = "final/data/test-graph.txt"
max_iter = 100
num_partition = 2
debug = False

In [4]:
# Load the data
textFile = sc.textFile(file)
print("All lines:", textFile.count())
dataFile = textFile.filter(lambda l: len(l) > 0 and l[0] != '#')
print("Correct lines:", dataFile.count())

All lines: 16
Correct lines: 11


In [5]:
# Create edges RDD
def to_int_tuple(line, delim='\t'):
    strings = line.split(delim)[:2]
    return (int(strings[0]), int(strings[1]))
    
edges = dataFile.map(to_int_tuple)
edges.partitionBy(num_partition)
edges.cache()

# Check that all rows have exactly 2 entries
assert edges.filter(lambda t: len(t) == 2).count() == dataFile.count()

In [6]:
# Helper functions
def switch_key_value(kv):
    " Switch key with value. "
    return (kv[1], kv[0])

def compose(r1, r2):
    """ Compose 2 relations represented by PairRDDs. """
    r1_flip = r1.map(switch_key_value)
    # The key is now the intermediate node
    joined = r1_flip.join(r2)
    return joined.values()

In [6]:
print("############# RDD: method 1 (single steps) ###############")
new_paths = edges
all_paths = edges

start = time.time()
true_start = start

print("NUM e:", edges.rdd.getNumPartitions())
print("NUM n:", new_paths.rdd.getNumPartitions())


# invariant:
###  - all_paths and new_paths are on 'num_partitions' partitions

last_count = all_paths.count()

for i in range(1, max_iter):
    print("________________________________")
    print("Iteration #%d:" % (i,))
    new_paths = compose(new_paths, edges)
    # Leave only really new paths
    all_paths = all_paths.union(new_paths).distinct()
    
    count = all_paths.count()
    diff_count = count - last_count
    last_count = count
    print("Number of new paths: %d\n" % (diff_count,))
    
    if debug:
        print(new_paths.take(1000), '\n')
        
    end = time.time()
    print("Iteration time: %f s." % (end - start,))
    start = end
    
    # Finish, when no more paths added
    if diff_count == 0:
        print("No new paths, finishing...")
        break

print("\n\n________________________________")
print("Total paths found: %d" % (count,))
print("Number of iterations: #%d" % (i,))

if debug:
    print()
    print(all_paths.take(1000), '\n')

true_end = time.time()
method1_time = true_end - true_start
print("\nCollecting time: %f s." % (true_end - start,))
print("Total time elapsed: %f s." % (method1_time,))
print("________________________________\n\n")

############# RDD: method 1 (single steps) ###############
________________________________
Iteration #1:
Number of new paths: 13

Iteration time: 0.923982 s.
________________________________
Iteration #2:
Number of new paths: 13

Iteration time: 1.512735 s.
________________________________
Iteration #3:
Number of new paths: 7

Iteration time: 1.578706 s.
________________________________
Iteration #4:
Number of new paths: 6

Iteration time: 1.849067 s.
________________________________
Iteration #5:
Number of new paths: 5

Iteration time: 2.373683 s.
________________________________
Iteration #6:
Number of new paths: 4

Iteration time: 2.954920 s.
________________________________
Iteration #7:
Number of new paths: 1

Iteration time: 3.491072 s.
________________________________
Iteration #8:
Number of new paths: 0

Iteration time: 4.047994 s.
No new paths, finishing...


________________________________
Total paths found: 60
Number of iterations: #8

Collecting time: 0.000523 s.
Total ti

In [8]:
print("############# RDD: method 2 (single steps + delta) ###############")
new_paths = edges
all_paths = edges

start = time.time()
true_start = start

# invariant:
###  - all_paths and new_paths are on 'num_partitions' partitions

last_count = all_paths.count()

for i in range(1, max_iter):
    print("________________________________")
    print("Iteration #%d:" % (i,))
    new_paths = compose(new_paths, edges)
    # Leave only really new paths
    all_paths = all_paths.union(new_paths).distinct()
    new_paths.cache()
    all_paths.cache()
    
    print("NUM n:", new_paths.getNumPartitions())
    print("NUM a:", all_paths.getNumPartitions())

    
    count = all_paths.count()
    diff_count = count - last_count
    last_count = count
    print("Number of new paths: %d\n" % (diff_count,))
    
    if debug:
        print(new_paths.take(1000), '\n')
        
    end = time.time()
    print("Iteration time: %f s." % (end - start,))
    start = end
    
    # Finish, when no more paths added
    if diff_count == 0:
        print("No new paths, finishing...")
        break

print("\n\n________________________________")
print("Total paths found: %d" % (count,))
print("Number of iterations: #%d" % (i,))

if debug:
    print()
    print(all_paths.take(1000), '\n')

true_end = time.time()
method2_time = true_end - true_start
print("\nCollecting time: %f s." % (true_end - start,))
print("Total time elapsed: %f s." % (method2_time,))
print("________________________________\n\n")

############# RDD: method 2 (single steps + delta) ###############
________________________________
Iteration #1:
NUM n: 4
NUM a: 6
Number of new paths: 13

Iteration time: 1.028370 s.
________________________________
Iteration #2:
NUM n: 6
NUM a: 12
Number of new paths: 13

Iteration time: 1.303429 s.
________________________________
Iteration #3:
NUM n: 8
NUM a: 20
Number of new paths: 7

Iteration time: 1.905363 s.
________________________________
Iteration #4:
NUM n: 10
NUM a: 30
Number of new paths: 6

Iteration time: 2.024997 s.
________________________________
Iteration #5:
NUM n: 12
NUM a: 42
Number of new paths: 5

Iteration time: 2.591574 s.
________________________________
Iteration #6:
NUM n: 14
NUM a: 56
Number of new paths: 4

Iteration time: 2.925538 s.
________________________________
Iteration #7:
NUM n: 16
NUM a: 72
Number of new paths: 1

Iteration time: 3.642651 s.
________________________________
Iteration #8:
NUM n: 18
NUM a: 90
Number of new paths: 0

Iteration t

In [9]:
print("############# RDD: method 3 (paths combining) ###############")
all_paths = edges

start = time.time()
true_start = start

# invariant:
###  - all_paths and new_paths are on 'num_partitions' partitions

last_count = all_paths.count()

for i in range(1, max_iter):
    print("________________________________")
    print("Iteration #%d:" % (i,))
    new_paths = compose(all_paths, all_paths)
    # Leave only really new paths
    all_paths = all_paths.union(new_paths).distinct()
    all_paths.cache()
    
    count = all_paths.count()
    diff_count = count - last_count
    last_count = count
    print("Number of new paths: %d\n" % (diff_count,))
    
    if debug:
        print(new_paths.take(1000), '\n')
        
    end = time.time()
    print("Iteration time: %f s." % (end - start,))
    start = end
    
    # Finish, when no more paths added
    if diff_count == 0:
        print("No new paths, finishing...")
        break
        

print("\n\n________________________________")
print("Total paths found: %d" % (count,))
print("Number of iterations: #%d" % (i,))

if debug:
    print()
    print(all_paths.take(1000), '\n')

true_end = time.time()
method3_time = true_end - true_start
print("\nCollecting time: %f s." % (true_end - start,))
print("Total time elapsed: %f s." % (method3_time,))
print("________________________________\n\n")


############# RDD: method 3 (paths combining) ###############
________________________________
Iteration #1:
Number of new paths: 13

Iteration time: 0.588478 s.
________________________________
Iteration #2:
Number of new paths: 20

Iteration time: 1.326621 s.
________________________________
Iteration #3:
Number of new paths: 16

Iteration time: 3.308323 s.
________________________________
Iteration #4:
Number of new paths: 0

Iteration time: 10.356674 s.
No new paths, finishing...


________________________________
Total paths found: 60
Number of iterations: #4

Collecting time: 0.000388 s.
Total time elapsed: 15.580484 s.
________________________________




In [15]:
print("########### Summary ############")
print("Method 1: %f s." % (method1_time,))
print("Method 2: %f s." % (method2_time,))
print("Method 3: %f s." % (method3_time,))

########### Summary ############
Method 1: 8.287740 s.
Method 2: 6.018360 s.
Method 3: 1.567246 s.
