In [2]:
import time
from pyspark.sql.functions import *

In [3]:
# Config: every tunable knob of the notebook lives in this cell.

# Input: tab-separated edge list, one "<from>\t<to>" pair per line.
file = "final/data/test-graph.txt"

# Number of partitions the edge data is meant to be spread over.
num_partition = 2

# Upper bound used by the iteration loops of the closure methods.
max_iter = 100

# When True, the loops additionally print samples of the computed paths.
debug = False

In [4]:
# Load the data: read the raw text file, then keep only the lines that are
# neither empty nor comments (comment lines start with '#').
textFile = sc.textFile(file)
total_lines = textFile.count()
print("All lines:", total_lines)

dataFile = textFile.filter(lambda line: line != '' and not line.startswith('#'))
correct_lines = dataFile.count()
print("Correct lines:", correct_lines)

All lines: 16
Correct lines: 11


In [5]:
# Create edges RDD
def to_int_tuple(line, delim='\t'):
    """Parse the first two ``delim``-separated fields of ``line`` as integers.

    Any fields after the second one are ignored.

    Args:
        line: One text line of the edge list.
        delim: Field separator; a tab by default.

    Returns:
        A 2-tuple ``(int(first_field), int(second_field))``.

    Raises:
        IndexError: If the line has fewer than two fields.
        ValueError: If one of the two fields is not a valid integer literal.
    """
    fields = line.split(delim)
    head, tail = fields[0], fields[1]
    return (int(head), int(tail))
    
# Parse every data line into a (from, to) pair of ints.
# RDDs are immutable: partitionBy() returns a *new* RDD, so its result has to be
# kept. Previously the return value was thrown away and the RDD was never
# repartitioned. partitionBy() distributes the pairs by their key (the 'from' node).
edgesRDD = dataFile.map(to_int_tuple).partitionBy(num_partition)
edgesRDD.cache()

# Check that all rows have exactly 2 entries.
# to_int_tuple() either returns a 2-tuple or raises, so the length filter itself
# cannot reject anything; the value of this check is that count() forces every
# line to be parsed (surfacing malformed lines here) and that no row is lost.
assert edgesRDD.filter(lambda t: len(t) == 2).count() == dataFile.count()

In [6]:
# Turn the pair RDD into a two-column DataFrame ("fro" -> "to") and mark it for
# caching; cache() hands back the same DataFrame, so it can be chained.
edges = edgesRDD.toDF(["fro", "to"]).cache()
edges

DataFrame[fro: bigint, to: bigint]

In [7]:
def compose(df1, df2):
    """Compose two relations held as DataFrames with columns ``fro`` and ``to``.

    Rows of ``df1`` are joined with rows of ``df2`` wherever ``df1.to`` equals
    ``df2.fro``; the two join columns are then dropped, leaving ``fro`` taken
    from ``df1`` and ``to`` taken from ``df2``.

    Args:
        df1: Left relation (the first step of each composed pair).
        df2: Right relation (the second step of each composed pair).

    Returns:
        The joined DataFrame. Duplicate rows are not removed here.
    """
    left = df1.alias('r1')
    right = df2.alias('r2')
    # Aliases keep the column references unambiguous even when df1 and df2
    # are derived from the same DataFrame.
    joined = left.join(right, col('r1.to') == col('r2.fro'))
    without_left_to = joined.drop(col('r1.to'))
    return without_left_to.drop(col('r2.fro'))

In [9]:
# Method 1: naive transitive closure. Every iteration extends *all* known paths
# by one edge and merges the result back, until an iteration adds nothing.
# The banner used to say "RDD", but this cell works on DataFrames throughout.
print("############# DF: method 1 (single steps) ###############")
new_paths = edges
all_paths = edges

start = time.time()
true_start = start

# invariant:
###  - all_paths is on 'num_partition' partitions after every iteration

last_count = all_paths.count()

# Defaults for the report below, so it still works when the loop body never
# runs (max_iter < 1); previously 'count' and 'i' would have been unbound.
count = last_count
i = 0

# NOTE(review): all_paths is never cached or checkpointed, so each count()
# presumably re-evaluates an ever longer lineage, which would explain the growing
# iteration times. Consider caching if that matters for the comparison.

# range() excludes its upper bound, so '+ 1' is needed to allow max_iter iterations.
for i in range(1, max_iter + 1):
    print("________________________________")
    print("Iteration #%d:" % (i,))
    new_paths = compose(all_paths, edges)
    # Merge with the known paths and drop duplicates. coalesce() restores the
    # partition invariant stated above (method 3 does the same).
    all_paths = all_paths.union(new_paths).distinct().coalesce(num_partition)

    count = all_paths.count()
    diff_count = count - last_count
    last_count = count
    print("Number of new paths: %d\n" % (diff_count,))

    if debug:
        print(new_paths.take(1000), '\n')

    end = time.time()
    print("Iteration time: %f s." % (end - start,))
    start = end

    # Finish, when no more paths added
    if diff_count == 0:
        print("No new paths, finishing...")
        break

print("\n\n________________________________")
print("Total paths found: %d" % (count,))
print("Number of iterations: #%d" % (i,))

if debug:
    print()
    print(all_paths.take(1000), '\n')

true_end = time.time()
method1_time = true_end - true_start
# 'start' now holds the end of the last iteration, so this is the time spent
# after the loop.
print("\nCollecting time: %f s." % (true_end - start,))
print("Total time elapsed: %f s." % (method1_time,))
print("________________________________\n\n")

############# RDD: method 1 (single steps) ###############
________________________________
Iteration #1:
Number of new paths: 13

Iteration time: 0.611742 s.
________________________________
Iteration #2:
Number of new paths: 13

Iteration time: 1.938486 s.
________________________________
Iteration #3:
Number of new paths: 7

Iteration time: 3.846156 s.
________________________________
Iteration #4:
Number of new paths: 6

Iteration time: 6.230818 s.
________________________________
Iteration #5:
Number of new paths: 5

Iteration time: 7.955049 s.
________________________________
Iteration #6:
Number of new paths: 4

Iteration time: 11.401214 s.
________________________________
Iteration #7:
Number of new paths: 1

Iteration time: 16.360150 s.
________________________________
Iteration #8:
Number of new paths: 0

Iteration time: 19.712316 s.
No new paths, finishing...


________________________________
Total paths found: 60
Number of iterations: #8

Collecting time: 0.000618 s.
Total

In [10]:
# Method 2: only the paths produced by the previous iteration are extended by
# one edge, and the (small) edge relation is joined with a broadcast hint.
print("############# DF: method 2 (single steps + delta + broadcast) ###############")
new_paths = edges
all_paths = edges

# broadcast() does not modify 'edges'; it returns a DataFrame carrying the
# broadcast hint. The result must be kept and used in the join. Previously the
# return value was discarded, so no hint was ever applied.
edges_bc = broadcast(edges)

start = time.time()
true_start = start

print("NUM e:", edges.rdd.getNumPartitions())
print("NUM n:", new_paths.rdd.getNumPartitions())

# invariant:
###  - all_paths is on 'num_partition' partitions after every iteration

last_count = all_paths.count()

# Defaults for the report below, so it still works when the loop body never
# runs (max_iter < 1); previously 'count' and 'i' would have been unbound.
count = last_count
i = 0

# range() excludes its upper bound, so '+ 1' is needed to allow max_iter iterations.
for i in range(1, max_iter + 1):
    print("________________________________")
    print("Iteration #%d:" % (i,))
    # Extend only the paths found in the previous iteration, joining against
    # the broadcast-hinted edges.
    new_paths = compose(new_paths, edges_bc)
    # Merge with the known paths and drop duplicates. coalesce() restores the
    # partition invariant stated above (method 3 does the same).
    all_paths = all_paths.union(new_paths).distinct().coalesce(num_partition)

    count = all_paths.count()
    diff_count = count - last_count
    last_count = count
    print("Number of new paths: %d\n" % (diff_count,))

    if debug:
        print(new_paths.take(1000), '\n')

    end = time.time()
    print("Iteration time: %f s." % (end - start,))
    start = end

    # Finish, when no more paths added
    if diff_count == 0:
        print("No new paths, finishing...")
        break

print("\n\n________________________________")
print("Total paths found: %d" % (count,))
print("Number of iterations: #%d" % (i,))

if debug:
    print()
    print(all_paths.take(1000), '\n')

true_end = time.time()
method2_time = true_end - true_start
# 'start' now holds the end of the last iteration, so this is the time spent
# after the loop.
print("\nCollecting time: %f s." % (true_end - start,))
print("Total time elapsed: %f s." % (method2_time,))
print("________________________________\n\n")

############# DF: method 2 (single steps + delta + broadcast) ###############
NUM e: 2
NUM n: 2
________________________________
Iteration #1:
Number of new paths: 13

Iteration time: 1.067346 s.
________________________________
Iteration #2:
Number of new paths: 13

Iteration time: 1.747731 s.
________________________________
Iteration #3:
Number of new paths: 7

Iteration time: 1.425187 s.
________________________________
Iteration #4:
Number of new paths: 6

Iteration time: 2.132767 s.
________________________________
Iteration #5:
Number of new paths: 5

Iteration time: 3.154470 s.
________________________________
Iteration #6:
Number of new paths: 4

Iteration time: 4.883002 s.
________________________________
Iteration #7:
Number of new paths: 1

Iteration time: 4.492604 s.
________________________________
Iteration #8:
Number of new paths: 0

Iteration time: 5.697312 s.
No new paths, finishing...


________________________________
Total paths found: 60
Number of iterations: #8



In [8]:
# Method 3: path combining. Every iteration composes the set of known paths
# with itself instead of extending it by a single edge.
# The banner used to say "RDD", but this cell works on DataFrames throughout.
print("############# DF: method 3 (paths combining) ###############")
new_paths = edges
all_paths = edges

start = time.time()
true_start = start

# invariant:
###  - all_paths is on 'num_partition' partitions after every iteration

last_count = all_paths.count()

# Defaults for the report below, so it still works when the loop body never
# runs (max_iter < 1); previously 'count' and 'i' would have been unbound.
count = last_count
i = 0

# range() excludes its upper bound, so '+ 1' is needed to allow max_iter iterations.
for i in range(1, max_iter + 1):
    print("________________________________")
    print("Iteration #%d:" % (i,))
    # Join the known paths with themselves.
    new_paths = compose(all_paths, all_paths)
    # Merge with the known paths, drop duplicates and restore the partition count.
    all_paths = all_paths.union(new_paths).distinct().coalesce(num_partition)

    count = all_paths.count()
    diff_count = count - last_count
    last_count = count
    print("Number of new paths: %d\n" % (diff_count,))

    if debug:
        print(new_paths.take(1000), '\n')

    end = time.time()
    print("Iteration time: %f s." % (end - start,))
    start = end

    # Finish, when no more paths added
    if diff_count == 0:
        print("No new paths, finishing...")
        break


print("\n\n________________________________")
print("Total paths found: %d" % (count,))
print("Number of iterations: #%d" % (i,))

if debug:
    print()
    print(all_paths.take(1000), '\n')

true_end = time.time()
method3_time = true_end - true_start
# 'start' now holds the end of the last iteration, so this is the time spent
# after the loop.
print("\nCollecting time: %f s." % (true_end - start,))
print("Total time elapsed: %f s." % (method3_time,))
print("________________________________\n\n")

############# RDD: method 3 (paths combining) ###############
________________________________
Iteration #1:
Number of new paths: 13

Iteration time: 2.571445 s.
________________________________
Iteration #2:
Number of new paths: 20

Iteration time: 1.738402 s.
________________________________
Iteration #3:
Number of new paths: 16

Iteration time: 3.878524 s.
________________________________
Iteration #4:
Number of new paths: 0

Iteration time: 7.332047 s.
No new paths, finishing...


________________________________
Total paths found: 60
Number of iterations: #4

Collecting time: 0.000642 s.
Total time elapsed: 15.521059 s.
________________________________




In [None]:
# Total wall-clock time of each method, as measured in the cells above.
print("########### Summary ############")
print("Method 1: %f s." % (method1_time,))
print("Method 2: %f s." % (method2_time,))
# This line was mislabelled "Method 2" although it reports method3_time.
print("Method 3: %f s." % (method3_time,))