# DATASCI W261: Machine Learning at Scale
## Assignment Week 7
Miki Seltzer (miki.seltzer@berkeley.edu)<br>
W261-2, Spring 2016<br>
Submission: 

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the MRJob files written by the %%writefile cells below) are reloaded
# as we modify them, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# HW 7.0: Shortest path graph distances (toy networks)

In this part of your assignment you will develop the base of your code for the week.

Write MRJob classes to find shortest path graph distances, as described in the lectures. In addition to finding the distances, your code should also output a distance-minimizing path between the source and target. Work locally for this part of the assignment, and use both of the undirected and directed toy networks.

![Toy networks](toy_graphs.png)

To prove that your code works, run the following jobs:

- shortest path in the undirected network from node 1 to node 4
Solution: 1,5,4 

- shortest path in the directed network from node 1 to node 5
Solution: 1,2,4,5

and report your output -- make sure it is correct!

## Initialize graph structure

We have the graph encoded as an adjacency list, but we also need to keep track of the following state for each node in each iteration:
- shortest distance from start node
- node state (unvisited, queued, visited)
- path taken to get to node

In [76]:
%%writefile MRJob_Initiate.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import sys

class initiate(MRJob):
    """Attach initial search state to every row of an adjacency-list file.

    Input lines have the form ``<node>\\t<adjacency>``. Each output record is
    ``node -> [neighbors, distance, status, path]`` where:

    - the start node gets distance 0, status 'Q' (queued) and path [node];
    - every other node gets distance sys.maxint, status 'U' (unvisited)
      and an empty path.
    """

    def configure_options(self):
        """Register the --startNode option so the source node can be chosen per run."""
        super(initiate, self).configure_options()
        self.add_passthrough_option('--startNode', default='1')

    def mapper(self, _, line):
        """Emit one state record for the node described by ``line``."""
        fields = line.strip().split('\t')
        name = fields[0]
        # NOTE(review): eval() executes whatever expression is stored in the
        # input file. Only run this on trusted graph files; consider
        # ast.literal_eval if the adjacency data is always a plain literal.
        neighbors = eval(fields[1])
        if name == self.options.startNode:
            yield name, [neighbors, 0, 'Q', [name]]
        else:
            yield name, [neighbors, sys.maxint, 'U', []]


if __name__ == '__main__':
    # Run this module's own job class. The original called shortestPath.run(),
    # a name that is never defined in this file (NameError when run as a script).
    initiate.run()

Overwriting MRJob_Initiate.py


## Iterate through graph structure

In each mapper iteration:
- Expand each node that is in queued state, then mark that node as visited

In each reducer iteration:
- If any record for a node has a visited state, emit the visited record
- Otherwise, if any record for the node is queued, merge it with the node's own record (to keep its adjacency list) and emit the node as queued
- If the record is truly unvisited, emit the unvisited node

In [115]:
%%writefile MRJob_ShortestPath.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import sys

class shortestPath(MRJob):
    """One breadth-first-search iteration over a graph that carries search state.

    Every record has the form ``node -> [neighbors, distance, status, path]``:

    - neighbors: the node's adjacency structure, or None on records that were
      emitted only to queue a neighbor (they carry no adjacency information)
    - distance:  best known number of hops from the start node
    - status:    'U' (unvisited), 'Q' (queued) or 'V' (visited)
    - path:      list of node ids from the start node to this node
    """

    def mapper(self, _, line):
        """Expand the frontier.

        A queued node is re-emitted as visited, and each of its neighbors is
        emitted as a queued frontier record one hop further away. Any other
        node is passed through unchanged.
        """
        # Split text to get our data
        fields = line.strip().split('\t')
        name = fields[0]
        # NOTE(review): eval() executes whatever expression is stored in the
        # state file. Only run this on files produced by the driver itself.
        value = eval(fields[1])
        neighbors = value[0]
        distance = int(value[1])
        status = value[2]
        path = value[3]

        if status == 'Q':
            # This node is now finalized
            yield name, [neighbors, distance, 'V', path]
            if neighbors:
                for node in neighbors:
                    # Copy so each neighbor gets its own extended path
                    temp_path = list(path)
                    temp_path.append(node)
                    # neighbors=None marks this as a frontier record
                    yield node, [None, distance + 1, 'Q', temp_path]
        else:
            yield name, [neighbors, distance, status, path]

    def reducer(self, key, values):
        """Merge all records for one node into a single state record.

        - A visited record is final: emit it as-is and ignore the rest.
        - The node's own record (adjacency present, possibly empty)
          contributes the adjacency and, unless the node is already queued,
          the status.
        - A frontier record (adjacency is None) queues the node and supplies
          a candidate path.
        """
        neighbors = None
        distance = sys.maxint
        status = None
        path = []

        for val in values:

            # We've hit a visited node. Its state is final, so stop here.
            if val[2] == 'V':
                neighbors = val[0]
                distance = val[1]
                status = val[2]
                path = val[3]
                break

            # The node's own record. Test "is not None" rather than truthiness:
            # a sink node has an empty adjacency, and treating its own record
            # as a frontier record would overwrite a queued status and path.
            elif val[0] is not None:
                neighbors = val[0]
                if status != 'Q':
                    status = val[2]

            # A frontier record. Keep the path that belongs to the smallest
            # distance; on ties the later record wins.
            else:
                status = val[2]
                if val[1] <= distance:
                    path = val[3]

            # Update minimum distance if necessary
            distance = min(distance, val[1])

        yield key, [neighbors, distance, status, path]


if __name__ == '__main__':
    shortestPath.run()

Overwriting MRJob_ShortestPath.py


In [122]:
from MRJob_Initiate import initiate
from MRJob_ShortestPath import shortestPath

def findShortestPath(filename, startNode, endNode):

    temp_filename = filename.replace('.', '_temp.')
    
    # Initiate graph adjacency list to track state
    mr_job_init = initiate(args=[filename, '--no-strict-protocols', '--startNode', startNode])
    outInit = []
    
    with mr_job_init.make_runner() as runner:
        runner.run()
        
        for line in runner.stream_output():
            out = mr_job_init.parse_output_line(line)
            outInit.append(out)
            
        with open(temp_filename, 'w') as outfile:
            outfile.writelines(str(x[0]) + '\t' + str(x[1]) + '\n' for x in outInit)
    
    # Iterate over the adjacency list with state until all nodes are visited
    mr_job = shortestPath(args=[temp_filename, '--no-strict-protocols'])
    allVisited = False
    
    while not allVisited:
        output = []
        numQueued = 0

        with mr_job.make_runner() as runner: 
            # Run MRJob
            runner.run()

            # Write stream_output to file
            for line in runner.stream_output():
                out = mr_job.parse_output_line(line)
                output.append(out)
                if out[1][2] == 'Q':
                    numQueued += 1
        
        # Write the new states to file
        with open(temp_filename, 'w') as outfile:
            outfile.writelines(str(x[0]) + '\t' + str(x[1]) + '\n' for x in output)
        
        # Are we done yet?
        if numQueued == 0:
            allVisited = True
            for node in output:
                if node[0] == endNode:
                    print 'Start node =', startNode, 'and end node =', endNode
                    print node[1][3]

filename = 'undirected_toy.txt'
print 'Shortest path in', filename
findShortestPath(filename, '1', '4')
filename = 'directed_toy.txt'
print '\nShortest path in', filename
findShortestPath('directed_toy.txt', '1', '5')

Shortest path in undirected_toy.txt
Start node = 1 and end node = 4
['1', '5', '4']

Shortest path in directed_toy.txt
Start node = 1 and end node = 5
['1', '2', '4', '5']
