# DATASCI W261: Machine Learning at Scale
## Assignment Week 9
Miki Seltzer (miki.seltzer@berkeley.edu)<br>
W261-2, Spring 2016<br>
Submission: 

In [1]:
# We will need these so we can reload modules as we modify them
%load_ext autoreload
%autoreload 2

In [84]:
# Just in case we need a cluster
# Create job flow so that we don't need to keep spinning up clusters
!python -m mrjob.tools.emr.create_job_flow

using configs in /etc/mrjob.conf
using existing scratch bucket mrjob-ac40f1afcc0b86ce
using s3://mrjob-ac40f1afcc0b86ce/tmp/ as our scratch dir on S3
Creating persistent job flow to run several jobs in...
creating tmp directory /tmp/no_script.cloudera.20160313.010339.237936
writing master bootstrap script to /tmp/no_script.cloudera.20160313.010339.237936/b.py
Copying non-input files into s3://mrjob-ac40f1afcc0b86ce/tmp/no_script.cloudera.20160313.010339.237936/files/
Waiting 5.0s for S3 eventual consistency
Creating Elastic MapReduce job flow
Job flow created with ID: j-3D12NC1JXN44F
j-3D12NC1JXN44F


In [85]:
# ID of the persistent EMR job flow reported by the create_job_flow cell above.
# The 'emr' branches of the helper functions below pass it as --emr-job-flow-id.
clusterId = 'j-3D12NC1JXN44F'

# HW9.0

### What is PageRank and what is it used for in the context of web search?
PageRank is a ranking algorithm used by Google. At a high level, it provides a measure of "popularity" of pages, due to the underlying assumption that important websites are likely to have more incoming links than unimportant websites.

### What modifications have to be made to the webgraph in order to leverage the machinery of Markov Chains to compute the steady state distribution?
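We must first make the transition matrix stochastic by giving dangling nodes (which have no out-links) a uniform transition probability of $1/n$ to every node. We must then introduce a "teleportation" factor by scaling the webgraph by $(1-\alpha)$ and adding a factor of $\alpha * (\text{matrix with entries }1/n)$, where $\alpha$ is the teleportation probability (one minus the damping factor). This results in a Markov matrix whose entries are all strictly positive, so the chain is irreducible and aperiodic and, by the Perron-Frobenius theorem, has a unique steady state distribution.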

# HW9.1: MRJob implementation of basic PageRank

Write a basic MRJob implementation of the iterative PageRank algorithm that takes sparse adjacency lists as input (as explored in HW 7). Make sure that your implementation utilizes teleportation ((1 - damping) / the number of nodes in the network), and further, distributes the mass of dangling nodes with each iteration so that the output of each iteration is correctly normalized (sums to 1).

In [2]:
# Utilize last week's HW to count the number of nodes in the graph
from MRJob_Explore import explore

def countNodes(filename):
    """Count the nodes of the graph in `filename` by running the `explore` MRJob.

    The job is run with `--exploreType nodes`; every record it emits is printed
    as "Number of nodes = <count>".

    Args:
        filename: path of the graph file handed to the job.

    Returns:
        The second element of the last record emitted by the job.

    Raises:
        ValueError: if the job emits no records, so there is no count to return.
    """
    mr_job = explore(args=[filename, '--no-strict-protocols', '--exploreType', 'nodes'])

    # Stays None when the job produces no output, so that case can be reported
    # explicitly instead of failing on an unbound local at the return.
    node_count = None

    with mr_job.make_runner() as runner:
        runner.run()

        for line in runner.stream_output():
            node_count = mr_job.parse_output_line(line)[1]
            # Single-argument print works identically under Python 2 and 3.
            print('Number of nodes = {:,d}'.format(node_count))

    if node_count is None:
        raise ValueError('explore job produced no output for ' + str(filename))

    return node_count

## MRJob to initialize the starting PageRank vector with a uniform distribution

This job requires us to do several things:
- Find all nodes in the graph (dangling nodes will not have an explicit neighbors list)
- For each node, maintain graph structure and initialize PageRank to 1/n

In [3]:
%%writefile PageRank_Initialize.py
from __future__ import division
from mrjob.job import MRJob
from mrjob.step import MRStep

class initialize(MRJob):
    """
    Two-step job that prepares an adjacency-list graph for PageRank.

    Step 1 emits exactly one record per node, including nodes that only ever
    appear as a neighbor (these get an empty neighbor dictionary).
    Step 2 counts those records and emits (neighbors, 1/n) for every node.
    """

    #------------------
    # Step 1 mapper:
    # - Input lines are "<node>\t<neighbor dictionary>"
    # - Re-emit the node with its neighbors, then emit a 0 for every neighbor
    #   so that nodes without a line of their own still reach the reducer

    def mapper(self, _, line):
        parts = line.split('\t')
        node = parts[0]

        # NOTE(review): eval() executes whatever expression the input file holds;
        # a literal-only parser would be safer if the data is not fully trusted.
        neighbors = eval(parts[1])

        yield node, neighbors

        for target in neighbors:
            yield target, 0

    #------------------
    # Step 1 reducer:
    # - Collapse all values of a node into a single record
    # - A dictionary value is the node's neighbor list; the 0 placeholders are ignored
    # - A node that never sent a dictionary is dangling and keeps the empty one

    def reducer(self, key, values):
        neighbors = {}

        for candidate in values:
            if type(candidate) is dict:
                neighbors = candidate

        yield key, neighbors

    #------------------
    # Step 2 mapper:
    # - Pass every record through unchanged while counting the records seen
    # - Emit the per-mapper count under the special key '*' once input is exhausted

    def mapper_norm_init(self):
        self.total = 0.0

    def mapper_norm(self, key, value):
        yield key, value
        self.total += 1

    def mapper_norm_final(self):
        yield '*', self.total

    #------------------
    # Step 2 combiner:
    # - Pre-sum the '*' counts; every other record is forwarded untouched

    def combiner_norm(self, key, values):
        if key != '*':
            for item in values:
                yield key, item
        else:
            yield key, sum(values)

    #------------------
    # Step 2 reducer:
    # - '*' carries the node counts; their sum is stored as totalNodes
    # - Every other key is emitted as (neighbors, 1 / totalNodes)
    # NOTE(review): this needs the '*' key to be reduced before any node key;
    # it presumably relies on the single reducer configured in steps() receiving
    # keys in sorted order with '*' first -- confirm for the key set in use.

    def reducer_norm_init(self):
        self.totalNodes = 0

    def reducer_norm(self, key, values):
        if key != '*':
            for neighbors in values:
                yield key, (neighbors, 1 / self.totalNodes)
        else:
            self.totalNodes = sum(values)

    #------------------
    # Multi-step pipeline

    def steps(self):
        collect_nodes = MRStep(mapper=self.mapper,
                               reducer=self.reducer)

        seed_ranks = MRStep(mapper_init=self.mapper_norm_init,
                            mapper=self.mapper_norm,
                            mapper_final=self.mapper_norm_final,
                            combiner=self.combiner_norm,
                            reducer_init=self.reducer_norm_init,
                            reducer=self.reducer_norm,
                            jobconf={'mapred.reduce.tasks': 1})

        return [collect_nodes, seed_ranks]

if __name__ == '__main__':
    initialize.run()

Overwriting PageRank_Initialize.py


In [4]:
from PageRank_Initialize import initialize

def initializePR(filename, runner, outputDir):

    if runner == 'local':
        mr_job = initialize(args=[filename, '--no-strict-protocols'])
        
    elif runner == 'hadoop':
        !hdfs dfs -rm -r {outputDir}
        mr_job = initialize(args=[filename, '--no-strict-protocols', '-r', 'hadoop', '--hadoop-home', '/usr/',
                                 '--output-dir', outputDir])
        
    elif runner == 'emr':
        !aws s3 rm --quiet {outputDir}
        mr_job = initialize(args=[filename, '--no-strict-protocols', '--no-output', 
                                  '-r', 'emr', '--emr-job-flow-id', clusterId, '--output-dir', outputDir])
        

    with mr_job.make_runner() as runner:
        runner.run()

        if runner != 'emr':
            for line in runner.stream_output():
                print mr_job.parse_output_line(line)

In [19]:
# Toy graph used to check the initialize job, and the HDFS directory for its output
inputFile = 'PageRank-test.txt'
outputDir = '/user/miki/week09/initialize'

# Node count of the toy graph; the local iteration cell further down reuses `n`
n = countNodes(inputFile)

# Build the initial (neighbors, 1/n) records on Hadoop and print them
initializePR(inputFile, 'hadoop', outputDir)

Number of nodes = 11
Deleted /user/miki/week09/initialize


The have been translated as follows
 mapred.reduce.tasks: mapreduce.job.reduces


('A', [{}, 0.09090909090909091])
('B', [{'C': 1}, 0.09090909090909091])
('C', [{'B': 1}, 0.09090909090909091])
('D', [{'A': 1, 'B': 1}, 0.09090909090909091])
('E', [{'B': 1, 'D': 1, 'F': 1}, 0.09090909090909091])
('F', [{'B': 1, 'E': 1}, 0.09090909090909091])
('G', [{'B': 1, 'E': 1}, 0.09090909090909091])
('H', [{'B': 1, 'E': 1}, 0.09090909090909091])
('I', [{'B': 1, 'E': 1}, 0.09090909090909091])
('J', [{'E': 1}, 0.09090909090909091])
('K', [{'E': 1}, 0.09090909090909091])


## MRJob to iterate over the PageRank algorithm

In this job we need to do the following:
- Distribute a node's PageRank to its neighbors
- Distribute the mass of dangling nodes, and account for teleporting

#### Distribute node's PageRank to neighbors

For each node we encounter, we have the current PageRank (let's call this $\text{PR}_0(A)$ for node A), and the list of neighbors. We need to divide the PageRank by the number of neighbors, and emit this to each neighbor. If there are no neighbors, we cannot emit the PageRank anywhere, so we accumulate it to a special key, \*dangling. Once we accumulate the distributed PageRank mass in each node, we have a preliminary PageRank for each node, which we can denote as $\text{PR}_1(A)$.

#### Distribute the mass of dangling nodes and account for teleporting

We have accumulated the total PageRank mass that has not yet been distributed, because there were no neighbors to distribute the mass to. We also need to account for teleporting. Given the dangling mass, $m$, the teleportation probability, $\alpha$ (one minus the damping factor), and $n$ nodes, we have the following:

$$
\text{PR}_{final}(A) = \alpha \bigg(\frac{1}{n}\bigg) + (1-\alpha)\bigg(\frac{m}{n} + \text{PR}_1(A)\bigg)
$$

In [130]:
%%writefile PageRank_Iterate.py
from __future__ import division
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import JSONProtocol

class iterate(MRJob):
    """
    One PageRank iteration over records of the form node -> (neighbors, PageRank).

    Step 1 spreads each node's PageRank over its neighbors and collects the mass
    of nodes without neighbors. Step 2 adds teleportation and an equal share of
    that dangling mass to every node.

    NOTE(review): the dangling mass travels from step 1 to step 2 through the
    file 'danglingMass.txt' in the task's working directory. That presumably
    only works when both steps see the same file, and the file keeps its old
    content when an iteration has no dangling node -- confirm before running
    on a cluster.
    """

    # Input lines are JSON key<TAB>JSON value
    INPUT_PROTOCOL = JSONProtocol

    #------------------
    # Command line options:
    # - --numNodes: number of nodes in the graph
    # - --alpha: weight of the teleportation term

    def configure_options(self):
        super(iterate, self).configure_options()
        self.add_passthrough_option('--numNodes', default=1, type='int')
        self.add_passthrough_option('--alpha', default=0.15, type='float')

    #------------------
    # Step 1 mapper:
    # - value is (neighbors, current PageRank)
    # - A node with neighbors sends each of them an equal share of its PageRank
    # - A node without neighbors sends its whole PageRank to '*dangling'
    # - The neighbor dictionary is re-emitted to keep the graph structure

    def mapper_dist(self, key, value):
        neighbors = value[0]
        rank = value[1]
        degree = len(neighbors)

        if degree == 0:
            yield '*dangling', rank
        else:
            share = rank / degree
            for target in neighbors:
                yield target, share

        yield key, neighbors

    #------------------
    # Step 1 reducer:
    # - Float values are PageRank shares and are summed
    # - A dictionary value is the node's neighbor list
    # - The '*dangling' total is written to danglingMass.txt instead of being emitted

    def reducer_dist(self, key, values):
        rank_sum = 0.0
        adjacency = {}

        for item in values:
            kind = type(item)
            if kind is float:
                rank_sum += item
            elif kind is dict:
                adjacency = item

        if key != '*dangling':
            yield key, (adjacency, rank_sum)
        else:
            with open('danglingMass.txt', 'w') as sink:
                sink.write(str(rank_sum))

    #------------------
    # Step 2 mapper:
    # - Load the dangling mass m from danglingMass.txt (the last line read wins)
    # - New PageRank = alpha * (1 / n) + (1 - alpha) * (m / n + accumulated PageRank)

    def mapper_dangle_init(self):
        with open('danglingMass.txt', 'r') as source:
            for row in source:
                self.m = float(row)

    def mapper_dangle(self, key, value):
        alpha = self.options.alpha
        node_count = self.options.numNodes

        teleport = alpha * (1 / node_count)
        carried = (1 - alpha) * (self.m / node_count + value[1])

        yield key, (value[0], teleport + carried)

    #------------------
    # Multi-step pipeline

    def steps(self):
        distribute = MRStep(mapper=self.mapper_dist,
                            reducer=self.reducer_dist)

        redistribute = MRStep(mapper_init=self.mapper_dangle_init,
                              mapper=self.mapper_dangle)

        return [distribute, redistribute]

if __name__ == '__main__':
    iterate.run()

Overwriting PageRank_Iterate.py


In [144]:
from PageRank_Iterate import iterate

def iteratePR(filename, n, a, runnerType, outputDir, iterations):
    
    thisInputDir = filename
    thisOutputDir = outputDir + str(0)
    
    for i in range(iterations):
        
        output = []
        print '\n'
        
        if runnerType == 'local':
            mr_job = iterate(args=[thisInputDir + '.txt', '--no-strict-protocols', '--numNodes=' + str(n), 
                                   '--file', 'danglingMass.txt', '--alpha=' + str(a)])

        elif runnerType == 'hadoop':
            !hdfs dfs -rm -r {thisOutputDir}
            mr_job = iterate(args=[thisInputDir, '--no-strict-protocols', '-r', 'hadoop', '--hadoop-home', '/usr/',
                                   '--output-dir', thisOutputDir, '--numNodes=' + str(n), '--file', 'danglingMass.txt',
                                   '--alpha=' + str(a)])

        elif runnerType == 'emr':
            !aws s3 rm --quiet {thisOutputDir}
            mr_job = iterate(args=[thisInputDir, '--no-strict-protocols', '--no-output', '--numNodes=' + str(n),
                                   '-r', 'emr', '--emr-job-flow-id', clusterId, '--output-dir', thisOutputDir,
                                   '--file', 'danglingMass.txt', '--alpha=' + str(a)])


        with mr_job.make_runner() as runner:
            runner.run()

            if runnerType != 'emr':
                for line in runner.stream_output():
                    out = mr_job.parse_output_line(line)
                    output.append(out)
                    print out
                    
            if runnerType == 'local':
                with open(thisOutputDir + '.txt', 'w') as f:
                    for line in output:
                        f.writelines('"' + line[0] + '"\t' + str(line[1]) + '\n' )
                        
                    
        thisInputDir = outputDir + str(i)
        thisOutputDir = outputDir + str(i + 1)
        

In [147]:
# Local check of the iteration job on the initialized toy graph.
# In 'local' mode iteratePR appends '.txt' to the input name and to each output name.
inputFile = 'PageRank-testInitialized'
outputDir = 'iterations/iteration'

# Number of PageRank iterations to run
k = 2

# `n` is the node count returned by countNodes in an earlier cell
iteratePR(inputFile, n, 0.15, 'local', outputDir, k)



('A', [{}, 0.059297520661157725])
('B', [{'C': 1}, 0.3168732782369153])
('C', [{'B': 1}, 0.09793388429752137])
('D', [{'A': 1, 'B': 1}, 0.04641873278236985])
('E', [{'B': 1, 'D': 1, 'F': 1}, 0.3297520661157031])
('F', [{'B': 1, 'E': 1}, 0.04641873278236985])
('G', [{'B': 1, 'E': 1}, 0.02066115702479409])
('H', [{'B': 1, 'E': 1}, 0.02066115702479409])
('I', [{'B': 1, 'E': 1}, 0.02066115702479409])
('J', [{'E': 1}, 0.02066115702479409])
('K', [{'E': 1}, 0.02066115702479409])


('A', [{}, 0.018218444778365452])


## Initialize the Wikipedia data

In [5]:
# Wikipedia adjacency lists on S3, and the S3 location for the initialized graph
inputFile = 's3://ucb-mids-mls-networks/wikipedia/all-pages-indexed-out.txt'
outputDir = 's3://ms-w261-hw09/wikipedia/initialize'

# Run the initialize job on the EMR job flow identified by clusterId
initializePR(inputFile, 'emr', outputDir)