#DATASCI W261: Machine Learning at Scale

## Problem: Find all pages (aka Vroots) with more than 400 visits

Complete code in MrJob class file 

In [1]:
%%writefile TopPages.py
"""Find Vroots with more than 400 visits.

This program will take a CSV data file and output tab-separated lines of

    Vroot -> number of visits

To run:

    python TopPages.py anonymous-msweb.data

To store output:

    python TopPages.py anonymous-msweb.data > top_pages.out
"""

from mrjob.job import MRJob
import csv

def csv_readline(line):
    """Parse a single CSV-formatted string into its list of fields.

    The line is wrapped in a one-element list so ``csv.reader`` can consume
    it; only the first parsed row is returned. ``None`` is returned if the
    reader produces no row at all.
    """
    parsed_rows = csv.reader([line])
    return next(parsed_rows, None)

class TopPages(MRJob):
    """MRJob job that totals visits per Vroot and keeps the popular ones.

    Each input line is parsed as CSV. Only rows whose first field is ``'V'``
    are counted, keyed by their second field (the Vroot id). A Vroot is
    emitted as ``(vroot, total)`` when its total exceeds ``MIN_VISITS``.
    """

    # A Vroot is reported only when its total is strictly greater than this.
    MIN_VISITS = 400

    def mapper(self, line_no, line):
        """Emit ``(vroot, 1)`` for every ``'V'`` record.

        ``line_no`` is the input key and is not used. ``line`` is one raw
        CSV line of the data file.
        """
        cell = csv_readline(line)
        # Blank or truncated lines parse to fewer than two fields; skip them
        # rather than letting an IndexError abort the whole job.
        if cell and len(cell) > 1 and cell[0] == 'V':
            yield cell[1], 1

    def combiner(self, vroot, visit_counts):
        """Pre-sum the counts for ``vroot`` before they reach the reducer.

        Addition is associative, so partial sums leave the reducer's totals
        unchanged while shrinking the number of pairs it has to process.
        """
        yield vroot, sum(visit_counts)

    def reducer(self, vroot, visit_counts):
        """Sum the visit counts for ``vroot``.

        Yields ``(vroot, total)`` only when ``total`` is strictly greater
        than ``MIN_VISITS``.
        """
        total = sum(visit_counts)
        if total > self.MIN_VISITS:
            yield vroot, total
        
# Run the job when this file is executed as a script (see "To run" in the
# module docstring); importing the module, as the driver does, skips this.
if __name__ == '__main__':
    TopPages.run()


Overwriting TopPages.py


Driver code is ready

In [2]:
from TopPages import TopPages
import csv

# Run the job from this notebook on the data file and print every
# (vroot, total) pair the job emits.
mr_job = TopPages(args=['anonymous-msweb.data'])
with mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        # A parenthesised single argument prints the same under the Python 2
        # print statement and is also valid as the Python 3 print function.
        print(mr_job.parse_output_line(line))

(u'1000', 912)
(u'1001', 4451)
(u'1002', 749)
(u'1003', 2968)
(u'1004', 8463)
(u'1007', 865)
(u'1008', 10836)
(u'1009', 4628)
(u'1010', 698)
(u'1014', 728)
(u'1017', 5108)
(u'1018', 5330)
(u'1020', 1087)
(u'1024', 521)
(u'1025', 2123)
(u'1026', 3220)
(u'1027', 507)
(u'1030', 1115)
(u'1031', 574)
(u'1032', 1446)
(u'1034', 9383)
(u'1035', 1791)
(u'1036', 759)
(u'1037', 1160)
(u'1038', 1110)
(u'1040', 1506)
(u'1041', 1500)
(u'1045', 474)
(u'1046', 636)
(u'1052', 842)
(u'1053', 670)
(u'1058', 672)
(u'1067', 548)
(u'1070', 602)
(u'1074', 584)
(u'1076', 444)
(u'1078', 462)
(u'1295', 716)
