#DATASCI W261: Machine Learning at Scale

## This is an example of a reducer-side right (outer) join

##Write two tables into files

In [1]:
%%writefile customers.dat
1|Alice Bob|31|CA
2|Sam Sneed|51|NV
3|Jon Sneed|37|CA
4|Arnold Wesise|17|NY
5|Henry Bob|25|NV
6|Yo Yo Ma|37|NY
7|Jon York|41|WA
8|Alex Ball|26|WA
9|Jim Davis|19|CA

Overwriting customers.dat


In [2]:
%%writefile orders.dat
1|Apple
3|Garlic
2|Milk
1|Iphone
4|Ipad
5|Book
7|Potato
8|Tomato
9|Orange
5|shoes

Overwriting orders.dat


##MrJob class for ReducerSideRightJoin

In [1]:
%%writefile reducersiderightjoin.py
from mrjob.job import MRJob
from mrjob.step import MRJobStep
from mrjob.compat import get_jobconf_value
 
class rightjoin(MRJob):
    """Reducer-side right outer join of two pipe-delimited tables.

    Both tables are read from the same input stream and told apart by their
    field count: a line with exactly four fields is tagged as the left table,
    any other line with at least two fields is tagged as the right table.
    Every right-table record is emitted; when no left-table record shares its
    key, the three left-table columns are filled with None.
    """

    def mapper(self, _, line):
        """Tag each input line with its source table and key it by field 0.

        Yields (key, ("lefttable", f1, f2, f3)) for four-field lines and
        (key, ("righttable", f1)) for other lines with two or more fields.
        Lines with fewer than two fields (e.g. blank lines) are skipped,
        since they have no payload to join and would otherwise raise
        IndexError.
        """
        fields = line.split("|")
        if len(fields) == 4:
            yield fields[0], ("lefttable", fields[1], fields[2], fields[3])
        elif len(fields) >= 2:
            yield fields[0], ("righttable", fields[1])

    def reducer(self, key, values):
        """Join all left-table and right-table records that share `key`.

        Yields (None, [key, left columns..., right columns...]) once per
        (right record, left record) pair, or once per right record with
        None for the three left columns when the key has no left record.
        """
        customers = []
        orders = []
        for val in values:
            # Copy to a list so the `+` concatenation below works whether the
            # value arrives as a tuple or a list (presumably a list after
            # mrjob's serialization between steps -- confirm for your protocol).
            record = list(val)
            if record[0] == u'lefttable':
                customers.append(record)
            else:
                orders.append(record)
        for o in orders:
            if not customers:
                # Right-join semantics: keep the unmatched right-table row.
                yield None, [key] + [None, None, None] + o[1:]
            for c in customers:
                yield None, [key] + c[1:] + o[1:]

if __name__ == '__main__':
    rightjoin.run()

Writing reducersiderightjoin.py


##Run the code through python driver

####  Reminder: You cannot use the programmatic runner functionality in the same file as your job class. That is because the file with the job class is sent to Hadoop to be run. Therefore, the job file cannot attempt to start the Hadoop job, or you would be recursively creating Hadoop jobs!

Use make_runner() to run an MRJob
1. Separate the driver from the MapReduce job.
2. Now we can run it from within a Python notebook.
3. In Python, each class typically lives in its own file. Each mrjob job is a separate class and should be in a separate file.

In [1]:
from reducersiderightjoin import rightjoin
mr_job = rightjoin(args=['customers.dat','orders.dat'])
# Drive the job through a runner; the job class itself lives in its own file.
with mr_job.make_runner() as runner:
    runner.run()
    count = 0
    # Read the job's output lines after run() and decode each one.
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        print(value)
        count += 1
print("\n")
print("There are %s records" % count)

[u'1', u'Alice Bob', u'31', u'CA', u'Apple']
[u'1', u'Alice Bob', u'31', u'CA', u'Iphone']
[u'2', u'Sam Sneed', u'51', u'NV', u'Milk']
[u'3', u'Jon Sneed', u'37', u'CA', u'Garlic']
[u'4', u'Arnold Wesise', u'17', u'NY', u'Ipad']
[u'5', u'Henry Bob', u'25', u'NV', u'Book']
[u'5', u'Henry Bob', u'25', u'NV', u'shoes']
[u'7', u'Jon York', u'41', u'WA', u'Potato']
[u'8', u'Alex Ball', u'26', u'WA', u'Tomato']
[u'9', u'Jim Davis', u'19', u'CA', u'Orange']


There are 10 records
