## MRJob Tests

In [9]:
%load_ext autoreload
%autoreload 2

### Secondary Sort
Secondary sort using the 3rd key field, in reverse (descending) order.

In [37]:
%%writefile test.data
4,10,3,Apple
2,2,4,Orange
6,-1,6,Lemon
0,9,18,Apple
6,8,7,Lemon
6,199,20,Lemon
6,-9,2,Lemon
6,-1,10,Lemon
6,-9223372036854775808,43,Orange

Overwriting test.data


- The following `jobconf` is currently commented out (it is not used by the job below):
jobconf={
    "stream.num.map.output.key.fields":"3",
    "mapreduce.job.output.key.comparator.class":
        "org.apache.hadoop.mapred.lib.KeyFieldBasedComparator",
    "mapreduce.partition.keycomparator.options":"-k1,1n -k3,3nr",
      }

In [41]:
%%writefile test.py

from mrjob.job import MRJob, MRStep
import mrjob
import csv

import sys
def toStringKey(n):
    """Encode an integer as a fixed-width string that sorts in numeric order.

    Non-negative values are zero-padded to the width of the largest machine
    integer. Negative values are written as "-" followed by the zero-padded
    distance from the smallest machine integer, so a larger (less negative)
    number yields a larger string. Because "-" compares lower than any digit,
    every negative key sorts before every non-negative key.

    Args:
        n: An int, or anything ``int()`` accepts (e.g. a numeric string).

    Returns:
        The sortable string key for ``n``.

    Raises:
        ValueError: If ``n`` cannot be parsed as an integer, or lies outside
            the machine-integer range, where the fixed-width encoding would
            no longer preserve ordering.
    """
    n = int(n)
    # sys.maxint exists only on Python 2; sys.maxsize is the Python 3 fallback.
    maxInt = getattr(sys, "maxint", sys.maxsize)
    minInt = -maxInt - 1

    # Outside this range the padded width (or the offset) breaks the ordering.
    if n < minInt or n > maxInt:
        raise ValueError(
            "toStringKey: %d is outside the supported range [%d, %d]"
            % (n, minInt, maxInt))

    digits = len(str(maxInt))

    if n < 0:
        # Distance from minInt: 0 for minInt itself, maxInt for -1.
        key = "-" + str(n - minInt).zfill(digits)
    else:
        key = str(n).zfill(digits)

    return key
    
class test(MRJob):
    """Single-step job that groups CSV rows by their first field.

    Each emitted value is the row's remaining fields, prefixed with a
    sortable string form of the second field (see ``toStringKey``).
    """

    # mrjob setting asking that each key's values reach the reducer sorted.
    SORT_VALUES = True

    def mapper1(self, line_no, line):
        """Split a comma-separated line and emit (first field, value list).

        The value list is ``[toStringKey(second field)]`` followed by every
        field after the first, unchanged.
        """
        fields = line.strip().split(',')
        group = fields[0]
        sortable = toStringKey(fields[1])
        yield group, [sortable] + fields[1:]

    def reducer1(self, key, value):
        """Emit the key once, with all of its values collected into a list."""
        yield key, list(value)

    def steps(self):
        """Declare the job's single map/reduce step."""
        only_step = MRStep(mapper=self.mapper1, reducer=self.reducer1)
        return [only_step]
    
# Start the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    test.run()


Overwriting test.py


In [40]:
from test import test

# Build the job over test.data with the 'inline' runner selected.
mr_job = test(args=['test.data', '-r', 'inline', '--no-strict-protocols'])
with mr_job.make_runner() as runner:
    runner.run()
    # Parenthesized single-argument print gives the same output on Python 2
    # and also parses on Python 3.
    print("Output:")
    # Each streamed line is printed as-is (raw, unparsed job output).
    for line in runner.stream_output():
        print(line)



Output:
"0"	[["0000000000000000009", "9", "18", "Apple"]]

"2"	[["0000000000000000002", "2", "4", "Orange"]]

"4"	[["0000000000000000010", "10", "3", "Apple"]]

"6"	[["-0000000000000000000", "-9223372036854775808", "43", "Orange"], ["-9223372036854775799", "-9", "2", "Lemon"], ["-9223372036854775807", "-1", "10", "Lemon"], ["-9223372036854775807", "-1", "6", "Lemon"], ["0000000000000000008", "8", "7", "Lemon"], ["0000000000000000199", "199", "20", "Lemon"]]



### Does the input line passed to the mapper have a terminating newline?
Answer: No.

### How about passing a list inside a tuple as values?

In [19]:
%%writefile test2.py

from mrjob.job import MRJob, MRStep
import mrjob
import csv
import sys

class test2(MRJob):
    """Single-step job checking that a list nested in a tuple works as a value.

    The mapper emits ``(list, int)`` tuples; the reducer reads them back by
    index only, so it works whether the value arrives as a tuple or a list.
    """

    def mapper1(self, line_no, line):
        """Emit (first CSV field, (fixed 3-item list, number of fields))."""
        fields = line.split(',')
        v = ["a","b","c"]
        yield fields[0], (v, len(fields))

    def reducer1(self, key, values):
        """For each value, emit the key with the list joined by '#' and the count."""
        for v in values:
            # v[0] is the list from the mapper, v[1] the field count.
            yield key, ("#".join(v[0]), v[1])

    def steps(self):
        """Declare the job's single map/reduce step."""
        return [
            MRStep(mapper=self.mapper1,
                   reducer=self.reducer1)
            ]

    
# Start the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    test2.run()


Overwriting test2.py


In [20]:
from test2 import test2

# Build the job over test.data with the 'local' runner selected.
mr_job = test2(args=['test.data', '-r', 'local'])
with mr_job.make_runner() as runner:
    runner.run()
    # Parenthesized single-argument print gives the same output on Python 2
    # and also parses on Python 3.
    print("Output:")
    for line in runner.stream_output():
        # Print the result of parsing each raw output line.
        print(mr_job.parse_output_line(line))

ERROR:mrjob.local:STDERR: + __mrjob_PWD=/private/var/folders/dm/nsw7wjf91f1c74hgl17ldw040000gn/T/test2.patrickng.20160210.130421.417554/job_local_dir/0/mapper/0
ERROR:mrjob.local:STDERR: + exec
ERROR:mrjob.local:STDERR: + /usr/bin/python -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'
ERROR:mrjob.local:STDERR: + export PYTHONPATH=/private/var/folders/dm/nsw7wjf91f1c74hgl17ldw040000gn/T/test2.patrickng.20160210.130421.417554/job_local_dir/0/mapper/0/mrjob.tar.gz:/Users/patrickng/Programs/spark-1.5.1-bin-hadoop2.6/python/pyspark:/Users/patrickng/Programs/spark-1.5.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip:/Users/patrickng/Programs/spark-1.5.1-bin-hadoop2.6/python:/Users/patrickng/Programs/spark-1.5.1-bin-hadoop2.6/python/build::/Library/Python/2.7/site-packages
ERROR:mrjob.local:STDERR: + PYTHONPATH=/private/var/folders/dm/nsw7wjf91f1c74hgl17ldw040000gn/T/test2.patrickng.20160210.130421.417554/job_local_dir/0/mapper/0/mrjob.tar.gz:/Users/patrickng/Programs/spark-1.5.1-bin-hadoop2.

Output:
('0', ['a#b#c', 4])
('2', ['a#b#c', 4])
('4', ['a#b#c', 4])
('6', ['a#b#c', 4])
('6', ['a#b#c', 4])
('6', ['a#b#c', 4])
('6', ['a#b#c', 4])
('6', ['a#b#c', 4])
