## Simulating MapReduce jobs

### Reading the file

Create a folder and place both the input file `5000-8.txt` and this notebook in it, so that the relative path used in the next cell resolves.

In [1]:
import sys
# Load the whole input file into memory as a list of lines; each element
# keeps its trailing newline. The file is decoded as ISO-8859-1.
with open('5000-8.txt', encoding="ISO-8859-1") as fh:
    lines = list(fh)

### WordCount

#### Mapper

In [37]:
import string
import re

# Simulated WordCount mapper: emit one "word<TAB>1<NEWLINE>" record per word,
# the same text a Hadoop Streaming mapper would write to STDOUT.

# Translation table that deletes every character of string.punctuation.
# It is built once, outside the loop. A regex character class assembled as
# '[' + string.punctuation + ']' is not equivalent: inside the class the
# sequence "\]" is read as an escaped ']', so the backslash itself would
# never be removed.
strip_punctuation = str.maketrans('', '', string.punctuation)

WC_mapper_out = []
# `lines` plays the role of the mapper's STDIN (standard input)
for line in lines:
    # remove leading and trailing whitespace, then drop all punctuation
    line = line.strip().translate(strip_punctuation)
    # split the line into words
    words = line.split()
    for word in words:
        # what we append here will be the input for the Reduce step
        #
        # tab-delimited; the trivial word count is 1
        WC_mapper_out.append('%s\t%s\n' % (word, 1))

# Preview of the mapper output. NOTE(review): the slice starts at index 1,
# so the very first record is not shown - confirm this is intended.
print(WC_mapper_out[1:100])

['Project\t1\n', 'Gutenberg\t1\n', 'EBook\t1\n', 'of\t1\n', 'The\t1\n', 'Notebooks\t1\n', 'of\t1\n', 'Leonardo\t1\n', 'Da\t1\n', 'Vinci\t1\n', 'Complete\t1\n', 'by\t1\n', 'Leonardo\t1\n', 'Da\t1\n', 'Vinci\t1\n', '3\t1\n', 'in\t1\n', 'our\t1\n', 'series\t1\n', 'by\t1\n', 'Leonardo\t1\n', 'Da\t1\n', 'Vinci\t1\n', 'Copyright\t1\n', 'laws\t1\n', 'are\t1\n', 'changing\t1\n', 'all\t1\n', 'over\t1\n', 'the\t1\n', 'world\t1\n', 'Be\t1\n', 'sure\t1\n', 'to\t1\n', 'check\t1\n', 'the\t1\n', 'copyright\t1\n', 'laws\t1\n', 'for\t1\n', 'your\t1\n', 'country\t1\n', 'before\t1\n', 'downloading\t1\n', 'or\t1\n', 'redistributing\t1\n', 'this\t1\n', 'or\t1\n', 'any\t1\n', 'other\t1\n', 'Project\t1\n', 'Gutenberg\t1\n', 'eBook\t1\n', 'This\t1\n', 'header\t1\n', 'should\t1\n', 'be\t1\n', 'the\t1\n', 'first\t1\n', 'thing\t1\n', 'seen\t1\n', 'when\t1\n', 'viewing\t1\n', 'this\t1\n', 'Project\t1\n', 'Gutenberg\t1\n', 'file\t1\n', 'Please\t1\n', 'do\t1\n', 'not\t1\n', 'remove\t1\n', 'it\t1\n', 'Do\t1\n', 'not

#### Reducer

In [34]:
## replicate the sorting of hadoop: the reducer below relies on records
## with the same word being adjacent
WC_mapper_out.sort()

# Simulated WordCount reducer: sum the counts of consecutive records that
# share the same word and emit one "word<TAB>total" string per distinct word.
current_word = None
current_count = 0
word = None

WC_reducer_out = []
# WC_mapper_out plays the role of the reducer's STDIN
for line in WC_mapper_out:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the record produced by the mapper
    word, count = line.split('\t', 1)

    # convert count (currently a string) to int
    try:
        count = int(count)
    except ValueError:
        # count was not a number, so silently
        # ignore/discard this line
        continue

    # this IF-switch only works because the mapper output was sorted
    # by key (here: word) before being passed to the reducer
    if current_word == word:
        current_count += count
    else:
        if current_word is not None:
            # the previous word's run is complete: emit its total
            WC_reducer_out.append('%s\t%s' % (current_word, current_count))
        current_count = count
        current_word = word

# Do not forget to output the last word. Testing `current_word is not None`
# (rather than comparing it with `word`) emits nothing for empty input and
# still flushes the last group when the final record was discarded above.
if current_word is not None:
    WC_reducer_out.append('%s\t%s' % (current_word, current_count))

In [None]:
# Print a slice of the WordCount reducer output, one "word<TAB>count" record
# per line. The loop variable must not be named `lines`: that name holds the
# input file's lines, and rebinding it here would break any mapper cell that
# is (re)run afterwards.
for record in WC_reducer_out[31000:32000]:
    print(record)

### Random sampling

#### Mapper

In [67]:
from random import randint

import string
import re

# Simulated random-sampling mapper: emit one "word<TAB>key<NEWLINE>" record
# per word, where key is a random integer attached to that occurrence.

# Translation table that deletes every character of string.punctuation.
# It is built once, outside the loop. A regex character class assembled as
# '[' + string.punctuation + ']' is not equivalent: inside the class the
# sequence "\]" is read as an escaped ']', so the backslash itself would
# never be removed.
strip_punctuation = str.maketrans('', '', string.punctuation)

RS_mapper_out = []
# `lines` plays the role of the mapper's STDIN (standard input)
for line in lines:
    # remove leading and trailing whitespace, then drop all punctuation
    line = line.strip().translate(strip_punctuation)
    # split the line into words
    words = line.split()
    for word in words:
        # what we append here will be the input for the Reduce step
        #
        # tab-delimited; the value is a random integer key from
        # randint(0,1000), whose bounds are both inclusive
        RS_mapper_out.append('%s\t%s\n' % (word, randint(0,1000)))

# Preview of the mapper output. NOTE(review): the slice starts at index 1,
# so the very first record is not shown - confirm this is intended.
print(RS_mapper_out[1:1000])

['Project\t725\n', 'Gutenberg\t249\n', 'EBook\t843\n', 'of\t174\n', 'The\t933\n', 'Notebooks\t122\n', 'of\t776\n', 'Leonardo\t601\n', 'Da\t697\n', 'Vinci\t128\n', 'Complete\t625\n', 'by\t791\n', 'Leonardo\t167\n', 'Da\t480\n', 'Vinci\t671\n', '3\t479\n', 'in\t692\n', 'our\t831\n', 'series\t321\n', 'by\t907\n', 'Leonardo\t700\n', 'Da\t150\n', 'Vinci\t570\n', 'Copyright\t820\n', 'laws\t167\n', 'are\t135\n', 'changing\t249\n', 'all\t561\n', 'over\t451\n', 'the\t44\n', 'world\t629\n', 'Be\t602\n', 'sure\t472\n', 'to\t219\n', 'check\t439\n', 'the\t522\n', 'copyright\t207\n', 'laws\t770\n', 'for\t410\n', 'your\t654\n', 'country\t116\n', 'before\t153\n', 'downloading\t125\n', 'or\t443\n', 'redistributing\t290\n', 'this\t851\n', 'or\t612\n', 'any\t460\n', 'other\t949\n', 'Project\t36\n', 'Gutenberg\t772\n', 'eBook\t633\n', 'This\t572\n', 'header\t446\n', 'should\t175\n', 'be\t169\n', 'the\t487\n', 'first\t983\n', 'thing\t884\n', 'seen\t547\n', 'when\t942\n', 'viewing\t287\n', 'this\t377\n', 'P

#### Reducer

In [72]:
## emulate Hadoop's sort of the map output before it reaches the reducer
RS_mapper_out.sort()

# Draw the one key that selects the sample. It is stored as a string because
# the keys parsed from the mapper records below are strings too.
key_val = str(randint(0,1000))

# Simulated random-sampling reducer: keep only the records whose key equals
# the drawn key, re-emitted as "word<TAB>key" strings.
RS_reducer_out = []
for line in RS_mapper_out:
    # drop surrounding whitespace, then split the record on tabs
    fields = line.strip().split('\t')
    word, key = fields[0], fields[1]

    # skip every record that was not tagged with the drawn key
    if key != key_val:
        continue
    RS_reducer_out.append('%s\t%s' % (word, key))

In [73]:
# Display the sampled "word<TAB>key" records kept by the reducer.
RS_reducer_out

['1\t540',
 '1382\t540',
 '1473\t540',
 '1487\t540',
 '42\t540',
 '5000\t540',
 '725\t540',
 '855\t540',
 'And\t540',
 'Astronomy\t540',
 'Cronista\t540',
 'Dome\t540',
 'Every\t540',
 'Footnote\t540',
 'Greco\t540',
 'HAS\t540',
 'I\t540',
 'INNUMERABLE\t540',
 'In\t540',
 'International\t540',
 'Justinus\t540',
 'Keys\t540',
 'MS\t540',
 'MS\t540',
 'No\t540',
 'Prophecies\t540',
 'SUBTERRANEAN\t540',
 'The\t540',
 'The\t540',
 'The\t540',
 'This\t540',
 'Truth\t540',
 'a\t540',
 'acting\t540',
 'acute\t540',
 'allowances\t540',
 'and\t540',
 'and\t540',
 'and\t540',
 'and\t540',
 'and\t540',
 'and\t540',
 'appreciable\t540',
 'are\t540',
 'are\t540',
 'are\t540',
 'as\t540',
 'as\t540',
 'astronomer\t540',
 'at\t540',
 'attaches\t540',
 'b\t540',
 'be\t540',
 'be\t540',
 'be\t540',
 'be\t540',
 'be\t540',
 'because\t540',
 'believes\t540',
 'below\t540',
 'below\t540',
 'bent\t540',
 'best\t540',
 'bitterly\t540',
 'both\t540',
 'brought\t540',
 'buildings\t540',
 'but\t540',
 'by\t

In [74]:
# Number of records in the sample.
len(RS_reducer_out)

257