## Simulating MapReduce jobs

### Reading the file

Create a folder and place both the input file `5000-8.txt` and this notebook in it.

In [6]:
import sys
# Load the whole input text into memory as a list of raw lines
# (line endings are kept). The file is decoded as ISO-8859-1.
with open('5000-8.txt', encoding="ISO-8859-1") as source_file:
    lines = list(source_file)

### WordCount

#### Mapper

In [37]:
import string
import re

# Simulated WordCount mapper.
# Emits one "word<TAB>1<NEWLINE>" record per word found in `lines`; the records
# are collected in a list instead of being written to STDOUT, and that list is
# the input of the Reduce step.
WC_mapper_out = []

# re.escape is required: string.punctuation contains '\' and ']'. Without
# escaping, the sequence '\]' is read as an escaped ']' inside the character
# class, so backslashes silently survive the clean-up.
punctuation_pattern = re.compile('[' + re.escape(string.punctuation) + ']')

for line in lines:
    # remove leading and trailing whitespace, then drop every punctuation mark
    line = punctuation_pattern.sub('', line.strip())
    # split the line into words
    words = line.split()
    for word in words:
        # tab-delimited; the trivial word count is 1
        WC_mapper_out.append('%s\t%s\n' % (word, 1))

# Preview of the mapper output (note: the slice starts at index 1).
print(WC_mapper_out[1:100])

['Project\t1\n', 'Gutenberg\t1\n', 'EBook\t1\n', 'of\t1\n', 'The\t1\n', 'Notebooks\t1\n', 'of\t1\n', 'Leonardo\t1\n', 'Da\t1\n', 'Vinci\t1\n', 'Complete\t1\n', 'by\t1\n', 'Leonardo\t1\n', 'Da\t1\n', 'Vinci\t1\n', '3\t1\n', 'in\t1\n', 'our\t1\n', 'series\t1\n', 'by\t1\n', 'Leonardo\t1\n', 'Da\t1\n', 'Vinci\t1\n', 'Copyright\t1\n', 'laws\t1\n', 'are\t1\n', 'changing\t1\n', 'all\t1\n', 'over\t1\n', 'the\t1\n', 'world\t1\n', 'Be\t1\n', 'sure\t1\n', 'to\t1\n', 'check\t1\n', 'the\t1\n', 'copyright\t1\n', 'laws\t1\n', 'for\t1\n', 'your\t1\n', 'country\t1\n', 'before\t1\n', 'downloading\t1\n', 'or\t1\n', 'redistributing\t1\n', 'this\t1\n', 'or\t1\n', 'any\t1\n', 'other\t1\n', 'Project\t1\n', 'Gutenberg\t1\n', 'eBook\t1\n', 'This\t1\n', 'header\t1\n', 'should\t1\n', 'be\t1\n', 'the\t1\n', 'first\t1\n', 'thing\t1\n', 'seen\t1\n', 'when\t1\n', 'viewing\t1\n', 'this\t1\n', 'Project\t1\n', 'Gutenberg\t1\n', 'file\t1\n', 'Please\t1\n', 'do\t1\n', 'not\t1\n', 'remove\t1\n', 'it\t1\n', 'Do\t1\n', 'not

#### Reducer

In [34]:
## replicate the sorting of hadoop
WC_mapper_out.sort()

# Simulated WordCount reducer.
# Walks the sorted "word<TAB>count" records and sums the counts of each run of
# identical words; one "word<TAB>total" string per word is appended to
# WC_reducer_out.
current_word = None
current_count = 0
word = None

WC_reducer_out = []
# The input is the mapper output of this notebook (WC_mapper_out), not STDIN.
for line in WC_mapper_out:
    # remove leading and trailing whitespace (including the trailing newline)
    line = line.strip()

    # parse the record produced by the mapper
    word, count = line.split('\t', 1)

    # convert count (currently a string) to int
    try:
        count = int(count)
    except ValueError:
        # count was not a number, so ignore/discard this record
        continue

    # this IF-switch only works because the records are sorted by key
    # (here: word) before they reach the reducer
    if current_word == word:
        current_count += count
    else:
        if current_word:
            # a new word starts: emit the total of the previous one
            WC_reducer_out.append('%s\t%s' % (current_word, current_count))
        current_count = count
        current_word = word

# Emit the last pending word. Testing `current_word` itself (rather than
# comparing it with `word`) avoids emitting "None\t0" on empty input and
# avoids losing the last word when the final record had an invalid count.
if current_word:
    WC_reducer_out.append('%s\t%s' % (current_word, current_count))

In [None]:
# Print a window of the reduced "word<TAB>count" records.
# The loop variable must not be named `lines`: that name holds the text read
# from the input file and is iterated again by the Random sampling mapper.
for record in WC_reducer_out[31000:32000]:
    print(record)

### Random sampling

#### Mapper

In [15]:
from random import randint
import string
import re

# Simulated random-sampling mapper.
# Tags every word of `lines` with a random integer key and collects one
# "word<TAB>key<NEWLINE>" record per word in RS_mapper_out.
RS_mapper_out = []

# re.escape is required: string.punctuation contains '\' and ']'. Without
# escaping, the sequence '\]' is read as an escaped ']' inside the character
# class, so backslashes silently survive the clean-up.
punctuation_pattern = re.compile('[' + re.escape(string.punctuation) + ']')

for line in lines:
    # remove leading and trailing whitespace, then drop every punctuation mark
    line = punctuation_pattern.sub('', line.strip())
    # split the line into words
    words = line.split()
    for word in words:
        # add a random number between 0 and 10000 (both ends included) as a key
        RS_mapper_out.append('%s\t%s\n' % (word, randint(0,10000)))

In [19]:
# Show a slice of the mapper output one record per line. A plain loop is used
# because a list comprehension would only build a throwaway list of None.
for record in RS_mapper_out[1:1000]:
    print(record)

# Show the same slice again as a single list repr.
print(RS_mapper_out[1:1000])

Project	4514

Gutenberg	2433

EBook	8126

of	697

The	1565

Notebooks	8709

of	5326

Leonardo	7340

Da	4332

Vinci	1796

Complete	9390

by	4483

Leonardo	7116

Da	306

Vinci	7093

3	5852

in	6320

our	9910

series	16

by	6373

Leonardo	5960

Da	696

Vinci	969

Copyright	1842

laws	5180

are	1886

changing	7319

all	8513

over	855

the	5568

world	2432

Be	1417

sure	9867

to	8098

check	4619

the	4849

copyright	4106

laws	118

for	766

your	2226

country	9792

before	5952

downloading	1760

or	8259

redistributing	1099

this	7936

or	7705

any	4275

other	7077

Project	8280

Gutenberg	8574

eBook	9168

This	5901

header	8319

should	3241

be	3669

the	3462

first	112

thing	4693

seen	5991

when	5972

viewing	4817

this	7341

Project	2840

Gutenberg	8292

file	3235

Please	2935

do	9680

not	7107

remove	9869

it	4506

Do	1705

not	31

change	5074

or	8207

edit	8038

the	5023

header	2902

without	6698

written	1962

permission	3147

Please	5452

read	5562

the	6736

legal	9428

smal

#### Reducers 

In [49]:
import time
import random

## replicate the sorting of hadoop
RS_mapper_out.sort()

# Random-sampling reducer, variant 1: the selected keys live in a plain list.
# Every record whose key is one of the selected keys is kept and re-emitted
# as "word<TAB>out_key".
RS_reducer_out = []

# 10 distinct keys to keep, and one output key shared by all kept records
key_val = random.sample(range(0, 10000), 10)
out_key = random.randint(0,10000)

start_time = time.time()

for line in RS_mapper_out:
    # drop surrounding whitespace, then split the record into its fields
    line = line.strip()
    couple = line.split('\t')
    word, key = couple[0], int(couple[1])

    # skip records whose key was not selected
    if key not in key_val:
        continue
    RS_reducer_out.append('{}\t{}'.format(word, out_key))

# elapsed wall-clock time of the reduce loop, in seconds
elapsed = time.time() - start_time
print(elapsed)

0.5885608196258545


In [36]:
# Number of records kept by the reducer. The recorded output of this cell is
# an integer, so the count is displayed rather than the whole list.
len(RS_reducer_out)

265

In [57]:
import time
import random

## replicate the sorting of hadoop
RS_mapper_out.sort()

# Random-sampling reducer, variant 2: the selected keys live in a set.
# Every record whose key is one of the selected keys is kept and re-emitted
# as "word<TAB>out_key".
RS_reducer_out =[]

# 10 distinct keys to keep, and one output key shared by all kept records
key_val = set(random.sample(range(0, 10000), 10))
out_key = random.randint(0,10000)
start_time = time.time()

for line in RS_mapper_out:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the record produced by the mapper
    couple = line.split('\t')
    try:
        word = couple[0]
        # int() is the call that can raise ValueError, so it belongs inside
        # the try; the membership test below cannot raise it.
        key = int(couple[1])
    except (IndexError, ValueError):
        # malformed record (missing or non-numeric key): skip it
        continue

    if key in key_val:
        RS_reducer_out.append('%s\t%s' % (word, out_key))

# elapsed wall-clock time of the reduce loop, in seconds
print(time.time()-start_time)

0.2649412155151367


In [58]:
# Display the "word<TAB>out_key" records currently held in RS_reducer_out.
RS_reducer_out

['1207\t8345',
 '3\t8345',
 '6\t8345',
 '7The\t8345',
 'A\t8345',
 'A\t8345',
 'Accademia\t8345',
 'Author\t8345',
 'FOR\t8345',
 'HAS\t8345',
 'HAVE\t8345',
 'It\t8345',
 'It\t8345',
 'Item\t8345',
 'LIGHTS\t8345',
 'Leonardo\t8345',
 'Libr\t8345',
 'Mediterranean\t8345',
 'Milan\t8345',
 'Natura\t8345',
 'No\t8345',
 'OUGHT\t8345',
 'PARTS\t8345',
 'PERSPECTIVE\t8345',
 'Swords\t8345',
 'THE\t8345',
 'Tuscany\t8345',
 'VARNISH\t8345',
 'VASARI\t8345',
 'a\t8345',
 'a\t8345',
 'a\t8345',
 'accused\t8345',
 'ad\t8345',
 'also\t8345',
 'although\t8345',
 'always\t8345',
 'amid\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'and\t8345',
 'angle\t8345',
 'another\t8345',
 'any\t8345',
 'are\t8345',
 'are\t8345',
 'are\t8345',
 'are\t8345',
 'avail\t8345',
 'be\t8345',
 'being\t8345',
 'bodies\t8345',
 'body\t8345',
 'book\t8345',
 'by\t8345',
 'by\t8345',
 'can\t8345',
 'can

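The second option, which stores the selected keys in a `set` instead of a list, is noticeably faster (about 0.26 s versus 0.59 s in the runs above). This confirms https://stackoverflow.com/questions/7571635/fastest-way-to-check-if-a-value-exist-in-a-list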