- ¿Qué es Map-Reduce?
- ¿Por qué se requiere multiprocesamiento en Map-Reduce?
- ¿Qué hace el siguiente código?
- Pruebe a procesar archivos de texto (puede buscar datasets en Kaggle), contabilizando la frecuencia de aparición de las palabras en todos los archivos.

In [9]:
import multiprocessing
import string

from multiprocessing_mapreduce import SimpleMapReduce

def file_to_words(filename):
    """Tokenize a text file into ``(word, 1)`` pairs (the map step).

    Each line is processed independently: lines whose first non-blank
    characters are ``..`` (reST comment/directive lines) are skipped, every
    punctuation character is replaced by a space, and the remainder is split
    on whitespace. A token is kept, lower-cased, only when it is purely
    alphabetic and is not one of the stop words listed below.

    The name of the current process and the file being read are printed to
    stdout before the file is opened.

    Returns a list with one ``(word, 1)`` tuple per kept token, in file order.
    """
    ignored = {
        'a', 'an', 'and', 'are', 'as', 'be', 'by', 'for', 'if', 'in',
        'is', 'it', 'of', 'or', 'py', 'rst', 'that', 'the', 'to', 'with',
    }
    # Map every punctuation character to a single space so that words glued
    # together by punctuation ("foo,bar") are split apart by str.split().
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

    worker = multiprocessing.current_process().name
    print(worker, 'reading', filename)

    pairs = []
    with open(filename, 'rt') as handle:
        for raw_line in handle:
            if raw_line.lstrip().startswith('..'):
                # reST comment line: contributes no words.
                continue
            tokens = (tok.lower() for tok in raw_line.translate(punct_to_space).split())
            pairs.extend(
                (tok, 1)
                for tok in tokens
                if tok.isalpha() and tok not in ignored
            )
    return pairs


def count_words(item):
    """Collapse one partitioned entry into a ``(word, total)`` tuple (the reduce step).

    ``item`` is a two-element pair: the word and an iterable of its
    occurrence counts. The counts are summed and returned alongside the word.
    """
    key, counts = item
    total = sum(counts)
    return key, total


if __name__ == '__main__':
    # Script entry point: count word frequencies across every *.txt file in
    # the current directory and print the 20 most frequent words.
    import operator
    import glob

    # To restrict the run to specific files, replace the glob with an
    # explicit list, e.g. ['tweets_corpus1.txt', 'tweets_corpus2.txt'].
    input_files = glob.glob('*.txt')

    # file_to_words is the map function and count_words the reduce function.
    # NOTE(review): SimpleMapReduce comes from the project-local
    # multiprocessing_mapreduce module; the code below assumes calling it
    # returns a list of (word, count) tuples -- confirm against that module.
    mapper = SimpleMapReduce(file_to_words, count_words)
    word_counts = mapper(input_files)

    # Ascending sort followed by reverse() is kept on purpose: it yields
    # descending counts with ties in the reverse of their original order,
    # which differs from sort(reverse=True).
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print ('\nTOP 20 WORDS BY FREQUENCY\n')
    top20 = word_counts[:20]
    print(len(top20))
    # default=0 keeps the script from raising ValueError when there are no
    # input files or no words survive filtering (top20 is empty).
    longest = max((len(word) for word, _ in top20), default=0)
    for word, count in top20:
        # Left-align the word in a column one wider than the longest word.
        print ('%-*s: %5s' % (longest+1, word, count))

ForkPoolWorker-30 reading tweets_corpus2.txt
ForkPoolWorker-29 reading tweets_corpus1.txt

TOP 20 WORDS BY FREQUENCY

20
t         : 372937
co        : 371772
https     : 358583
ecuador   : 27873
ahora     : 22727
video     : 20733
youtube   : 18320
hoy       : 18208
ser       : 15157
dia       : 14569
gracias   : 14545
via       : 14229
emitiendo : 14014
asi       : 13443
http      : 13047
vida      : 12938
solo      : 12849
mejor     : 11817
siempre   : 10953
pais      :  9196
